Newer
Older
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/
/*
* Changes:
* Pedro Roque : Fast Retransmit/Recovery.
* Two receive queues.
* Retransmit queue handled by TCP.
* Better retransmit timer handling.
* New congestion avoidance.
* Header prediction.
* Variable renaming.
*
* Eric : Fast Retransmit.
* Randy Scott : MSS option defines.
* Eric Schenk : Fixes to slow start algorithm.
* Eric Schenk : Yet another double ACK bug.
* Eric Schenk : Delayed ACK bug fixes.
* Eric Schenk : Floyd style fast retrans war avoidance.
* David S. Miller : Don't allow zero congestion window.
* Eric Schenk : Fix retransmitter so that it sends
* next packet on ack of previous packet.
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
* Andrey Savochkin: Fix RTT measurements in the presence of
* timestamps.
* Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming
* data segments.
* Andi Kleen: Make sure we never ack data there is not
* enough room for. Also make this condition
* a fatal error if it might still happen.
* Andi Kleen: Add tcp_measure_rcv_mss to make
* Andi Kleen: Process packets with PSH set in the
* fast path.
* J Hadi Salim: ECN support
* Andrei Gurtov,
* Pasi Sarolahti,
* Panu Kuhlberg: Experimental audit of TCP (re)transmission
* engine. Lots of bugs are found.
* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
*/
#define pr_fmt(fmt) "TCP: " fmt
#include <linux/slab.h>

Satoru SATOH
committed
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_ecn __read_mostly = 2;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
/* Adapt the MSS value used to make delayed ack decision to the
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
struct inet_connection_sock *icsk = inet_csk(sk);
const unsigned int lss = icsk->icsk_ack.last_seg_size;
icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
icsk->icsk_ack.rcv_mss = len;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
*
* "len" is invariant segment length, including TCP header.
*/
len += skb->data - skb_transport_header(skb);
if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
* full sized, provided peer TCP is not badly broken.
* This observation (if it is correct 8)) allows
* to handle super-low mtu links fairly.
*/
(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
/* Subtract also invariant (if peer is RFC compliant),
* tcp header plus fixed timestamp option length.
* Resulting "len" is MSS free of SACK jitter.
*/
len -= tcp_sk(sk)->tcp_header_len;
icsk->icsk_ack.last_seg_size = len;
icsk->icsk_ack.rcv_mss = len;
if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
static void tcp_incr_quickack(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
if (quickacks > icsk->icsk_ack.quick)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
static void tcp_enter_quickack_mode(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk);
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
*/
static inline int tcp_in_quickack_mode(const struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
{
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
{
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
if (!(tp->ecn_flags & TCP_ECN_OK))
return;
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
/* Funny extension: if ECT is not set on a segment,
* and we already seen ECT on a previous segment,
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
tcp_enter_quickack_mode((struct sock *)tp);
break;
case INET_ECN_CE:
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
/* fallinto */
default:
tp->ecn_flags |= TCP_ECN_SEEN;
}
}
static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
return 1;
return 0;
}
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
*/
static void tcp_fixup_sndbuf(struct sock *sk)
{
int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
sndmem *= TCP_INIT_CWND;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
*
* All tcp_full_space() is split to two parts: "network" buffer, allocated
* forward and advertised in receiver window (tp->rcv_wnd) and
* "application buffer", required to isolate scheduling/application
* latencies from network.
* window_clamp is maximal advertised window. It can be less than
* tcp_full_space(), in this case tcp_full_space() - window_clamp
* is reserved for "application" buffer. The less window_clamp is
* the smoother our behaviour from viewpoint of network, but the lower
* throughput and the higher sensitivity of the connection to losses. 8)
*
* rcv_ssthresh is more strict window_clamp used at "slow start"
* phase to predict further behaviour of this connection.
* It is used for two goals:
* - to enforce header prediction at sender, even when application
* requires some significant "application buffer". It is check #1.
* - to prevent pruning of receive queue because of misprediction
* of receiver window. Check #2.
*
* The scheme does not work when sender sends good segments opening
* window and then starts to feed us spaghetti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing.
*/
/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
int truesize = tcp_win_from_space(skb->truesize) >> 1;
int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
truesize >>= 1;
window >>= 1;
}
return 0;
}
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
!sk_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
* will fit to rcvbuf in future.
*/
if (tcp_win_from_space(skb->truesize) <= skb->len)
incr = __tcp_grow_window(sk, skb);
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
tp->window_clamp);
inet_csk(sk)->icsk_ack.quick |= 1;
}
}
}
/* 3. Tuning rcvbuf, when connection enters established state. */
static void tcp_fixup_rcvbuf(struct sock *sk)
{
u32 mss = tcp_sk(sk)->advmss;
u32 icwnd = TCP_DEFAULT_INIT_RCVWND;
int rcvmem;
/* Limit to 10 segments if mss <= 1460,
* or 14600/mss segments, with a minimum of two segments.
if (mss > 1460)
icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
while (tcp_win_from_space(rcvmem) < mss)
rcvmem *= icwnd;
if (sk->sk_rcvbuf < rcvmem)
sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
/* 4. Try to fixup all. It is made immediately after connection enters
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
* established state.
*/
static void tcp_init_buffer_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_fixup_sndbuf(sk);
tp->rcvq_space.space = tp->rcv_wnd;
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
tp->window_clamp = maxwin;
if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
tp->window_clamp = max(maxwin -
(maxwin >> sysctl_tcp_app_win),
4 * tp->advmss);
}
/* Force reservation of one segment. */
if (sysctl_tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ack.quick = 0;
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!sk_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
/* Initialize RCV_MSS value.
* RCV_MSS is an our guess about MSS used by the peer.
* We haven't any direct information about the MSS.
* It's better to underestimate the RCV_MSS rather than overestimate.
* Overestimations make us ACKing less frequently than needed.
* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
*/
void tcp_initialize_rcv_mss(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2);
hint = min(hint, TCP_MSS_DEFAULT);
hint = max(hint, TCP_MIN_MSS);
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
* <http://public.lanl.gov/radiant/pubs.html#DRS>
* <http://staff.psc.edu/jheffner/>,
* though this reference is out of date. A new paper
* is pending.
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
u32 new_sample = tp->rcv_rtt_est.rtt;
long m = sample;
if (m == 0)
m = 1;
if (new_sample != 0) {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
* are stalled on filesystem I/O.
*
* Also, since we are only going for a minimum in the
* non-timestamp case, we do not smooth things out
* else with timestamps disabled convergence takes too
* long.
*/
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
} else {
m <<= 3;
if (m < new_sample)
new_sample = m;
}
new_sample = m << 3;
}
if (tp->rcv_rtt_est.rtt != new_sample)
tp->rcv_rtt_est.rtt = new_sample;
}
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
if (tp->rcv_rtt_est.time == 0)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
tp->rcv_rtt_est.time = tcp_time_stamp;
}
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
const struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.rcv_tsecr &&
(TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
*/
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int time;
int space;
if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
space = max(tp->rcvq_space.space, space);
if (tp->rcvq_space.space != space) {
int rcvmem;
tp->rcvq_space.space = space;
if (sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int new_clamp = space;
/* Receive space grows, normalize in order to
* take into account packet headers and sk_buff
* structure overhead.
*/
space /= tp->advmss;
if (!space)
space = 1;
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(rcvmem) < tp->advmss)
rcvmem += 128;
space *= rcvmem;
space = min(space, sysctl_tcp_rmem[2]);
if (space > sk->sk_rcvbuf) {
sk->sk_rcvbuf = space;
/* Make the window clamp follow along. */
tp->window_clamp = new_clamp;
}
}
}
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
tp->rcvq_space.time = tcp_time_stamp;
}
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
* connection starts up, we want to ack as quickly as possible. The
* problem is that "good" TCP's do slow start at the beginning of data
* transmission. The means that until we send the first few ACK's the
* sender will sit on his end and only queue most of his data, because
* he can only send snd_cwnd unacked packets at any given time. For
* each ACK we send, he increments snd_cwnd and transmits more of his
* queue. -DaveM
*/
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
inet_csk_schedule_ack(sk);
tcp_measure_rcv_mss(sk, skb);
if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
tcp_incr_quickack(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
int m = now - icsk->icsk_ack.lrcvtime;
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
} else if (m < icsk->icsk_ack.ato) {
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) {
* restart window, so that we send ACKs quickly.
*/
sk_mem_reclaim(sk);
icsk->icsk_ack.lrcvtime = now;
TCP_ECN_check_ce(tp, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
}
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge
* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
* piece by Van Jacobson.
* NOTE: the next three routines used to be one big routine.
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
struct tcp_sock *tp = tcp_sk(sk);
long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
* m stands for "measurement".
*
* On a 1990 paper the rto value is changed to:
* RTO = rtt + 4 * mdev
*
* Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase
* too slowly, when it should be increased quickly, decrease too quickly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
*/
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
m = 1;
if (tp->srtt != 0) {
m -= (tp->srtt >> 3); /* m is now error in rtt est */
tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
m -= (tp->mdev >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
* for mdev in this case (alpha*beta).
* Like Eifel it also prevents growth of rto,
* but also it limits too fast rto decreases,
* happening in pure Eifel.
*/
if (m > 0)
m >>= 3;
} else {
m -= (tp->mdev >> 2); /* similar update on mdev */
}
tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
if (tp->mdev > tp->mdev_max) {
tp->mdev_max = tp->mdev;
if (tp->mdev_max > tp->rttvar)
tp->rttvar = tp->mdev_max;
}
if (after(tp->snd_una, tp->rtt_seq)) {
if (tp->mdev_max < tp->rttvar)
tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
tp->mdev_max = tcp_rto_min(sk);
tp->srtt = m << 3; /* take the measured time to be rtt */
tp->mdev = m << 1; /* make sure rto = 3*rtt */
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
tp->rtt_seq = tp->snd_nxt;
}
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
static inline void tcp_set_rto(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
*
* More seriously:
* 1. If rtt variance happened to be less 50msec, it is hallucination.
* It cannot be less due to utterly erratic ACK generation made
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic
inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
* with correct one. It is exactly, which we pretend to do.
/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
tcp_bound_rto(sk);
}
/* Save metrics learned by this TCP session.
This function is called only, when TCP finishes successfully
i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
*/
void tcp_update_metrics(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
if (sysctl_tcp_nometrics_save)
return;
dst_confirm(dst);
if (dst && (dst->flags & DST_HOST)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_backoff || !tp->srtt) {
/* This session failed to estimate rtt. Why?
* Probably, no packets returned in time.
* Reset our results.
*/
if (!(dst_metric_locked(dst, RTAX_RTT)))
dst_metric_set(dst, RTAX_RTT, 0);
rtt = dst_metric_rtt(dst, RTAX_RTT);
m = rtt - tp->srtt;
/* If newly calculated rtt larger than stored one,
* store new one. Otherwise, use EWMA. Remember,
* rtt overestimation is always better than underestimation.
*/
if (!(dst_metric_locked(dst, RTAX_RTT))) {
if (m <= 0)
set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
}
if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
if (m < 0)
m = -m;
/* Scale deviation to rttvar fixed point */
m >>= 1;
if (m < tp->mdev)
m = tp->mdev;
var = dst_metric_rtt(dst, RTAX_RTTVAR);
if (m >= var)
var = m;
var -= (var - m) >> 2;
set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
/* Slow start still did not finish. */
if (dst_metric(dst, RTAX_SSTHRESH) &&
!dst_metric_locked(dst, RTAX_SSTHRESH) &&
(tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
if (!dst_metric_locked(dst, RTAX_CWND) &&
tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!dst_metric_locked(dst, RTAX_SSTHRESH))
dst_metric_set(dst, RTAX_SSTHRESH,
max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
dst_metric_set(dst, RTAX_CWND,
(dst_metric(dst, RTAX_CWND) +
tp->snd_cwnd) >> 1);
} else {
/* Else slow start did not finish, cwnd is non-sense,
ssthresh may be also invalid.
*/
if (!dst_metric_locked(dst, RTAX_CWND))
dst_metric_set(dst, RTAX_CWND,
(dst_metric(dst, RTAX_CWND) +
tp->snd_ssthresh) >> 1);

Satoru SATOH
committed
if (dst_metric(dst, RTAX_SSTHRESH) &&

Satoru SATOH
committed
tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
}
if (!dst_metric_locked(dst, RTAX_REORDERING)) {

Satoru SATOH
committed
if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
cwnd = TCP_INIT_CWND;
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
/* Set slow start threshold and cwnd not falling to slow start */
void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
const struct inet_connection_sock *icsk = inet_csk(sk);
tp->prior_ssthresh = 0;
tp->bytes_acked = 0;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
if (set_ssthresh)
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp) + 1U);
tp->snd_cwnd_cnt = 0;
tp->high_seq = tp->snd_nxt;
tp->snd_cwnd_stamp = tcp_time_stamp;
TCP_ECN_queue_cwr(tp);
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
/*
* Packet counting of FACK is based on in-order assumptions, therefore TCP
* disables it when reordering is detected
*/
static void tcp_disable_fack(struct tcp_sock *tp)
{
/* RFC3517 uses different metric in lost marker => reset on change */
if (tcp_is_fack(tp))
tp->lost_skb_hint = NULL;
tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
/* Take a notice that peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
/* Initialize metrics on socket. */
static void tcp_init_metrics(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
if (dst == NULL)
goto reset;
dst_confirm(dst);
if (dst_metric_locked(dst, RTAX_CWND))
tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
if (dst_metric(dst, RTAX_SSTHRESH)) {
tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
tp->snd_ssthresh = tp->snd_cwnd_clamp;
} else {
/* ssthresh may have been reduced unnecessarily during.
* 3WHS. Restore it back to its initial default.
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
tcp_disable_fack(tp);
tp->reordering = dst_metric(dst, RTAX_REORDERING);
}
if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
goto reset;
/* Initial rtt is determined from SYN,SYN-ACK.
* The segment is small and rtt may appear much
* less than real one. Use per-dst memory
* to make it more realistic.
*
* A bit of theory. RTT is time passed after "normal" sized packet
* is sent until it is ACKed. In normal circumstances sending small
* packets force peer to delay ACKs and calculation is correct too.
* The algorithm is adaptive and, provided we follow specs, it
* NEVER underestimate RTT. BUT! If peer tries to make some clever
* tricks sort of "quick acks" for time long enough to decrease RTT
* to low value, and then abruptly stops to do it and starts to delay
* ACKs, wait for troubles.
*/
if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
if (tp->srtt == 0) {
/* RFC2988bis: We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs
* from the more aggressive 1sec to avoid more spurious
* retransmission.
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
* retransmitted. In light of RFC2988bis' more aggressive 1sec
* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
* retransmission has occurred.
*/
if (tp->total_retrans > 1)
tp->snd_cwnd = 1;
else
tp->snd_cwnd = tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
tp->reordering = min(TCP_MAX_REORDERING, metric);
/* This exciting event is worth to be remembered. 8) */
if (ts)
mib_idx = LINUX_MIB_TCPTSREORDER;
else if (tcp_is_reno(tp))
mib_idx = LINUX_MIB_TCPRENOREORDER;
else if (tcp_is_fack(tp))
mib_idx = LINUX_MIB_TCPFACKREORDER;
mib_idx = LINUX_MIB_TCPSACKREORDER;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
tp->fackets_out,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
tcp_disable_fack(tp);
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
if ((tp->retransmit_skb_hint == NULL) ||
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
if (!tp->lost_out ||