Newer
Older
return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
}
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
int tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
if (!sysctl_tcp_frto)
return 0;
/* MTU probe and F-RTO won't really play nicely along currently */
if (icsk->icsk_mtup.probe_size)
return 0;
if (tcp_is_sackfrto(tp))
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
return 0;
skb = tcp_write_queue_head(sk);
if (tcp_skb_is_last(sk, skb))
return 1;
skb = tcp_write_queue_next(sk, skb); /* Skips head */
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
return 0;
/* Short-circuit when first non-SACKed skb has been checked */
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
break;
}
return 1;
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
* recovery a bit and use heuristics in tcp_process_frto() to detect if
* the RTO was spurious. Only clear SACKED_RETRANS of the head here to
* keep retrans_out counting accurate (with SACK F-RTO, other than head
* may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
* bits are handled if the Loss state is really to be entered (in
* tcp_enter_frto_loss).
*
* Do like tcp_enter_loss() would; when RTO expires the second time it
* does:
* "Reduce ssthresh if it has not yet been made inside this window."
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
!icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
/* Our state is too optimistic in ssthresh() call because cwnd
* is not reduced until tcp_enter_frto_loss() when previous F-RTO
* recovery has not yet completed. Pattern would be this: RTO,
* Cumulative ACK, RTO (2xRTO for the same segment does not end
* up here twice).
* RFC4138 should be more specific on what to do, even though
* RTO is quite unlikely to occur after the first Cumulative ACK
* due to back-off and complexity of triggering events ...
*/
if (tp->frto_counter) {
u32 stored_cwnd;
stored_cwnd = tp->snd_cwnd;
tp->snd_cwnd = 2;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->snd_cwnd = stored_cwnd;
} else {
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
}
/* ... in theory, cong.control module could do "any tricks" in
* ssthresh(), which means that ca_state, lost bits and lost_out
* counter would have to be faked before the call occurs. We
* consider that too expensive, unlikely and hacky, so modules
* using these in ssthresh() must deal with these incompatibility
* issues if they receive CA_EVENT_FRTO and frto_counter != 0
*/
tcp_ca_event(sk, CA_EVENT_FRTO);
}
tp->undo_marker = tp->snd_una;
tp->undo_retrans = 0;
skb = tcp_write_queue_head(sk);
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
tcp_verify_left_out(tp);
/* Too bad if TCP was application limited */
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
/* Earlier loss recovery underway (see RFC4138; Appendix B).
* The last condition is necessary at least in tp->frto_counter case.
*/
if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
after(tp->high_seq, tp->snd_una)) {
tp->frto_highmark = tp->high_seq;
} else {
tp->frto_highmark = tp->snd_nxt;
}
tcp_set_ca_state(sk, TCP_CA_Disorder);
tp->high_seq = tp->snd_nxt;
tp->frto_counter = 1;
}
/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->lost_out = 0;
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
/*
* Count the retransmission made on RTO correctly (only when
* waiting for the first ACK and did not get it)...
*/
if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
/* For some reason this R-bit might get cleared? */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out += tcp_skb_pcount(skb);
/* ...enter this if branch just for the first segment */
flag |= FLAG_DATA_ACKED;
} else {
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
/* Marking forward transmissions that were made after RTO lost
* can cause unnecessary retransmissions in some scenarios,
* SACK blocks will mitigate that in some but not in all cases.
* We used to not mark them but it was causing break-ups with
* receivers that do only in-order receival.
*
* TODO: we could detect presence of such receiver and select
* different behavior per flow.
*/
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
tcp_verify_left_out(tp);
tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->frto_counter = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
tcp_set_ca_state(sk, TCP_CA_Loss);
tcp_clear_all_retrans_hints(tp);
/* Zero the retransmission and loss counters and forget any pending undo
 * state. The SACK-derived counters (sacked_out, fackets_out) are left alone;
 * tcp_clear_retrans() is the variant that resets those as well.
 */
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
	/* Undo bookkeeping */
	tp->undo_retrans = 0;
	tp->undo_marker = 0;

	/* Scoreboard counters */
	tp->lost_out = 0;
	tp->retrans_out = 0;
}
/* Full reset of the retransmit accounting: everything that
 * tcp_clear_retrans_partial() clears, plus the SACK/FACK counters.
 */
void tcp_clear_retrans(struct tcp_sock *tp)
{
	tp->sacked_out = 0;
	tp->fackets_out = 0;
	tcp_clear_retrans_partial(tp);
}
/* Enter Loss state. If "how" is not zero, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
void tcp_enter_loss(struct sock *sk, int how)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tcp_clear_retrans_partial(tp);
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
if (!how) {
/* Push undo marker, if it was plain RTO and nothing
* was retransmitted. */
tp->sacked_out = 0;
tp->fackets_out = 0;
tcp_clear_all_retrans_hints(tp);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
tcp_verify_left_out(tp);
tp->reordering = min_t(unsigned int, tp->reordering,
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
/* Abort F-RTO algorithm if one is in progress */
tp->frto_counter = 0;
/* If ACK arrived pointing to a remembered SACK, it means that our
* remembered SACKs do not reflect real state of receiver i.e.
* receiver _host_ is heavily congested (or buggy).
*
* Do processing similar to RTO timeout.
*/
static int tcp_check_sack_reneging(struct sock *sk, int flag)
if (flag & FLAG_SACK_RENEGING) {
struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
}
static inline int tcp_fackets_out(struct tcp_sock *tp)
{
return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
/* Heuristics to calculate number of duplicate ACKs. There's no dupACKs
* counter when SACK is enabled (without SACK, sacked_out is used for
* that purpose).
*
* Instead, with FACK TCP uses fackets_out that includes both SACKed
* segments up to the highest received SACK block so far and holes in
* between them.
*
* With reordering, holes may still be in flight, so RFC3517 recovery
* uses pure sacked_out (total number of SACKed segments) even though
* it violates the RFC that uses duplicate ACKs, often these are equal
* but when e.g. out-of-window ACKs or packet duplication occurs,
* they differ. Since neither occurs due to loss, TCP should really
* ignore them.
*/
static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
{
	/* FACK: count everything up to the most forward SACK. */
	if (tcp_is_fack(tp))
		return tp->fackets_out;

	/* Otherwise use the plain SACKed-segment count, plus one. */
	return tp->sacked_out + 1;
}
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
static inline int tcp_head_timedout(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_skb_timedout(sk, tcp_write_queue_head(sk));
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
}
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
* "Disorder" In all the respects it is "Open",
* but requires a bit more attention. It is entered when
* we see some SACKs or dupacks. It is split of "Open"
* mainly to move some processing from fast path to slow one.
* "CWR" CWND was reduced due to some Congestion Notification event.
* It can be ECN, ICMP source quench, local device congestion.
* "Recovery" CWND was reduced, we are fast-retransmitting.
* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
*
* tcp_fastretrans_alert() is entered:
* - each incoming ACK, if state is not "Open"
* - when arrived ACK is unusual, namely:
* * SACK
* * Duplicate ACK.
* * ECN ECE.
*
* Counting packets in flight is pretty simple.
*
* in_flight = packets_out - left_out + retrans_out
*
* packets_out is SND.NXT-SND.UNA counted in packets.
*
* retrans_out is number of retransmitted segments.
*
* left_out is number of segments left network, but not ACKed yet.
*
* left_out = sacked_out + lost_out
*
* sacked_out: Packets, which arrived to receiver out of order
* and hence not ACKed. With SACKs this number is simply
* amount of SACKed data. Even without SACKs
* it is easy to give pretty reliable estimate of this number,
* counting duplicate ACKs.
*
* lost_out: Packets lost by network. TCP has no explicit
* "loss notification" feedback from network (for now).
* It means that this number can be only _guessed_.
* Actually, it is the heuristics to predict lossage that
* distinguishes different algorithms.
*
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
* Essentially, we have now two algorithms counting
* lost packets.
*
* FACK: It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
* It is absolutely correct estimate, if network does not reorder
* packets. And it loses any connection to reality when reordering
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
* NewReno: when Recovery is entered, we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). These heuristics are the same in NewReno
* and SACK.
*
* Imagine, that's all! Forget about all this shamanism about CWND inflation
* deflation etc. CWND is real congestion window, never inflated, changes
* only according to classic VJ rules.
*
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
*
* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
* holes, caused by lost packets.
*
* And the most logically complicated part of algorithm is undo
* heuristics. We detect false retransmits due to both too early
* fast retransmit (reordering) and underestimated RTO, analyzing
* timestamps and D-SACKs. When we detect that some segments were
* retransmitted by mistake and CWND reduction was wrong, we undo
* window reduction and abort recovery phase. This logic is hidden
* inside several functions named tcp_try_undo_<something>.
*/
/* This function decides, when we should leave Disordered state
* and enter Recovery phase, reducing congestion window.
*
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
static int tcp_time_to_recover(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
/* Do not perform any recovery during F-RTO algorithm */
if (tp->frto_counter)
return 0;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return 1;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_dupack_heuristics(tp) > tp->reordering)
return 1;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_is_fack(tp) && tcp_head_timedout(sk))
return 1;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
return 1;
}
/* If a thin stream is detected, retransmit after first
* received dupack. Employ only if SACK is supported in order
* to avoid possible corner-case series of spurious retransmissions
* Use only if there are no unsent data.
*/
if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
tcp_is_sack(tp) && !tcp_send_head(sk))
return 1;
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
/* New heuristics: it is possible only after we switched to restart timer
* each time when something is ACKed. Hence, we can detect timed out packets
* during fast retransmit without falling to slow start.
*
* Usefulness of this as is very questionable, since we should know which of
* the segments is the next to timeout which is relatively expensive to find
* in general case unless we add some data structure just for that. The
* current approach certainly won't find the right one too often and when it
* finally does find _something_ it usually marks large part of the window
* right away (because a retransmission with a larger timestamp blocks the
* loop from advancing). -ij
*/
/* Walk the write queue from the scoreboard hint (or from the queue head when
 * no hint is stored) and mark every skb lost until the first one that has not
 * timed out, or until the send head is reached. Only runs for FACK
 * connections whose queue head has itself timed out. The hint is updated to
 * the skb where the walk stopped.
 */
static void tcp_timeout_skbs(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *cur;

	if (!(tcp_is_fack(tp) && tcp_head_timedout(sk)))
		return;

	cur = tp->scoreboard_skb_hint;
	if (!cur)
		cur = tcp_write_queue_head(sk);

	tcp_for_write_queue_from(cur, sk) {
		/* Stop at unsent data or at the first skb still within its RTO. */
		if (cur == tcp_send_head(sk) || !tcp_skb_timedout(sk, cur))
			break;

		tcp_skb_mark_lost(tp, cur);
	}

	/* Remember where we stopped so the next call can resume from here. */
	tp->scoreboard_skb_hint = cur;

	tcp_verify_left_out(tp);
}
/* Mark head of queue up as lost. With RFC3517 SACK, the packets count
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
struct tcp_sock *tp = tcp_sk(sk);
int cnt, oldcnt;
int err;
unsigned int mss;
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
/* Head already handled? */
if (mark_head && skb != tcp_write_queue_head(sk))
return;
skb = tcp_write_queue_head(sk);
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
break;
oldcnt = cnt;
if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
(oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
if (err < 0)
break;
cnt = packets;
}
tcp_verify_left_out(tp);
}
/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
tcp_mark_head_lost(sk, sacked_upto, 0);
else if (fast_rexmit)
tcp_mark_head_lost(sk, 1, 1);
tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
* in dubious situations.
*/
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp) + tcp_max_burst(tp));
/* Lower bound on congestion window is slow start threshold
* unless congestion avoidance choice decides to override it.
*/
static inline u32 tcp_cwnd_min(const struct sock *sk)
{
	const struct tcp_congestion_ops *ops = inet_csk(sk)->icsk_ca_ops;

	/* The congestion control module may supply its own floor. */
	if (ops->min_cwnd)
		return ops->min_cwnd(sk);

	/* Default floor: the slow start threshold. */
	return tcp_sk(sk)->snd_ssthresh;
}
static void tcp_cwnd_down(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
(tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
tp->snd_cwnd_cnt = decr & 1;
if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
}
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline int tcp_packet_delayed(struct tcp_sock *tp)
{
	/* No retransmission timestamp recorded: nothing was retransmitted. */
	if (!tp->retrans_stamp)
		return 1;

	/* Otherwise the peer must have echoed a timestamp that predates
	 * the first retransmission.
	 */
	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
	       before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp);
}
/* Undo procedures. */
#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, const char *msg)
struct tcp_sock *tp = tcp_sk(sk);
if (sk->sk_family == AF_INET) {
printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
&inet->inet_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
&np->daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#endif
}
#else
#define DBGUNDO(x...) do { } while (0)
#endif
static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->undo_cwnd)
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* Return 1 when a congestion window reduction may be undone: an undo marker
 * must be set, and either no retransmissions remain to be accounted for
 * (undo_retrans is zero) or tcp_packet_delayed() reports the original
 * transmission as merely delayed. Returns 0 otherwise.
 */
static inline int tcp_may_undo(struct tcp_sock *tp)
{
	if (!tp->undo_marker)
		return 0;

	if (!tp->undo_retrans)
		return 1;

	return tcp_packet_delayed(tp) != 0;
}
/* People celebrate: "We love our President!" */
static int tcp_try_undo_recovery(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwr(sk, true);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPLOSSUNDO;
mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
return 1;
}
tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
DBGUNDO(sk, "D-SACK");
tcp_undo_cwr(sk, true);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
/* We can clear retrans_stamp when there are no retransmissions in the
* window. It would seem that it is trivially available for us in
* tp->retrans_out, however, that kind of assumptions doesn't consider
* what will happen if errors occur when sending retransmission for the
* second time. ...It could be that such segment has only
* TCPCB_EVER_RETRANS set at the present time. It seems that checking
* the head skb is enough except for some reneging corner cases that
* are not worth the effort.
*
* Main reason for all this complexity is the fact that connection dying
* time now depends on the validity of the retrans_stamp, in particular,
* that successive retransmissions of a segment must not advance
* retrans_stamp under any conditions.
*/
static int tcp_any_retrans_done(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *head;

	/* Cheap check first: retransmissions currently accounted as in flight. */
	if (tp->retrans_out)
		return 1;

	/* Fall back to the head skb: it may have been retransmitted at some
	 * point even though retrans_out no longer reflects it.
	 */
	head = tcp_write_queue_head(sk);

	return unlikely(head &&
			(TCP_SKB_CB(head)->sacked & TCPCB_EVER_RETRANS)) ? 1 : 0;
}
/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, int acked)
struct tcp_sock *tp = tcp_sk(sk);
int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
if (tcp_may_undo(tp)) {
/* Plain luck! Hole is filled with delayed
* packet, rather than with a retransmit.
*/
if (!tcp_any_retrans_done(sk))
tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
DBGUNDO(sk, "Hoe");
tcp_undo_cwr(sk, false);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
/* So... Do not make Hoe's retransmit yet.
* If the first packet was delayed, the rest
* ones are most probably delayed as well.
*/
failed = 0;
}
return failed;
}
/* Undo during loss recovery after partial ACK. */
static int tcp_try_undo_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
tcp_clear_all_retrans_hints(tp);
DBGUNDO(sk, "partial loss");
tcp_undo_cwr(sk, true);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
inet_csk(sk)->icsk_retransmits = 0;
if (tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
static inline void tcp_complete_cwr(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
/* Do not moderate cwnd if it's already undone in cwr or recovery. */
if (tp->undo_marker) {
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
else /* PRR */
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_cwnd_stamp = tcp_time_stamp;
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
/* Pick between the Open and Disorder congestion states. Disorder is chosen
 * when segments are counted as left out, when any retransmission has been
 * done, or when an undo marker is set; otherwise Open. If this differs from
 * the current state, switch to it and move high_seq up to snd_nxt.
 */
static void tcp_try_keep_open(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int wanted;

	if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
		wanted = TCP_CA_Disorder;
	else
		wanted = TCP_CA_Open;

	/* Already in the right state: nothing to do. */
	if (inet_csk(sk)->icsk_ca_state == wanted)
		return;

	tcp_set_ca_state(sk, wanted);
	tp->high_seq = tp->snd_nxt;
}
static void tcp_try_to_open(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
tcp_verify_left_out(tp);
if (!tp->frto_counter && !tcp_any_retrans_done(sk))
tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
tcp_cwnd_down(sk, flag);
/* Record a failed MTU probe: the upper search bound drops to one below the
 * size that was just probed, and probe_size is cleared.
 */
static void tcp_mtup_probe_failed(struct sock *sk)
{
	struct inet_connection_sock *conn = inet_csk(sk);

	/* The probed size did not get through, so search strictly below it. */
	conn->icsk_mtup.search_high = conn->icsk_mtup.probe_size - 1;

	/* presumably zero means "no probe in flight" -- confirm against callers */
	conn->icsk_mtup.probe_size = 0;
}
static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = tp->snd_cwnd *
tcp_mss_to_mtu(sk, tp->mss_cache) /
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
/* Do a simple retransmit without using the backoff mechanisms in
* tcp_timer. This is used for path mtu discovery.
* The socket is already locked here.
*/
void tcp_simple_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 prior_lost = tp->lost_out;
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (tcp_skb_seglen(skb) > mss &&
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
tcp_skb_mark_lost_uncond_verify(tp, skb);
}
}
tcp_clear_retrans_hints_partial(tp);
if (prior_lost == tp->lost_out)
return;
if (tcp_is_reno(tp))
tcp_limit_reno_sacked(tp);
tcp_verify_left_out(tp);
/* Don't muck with the congestion window here.
* Reason is that we do not increase amount of _data_
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
if (icsk->icsk_ca_state != TCP_CA_Loss) {
tp->high_seq = tp->snd_nxt;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->prior_ssthresh = 0;
tp->undo_marker = 0;
tcp_set_ca_state(sk, TCP_CA_Loss);
}
tcp_xmit_retransmit_queue(sk);
}
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
/* This function implements the PRR algorithm, specifically the PRR-SSRB
* (proportional rate reduction with slow start reduction bound) as described in
* http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
* It computes the number of packets to send (sndcnt) based on packets newly
* delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
* 2) If packets in flight is lower than ssthresh (such as due to excess
* losses and/or application stalls), do not perform any further cwnd
* reductions, but instead slow start up to ssthresh.
*/
static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
					int fast_rexmit, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* A fast retransmit is always allowed to send at least one segment. */
	const int floor_cnt = fast_rexmit ? 1 : 0;
	int sndcnt;

	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
		/* Proportional part:
		 * ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out
		 */
		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
			       tp->prior_cwnd - 1;

		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
	} else {
		/* Slow start part: grow back towards ssthresh, never sending
		 * more than the gap between ssthresh and what is in flight.
		 */
		int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
		int burst = max_t(int, tp->prr_delivered - tp->prr_out,
				  newly_acked_sacked) + 1;

		sndcnt = min_t(int, delta, burst);
	}

	if (sndcnt < floor_cnt)
		sndcnt = floor_cnt;

	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
* Besides that it does CWND reduction, when packet loss is detected
* and changes state of machine.
*
* It does _not_ decide what to send, it is made in function