Newer
Older
}
/* D-SACK for already forgotten data...
* Do dumb counting. */
if (found_dup_sack &&
!after(ntohl(sp[0].end_seq), prior_snd_una) &&
after(ntohl(sp[0].end_seq), tp->undo_marker))
tp->undo_retrans--;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
* contain valid SACK info.
*/
if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
return 0;
/* SACK fastpath:
* if the only SACK change is the increase of the end_seq of
* the first block then only apply that SACK block
* and use retrans queue hinting otherwise slowpath */
flag = 1;
for (i = 0; i < num_sacks; i++) {
__be32 start_seq = sp[i].start_seq;
__be32 end_seq = sp[i].end_seq;
if (tp->recv_sack_cache[i].start_seq != start_seq)
flag = 0;
} else {
if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
(tp->recv_sack_cache[i].end_seq != end_seq))
flag = 0;
}
tp->recv_sack_cache[i].start_seq = start_seq;
tp->recv_sack_cache[i].end_seq = end_seq;
}
/* Clear the rest of the cache sack blocks so they won't match mistakenly. */
for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
tp->recv_sack_cache[i].start_seq = 0;
tp->recv_sack_cache[i].end_seq = 0;
}
first_sack_index = 0;
if (flag)
num_sacks = 1;
else {
int j;
tp->fastpath_skb_hint = NULL;
/* order SACK blocks to allow in order walk of the retrans queue */
for (i = num_sacks-1; i > 0; i--) {
for (j = 0; j < i; j++){
if (after(ntohl(sp[j].start_seq),
ntohl(sp[j+1].start_seq))){
struct tcp_sack_block_wire tmp;
tmp = sp[j];
sp[j] = sp[j+1];
sp[j+1] = tmp;
/* Track where the first SACK block goes to */
if (j == first_sack_index)
first_sack_index = j+1;
}
}
}
}
/* clear flag as used for different purpose in following code */
flag = 0;
/* Use SACK fastpath hint if valid */
cached_skb = tp->fastpath_skb_hint;
cached_fack_count = tp->fastpath_cnt_hint;
if (!cached_skb) {
cached_skb = tcp_write_queue_head(sk);
cached_fack_count = 0;
}
for (i=0; i<num_sacks; i++, sp++) {
struct sk_buff *skb;
__u32 start_seq = ntohl(sp->start_seq);
__u32 end_seq = ntohl(sp->end_seq);
int fack_count;
int dup_sack = (found_dup_sack && (i == first_sack_index));
skb = cached_skb;
fack_count = cached_fack_count;
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST;
tcp_for_write_queue_from(skb, sk) {
int in_sack, pcount;
u8 sacked;
if (skb == tcp_send_head(sk))
break;
cached_skb = skb;
cached_fack_count = fack_count;
if (i == first_sack_index) {
tp->fastpath_skb_hint = skb;
tp->fastpath_cnt_hint = fack_count;
}
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
pcount = tcp_skb_pcount(skb);
if (pcount > 1 && !in_sack &&
after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
unsigned int pkt_len;
in_sack = !after(start_seq,
TCP_SKB_CB(skb)->seq);
if (!in_sack)
pkt_len = (start_seq -
TCP_SKB_CB(skb)->seq);
else
pkt_len = (end_seq -
TCP_SKB_CB(skb)->seq);
if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
break;
pcount = tcp_skb_pcount(skb);
}
fack_count += pcount;
sacked = TCP_SKB_CB(skb)->sacked;
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
/* Account D-SACK for retransmitted packet. */
if ((dup_sack && in_sack) &&
(sacked & TCPCB_RETRANS) &&
after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
tp->undo_retrans--;
/* The frame is ACKed. */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
if (sacked&TCPCB_RETRANS) {
if ((dup_sack && in_sack) &&
(sacked&TCPCB_SACKED_ACKED))
reord = min(fack_count, reord);
} else {
/* If it was in a hole, we detected reordering. */
if (fack_count < prior_fackets &&
!(sacked&TCPCB_SACKED_ACKED))
reord = min(fack_count, reord);
}
/* Nothing to do; acked frame is about to be dropped. */
continue;
}
if ((sacked&TCPCB_SACKED_RETRANS) &&
after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
(!lost_retrans || after(end_seq, lost_retrans)))
lost_retrans = end_seq;
if (!in_sack)
continue;
if (!(sacked&TCPCB_SACKED_ACKED)) {
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
* that retransmission is still in flight.
*/
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
}
} else {
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
if (!(sacked & TCPCB_RETRANS) &&
fack_count < prior_fackets)
reord = min(fack_count, reord);
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
/* SACK enhanced F-RTO detection.
* Set flag if and only if non-rexmitted
* segments below frto_highmark are
* SACKed (RFC4138; Appendix B).
* Clearing correct due to in-order walk
*/
if (after(end_seq, tp->frto_highmark)) {
flag &= ~FLAG_ONLY_ORIG_SACKED;
} else {
if (!(sacked & TCPCB_RETRANS))
flag |= FLAG_ONLY_ORIG_SACKED;
}
}
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
flag |= FLAG_DATA_SACKED;
tp->sacked_out += tcp_skb_pcount(skb);
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
if (after(TCP_SKB_CB(skb)->seq,
tp->highest_sack))
tp->highest_sack = TCP_SKB_CB(skb)->seq;
} else {
if (dup_sack && (sacked&TCPCB_RETRANS))
reord = min(fack_count, reord);
}
/* D-SACK. We can detect redundant retransmission
* in S|R and plain R frames and clear it.
* undo_retrans is decreased above, L|R frames
* are accounted above as well.
*/
if (dup_sack &&
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
}
}
/* Check for lost retransmit. This superb idea is
* borrowed from "ratehalving". Event "C".
* Later note: FACK people cheated me again 8),
* we have to account for reordering! Ugly,
* but should help.
*/
if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
break;
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
continue;
if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
flag |= FLAG_DATA_SACKED;
NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
}
}
}
}
tp->left_out = tp->sacked_out + tp->lost_out;
if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
(!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
#endif
return flag;
}
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
int tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (!sysctl_tcp_frto)
return 0;
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
return 0;
skb = tcp_write_queue_head(sk);
skb = tcp_write_queue_next(sk, skb); /* Skips head */
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
return 0;
/* Short-circuit when first non-SACKed skb has been checked */
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED))
break;
}
return 1;
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
* recovery a bit and use heuristics in tcp_process_frto() to detect if
* the RTO was spurious. Only clear SACKED_RETRANS of the head here to
* keep retrans_out counting accurate (with SACK F-RTO, other than head
* may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
* bits are handled if the Loss state is really to be entered (in
* tcp_enter_frto_loss).
*
* Do like tcp_enter_loss() would; when RTO expires the second time it
* does:
* "Reduce ssthresh if it has not yet been made inside this window."
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
!icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
/* Our state is too optimistic in ssthresh() call because cwnd
* is not reduced until tcp_enter_frto_loss() when previous FRTO
* recovery has not yet completed. Pattern would be this: RTO,
* Cumulative ACK, RTO (2xRTO for the same segment does not end
* up here twice).
* RFC4138 should be more specific on what to do, even though
* RTO is quite unlikely to occur after the first Cumulative ACK
* due to back-off and complexity of triggering events ...
*/
if (tp->frto_counter) {
u32 stored_cwnd;
stored_cwnd = tp->snd_cwnd;
tp->snd_cwnd = 2;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->snd_cwnd = stored_cwnd;
} else {
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
}
/* ... in theory, cong.control module could do "any tricks" in
* ssthresh(), which means that ca_state, lost bits and lost_out
* counter would have to be faked before the call occurs. We
* consider that too expensive, unlikely and hacky, so modules
* using these in ssthresh() must deal these incompatibility
* issues if they receives CA_EVENT_FRTO and frto_counter != 0
*/
tcp_ca_event(sk, CA_EVENT_FRTO);
}
tp->undo_marker = tp->snd_una;
tp->undo_retrans = 0;
skb = tcp_write_queue_head(sk);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
/* Earlier loss recovery underway (see RFC4138; Appendix B).
* The last condition is necessary at least in tp->frto_counter case.
*/
if (IsSackFrto() && (tp->frto_counter ||
((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
after(tp->high_seq, tp->snd_una)) {
tp->frto_highmark = tp->high_seq;
} else {
tp->frto_highmark = tp->snd_nxt;
}
tcp_set_ca_state(sk, TCP_CA_Disorder);
tp->high_seq = tp->snd_nxt;
tp->frto_counter = 1;
}
/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
tp->sacked_out = 0;
tp->lost_out = 0;
tp->fackets_out = 0;
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
/*
* Count the retransmission made on RTO correctly (only when
* waiting for the first ACK and did not get it)...
*/
if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
/* For some reason this R-bit might get cleared? */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out += tcp_skb_pcount(skb);
/* ...enter this if branch just for the first segment */
flag |= FLAG_DATA_ACKED;
} else {
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
}
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
/* Do not mark those segments lost that were
* forward transmitted after RTO
*/
if (!after(TCP_SKB_CB(skb)->end_seq,
tp->frto_highmark)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
}
} else {
tp->sacked_out += tcp_skb_pcount(skb);
tp->fackets_out = cnt;
}
}
tcp_sync_left_out(tp);
tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->undo_marker = 0;
tp->frto_counter = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
}
void tcp_clear_retrans(struct tcp_sock *tp)
{
tp->left_out = 0;
tp->retrans_out = 0;
tp->fackets_out = 0;
tp->sacked_out = 0;
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = 0;
}
/* Enter Loss state. If "how" is not zero, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
void tcp_enter_loss(struct sock *sk, int how)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tcp_clear_retrans(tp);
/* Push undo marker, if it was plain RTO and nothing
* was retransmitted. */
if (!how)
tp->undo_marker = tp->snd_una;
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
cnt += tcp_skb_pcount(skb);
if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
} else {
tp->sacked_out += tcp_skb_pcount(skb);
tp->fackets_out = cnt;
}
}
tcp_sync_left_out(tp);
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
/* Abort FRTO algorithm if one is in progress */
tp->frto_counter = 0;
static int tcp_check_sack_reneging(struct sock *sk)
{
struct sk_buff *skb;
/* If ACK arrived pointing to a remembered SACK,
* it means that our remembered SACKs do not reflect
* real state of receiver i.e.
* receiver _host_ is heavily congested (or buggy).
* Do processing similar to RTO timeout.
*/
if ((skb = tcp_write_queue_head(sk)) != NULL &&
struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
tcp_enter_loss(sk, 1);
icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
}
static inline int tcp_fackets_out(struct tcp_sock *tp)
{
return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
static inline int tcp_head_timedout(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_skb_timedout(sk, tcp_write_queue_head(sk));
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
}
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
* "Disorder" In all the respects it is "Open",
* but requires a bit more attention. It is entered when
* we see some SACKs or dupacks. It is split of "Open"
* mainly to move some processing from fast path to slow one.
* "CWR" CWND was reduced due to some Congestion Notification event.
* It can be ECN, ICMP source quench, local device congestion.
* "Recovery" CWND was reduced, we are fast-retransmitting.
* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
*
* tcp_fastretrans_alert() is entered:
* - each incoming ACK, if state is not "Open"
* - when arrived ACK is unusual, namely:
* * SACK
* * Duplicate ACK.
* * ECN ECE.
*
* Counting packets in flight is pretty simple.
*
* in_flight = packets_out - left_out + retrans_out
*
* packets_out is SND.NXT-SND.UNA counted in packets.
*
* retrans_out is number of retransmitted segments.
*
* left_out is number of segments left network, but not ACKed yet.
*
* left_out = sacked_out + lost_out
*
* sacked_out: Packets, which arrived to receiver out of order
* and hence not ACKed. With SACKs this number is simply
* amount of SACKed data. Even without SACKs
* it is easy to give pretty reliable estimate of this number,
* counting duplicate ACKs.
*
* lost_out: Packets lost by network. TCP has no explicit
* "loss notification" feedback from network (for now).
* It means that this number can be only _guessed_.
* Actually, it is the heuristics to predict lossage that
* distinguishes different algorithms.
*
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
* Essentially, we have now two algorithms counting
* lost packets.
*
* FACK: It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
* It is absolutely correct estimate, if network does not reorder
* packets. And it loses any connection to reality when reordering
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
* NewReno: when Recovery is entered, we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
* Imagine, that's all! Forget about all this shamanism about CWND inflation
* deflation etc. CWND is real congestion window, never inflated, changes
* only according to classic VJ rules.
*
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
*
* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
* holes, caused by lost packets.
*
* And the most logically complicated part of algorithm is undo
* heuristics. We detect false retransmits due to both too early
* fast retransmit (reordering) and underestimated RTO, analyzing
* timestamps and D-SACKs. When we detect that some segments were
* retransmitted by mistake and CWND reduction was wrong, we undo
* window reduction and abort recovery phase. This logic is hidden
* inside several functions named tcp_try_undo_<something>.
*/
/* This function decides, when we should leave Disordered state
* and enter Recovery phase, reducing congestion window.
*
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
static int tcp_time_to_recover(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
/* Do not perform any recovery during FRTO algorithm */
if (tp->frto_counter)
return 0;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return 1;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_fackets_out(tp) > tp->reordering)
return 1;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_head_timedout(sk))
return 1;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
return 1;
}
return 0;
}
/* If we receive more dupacks than we expected counting segments
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
struct tcp_sock *tp = tcp_sk(sk);
u32 holes;
holes = max(tp->lost_out, 1U);
holes = min(holes, tp->packets_out);
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
static void tcp_add_reno_sack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_check_reno_reordering(sk, 0);
tcp_sync_left_out(tp);
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
static void tcp_remove_reno_sacks(struct sock *sk, int acked)
struct tcp_sock *tp = tcp_sk(sk);
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
if (acked-1 >= tp->sacked_out)
tp->sacked_out = 0;
else
tp->sacked_out -= acked-1;
}
tcp_check_reno_reordering(sk, acked);
tcp_sync_left_out(tp);
}
static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
tp->sacked_out = 0;
tp->left_out = tp->lost_out;
}
/* RFC: This is from the original, I doubt that this is necessary at all:
* clear xmit_retrans hint if seq of this skb is beyond hint. How could we
* retransmitted past LOST markings in the first place? I'm not fully sure
* about undo and end of connection cases, which can cause R without L?
*/
static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
struct sk_buff *skb)
{
if ((tp->retransmit_skb_hint != NULL) &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
}
static void tcp_mark_head_lost(struct sock *sk,
struct tcp_sock *tp = tcp_sk(sk);
BUG_TRAP(packets <= tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
} else {
skb = tcp_write_queue_head(sk);
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
cnt += tcp_skb_pcount(skb);
if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
tcp_verify_retransmit_hint(tp, skb);
}
}
tcp_sync_left_out(tp);
}
/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (IsFack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
tcp_mark_head_lost(sk, lost, tp->high_seq);
tcp_mark_head_lost(sk, 1, tp->high_seq);
}
/* New heuristics: it is possible only after we switched
* to restart timer each time when something is ACKed.
* Hence, we can detect timed out packets during fast
* retransmit without falling to slow start.
*/
if (!IsReno(tp) && tcp_head_timedout(sk)) {
skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
: tcp_write_queue_head(sk);
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (!tcp_skb_timedout(sk, skb))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
tcp_verify_retransmit_hint(tp, skb);
tp->scoreboard_skb_hint = skb;
tcp_sync_left_out(tp);
}
}
/* CWND moderation, preventing bursts due to too big ACKs
* in dubious situations.
*/
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp)+tcp_max_burst(tp));
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* Lower bound on congestion window is slow start threshold
* unless congestion avoidance choice decides to overide it.
*/
static inline u32 tcp_cwnd_min(const struct sock *sk)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
}
static void tcp_cwnd_down(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) ||
(IsReno(tp) && !(flag&FLAG_NOT_DUP))) {
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
}
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline int tcp_packet_delayed(struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
(__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
}
/* Undo procedures. */
#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, const char *msg)
struct tcp_sock *tp = tcp_sk(sk);
printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
msg,
NIPQUAD(inet->daddr), ntohs(inet->dport),
tp->snd_cwnd, tp->left_out,
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#else
#define DBGUNDO(x...) do { } while (0)
#endif
static void tcp_undo_cwr(struct sock *sk, const int undo)
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->undo_cwnd)
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
/* There is something screwy going on with the retrans hints after
an undo */
clear_all_retrans_hints(tp);
}
static inline int tcp_may_undo(struct tcp_sock *tp)
{
return tp->undo_marker &&
(!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
static int tcp_try_undo_recovery(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_may_undo(tp)) {
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
else
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
tp->undo_marker = 0;
}
if (tp->snd_una == tp->high_seq && IsReno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
return 1;
}
tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
DBGUNDO(sk, "D-SACK");
tcp_undo_cwr(sk, 1);