Newer
Older
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
/* This procedure tags the retransmission queue when SACKs arrive.
*
* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
* Packets in queue with these bits set are counted in variables
* sacked_out, retrans_out and lost_out, correspondingly.
*
* Valid combinations are:
* Tag InFlight Description
* 0 1 - orig segment is in flight.
* S 0 - nothing flies, orig reached receiver.
* L 0 - nothing flies, orig lost by net.
* R 2 - both orig and retransmit are in flight.
* L|R 1 - orig is lost, retransmit is in flight.
* S|R 1 - orig reached receiver, retrans is still in flight.
* (L|S|R is logically valid, it could occur when L|R is sacked,
* but it is equivalent to plain S and code short-curcuits it to S.
* L|S is logically invalid, it would mean -1 packet in flight 8))
*
* These 6 states form finite state machine, controlled by the following events:
* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
* 3. Loss detection event of one of three flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
* A''. Its FACK modfication, head until snd.fack is lost.
* B. SACK arrives sacking data transmitted after never retransmitted
* hole was sent out.
* C. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
*
* It is pleasant to note, that state diagram turns out to be commutative,
* so that we are allowed not to be bothered by order of our actions,
* when multiple events arrive simultaneously. (see the function below).
*
* Reordering detection.
* --------------------
* Reordering metric is maximal distance, which a packet can be displaced
* in packet stream. With SACKs we can estimate it:
*
* 1. SACK fills old hole and the corresponding segment was not
* ever retransmitted -> reordering. Alas, we cannot use it
* when segment was retransmitted.
* 2. The last flaw is solved with D-SACK. D-SACK arrives
* for retransmitted and already SACKed segment -> reordering..
* Both of these heuristics are not used in Loss state, when we cannot
* account for retransmits accurately.
*
* SACK block validation.
* ----------------------
*
* SACK block range validation checks that the received SACK block fits to
* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
* Note that SND.UNA is not included to the range though being valid because
* it means that the receiver is rather inconsistent with itself reporting
* SACK reneging when it should advance SND.UNA. Such SACK block this is
* perfectly valid, however, in light of RFC2018 which explicitly states
* that "SACK block MUST reflect the newest segment. Even if the newest
* segment is going to be discarded ...", not that it looks very clever
* in case of head skb. Due to potentional receiver driven attacks, we
* choose to avoid immediate execution of a walk in write queue due to
* reneging and defer head skb's loss recovery to standard loss recovery
* procedure that will eventually trigger (nothing forbids us doing this).
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
*
* Implements also blockage to start_seq wrap-around. Problem lies in the
* fact that though start_seq (s) is before end_seq (i.e., not reversed),
* there's no guarantee that it will be before snd_nxt (n). The problem
* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
* wrap (s_w):
*
* <- outs wnd -> <- wrapzone ->
* u e n u_w e_w s n_w
* | | | | | | |
* |<------------+------+----- TCP seqno space --------------+---------->|
* ...-- <2^31 ->| |<--------...
* ...---- >2^31 ------>| |<--------...
*
* Current code wouldn't be vulnerable but it's better still to discard such
* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
* equal to the ideal case (infinite seqno space without wrap caused issues).
*
* With D-SACK the lower bound is extended to cover sequence space below
* SND.UNA down to undo_marker, which is the last point of interest. Yet
* again, D-SACK block must not to go across snd_una (for the same reason as
* for the normal SACK blocks, explained above). But there all simplicity
* ends, TCP might receive valid D-SACKs below that. As long as they reside
* fully below undo_marker they do not affect behavior in anyway and can
* therefore be safely ignored. In rare cases (which are more or less
* theoretical ones), the D-SACK will nicely cross that boundary due to skb
* fragmentation and packet reordering past skb's retransmission. To consider
* them correctly, the acceptable range must be extended even more though
* the exact amount is rather hard to quantify. However, tp->max_window can
* be used as an exaggerated estimate.
static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
u32 start_seq, u32 end_seq)
{
/* Too far in future, or reversed (interpretation is ambiguous) */
if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
return 0;
/* Nasty start_seq wrap-around check (see comments above) */
if (!before(start_seq, tp->snd_nxt))
return 0;
/* In outstanding window? ...This is valid exit for D-SACKs too.
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
* start_seq == snd_una is non-sensical (see comments above)
*/
if (after(start_seq, tp->snd_una))
return 1;
if (!is_dsack || !tp->undo_marker)
return 0;
/* ...Then it's D-SACK, and must reside below snd_una completely */
if (!after(end_seq, tp->snd_una))
return 0;
if (!before(start_seq, tp->undo_marker))
return 1;
/* Too old */
if (!after(end_seq, tp->undo_marker))
return 0;
/* Undo_marker boundary crossing (overestimates a lot). Known already:
* start_seq < undo_marker and end_seq >= undo_marker.
*/
return !before(start_seq, end_seq - tp->max_window);
}
/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
* Event "C". Later note: FACK people cheated me again 8), we have to account
* for reordering! Ugly, but should help.
*
* Search retransmitted skbs from write_queue that were sent when snd_nxt was
* less than what is now known to be received by the other end (derived from
* highest SACK block). Also calculate the lowest snd_nxt among the remaining
* retransmitted skbs to avoid some costly processing per ACKs.
static void tcp_mark_lost_retrans(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 new_low_seq = tp->snd_nxt;
u32 received_upto = tcp_highest_sack_seq(tp);
if (!tcp_is_fack(tp) || !tp->retrans_out ||
!after(received_upto, tp->lost_retrans_low) ||
icsk->icsk_ca_state != TCP_CA_Recovery)
tcp_for_write_queue(skb, sk) {
u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
if (skb == tcp_send_head(sk))
break;
if (cnt == tp->retrans_out)
break;
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
continue;
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
continue;
if (after(received_upto, ack_seq) &&
(tcp_is_fack(tp) ||
!before(received_upto,
ack_seq + tp->reordering * tp->mss_cache))) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
tcp_skb_mark_lost_uncond_verify(tp, skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
if (before(ack_seq, new_low_seq))
new_low_seq = ack_seq;
cnt += tcp_skb_pcount(skb);
if (tp->retrans_out)
tp->lost_retrans_low = new_low_seq;
static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
int dup_sack = 0;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
dup_sack = 1;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
if (!after(end_seq_0, end_seq_1) &&
!before(start_seq_0, start_seq_1)) {
dup_sack = 1;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
}
}
/* D-SACK for already forgotten data... Do dumb counting. */
if (dup_sack &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
tp->undo_retrans--;
return dup_sack;
}
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
* the incoming SACK may not exactly match but we can find smaller MSS
* aligned portion of it that matches. Therefore we might need to fragment
* which may fail and creates some hassle (caller must handle error case
* returns).
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
u32 start_seq, u32 end_seq)
{
int in_sack, err;
unsigned int pkt_len;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
if (tcp_skb_pcount(skb) > 1 && !in_sack &&
after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
mss = tcp_skb_mss(skb);
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
if (pkt_len < mss)
pkt_len = mss;
} else {
pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
if (pkt_len < mss)
return -EINVAL;
}
/* Round if necessary so that SACKs cover only full MSSes
* and/or the remaining small portion (if present)
*/
if (pkt_len > mss) {
unsigned int new_len = (pkt_len / mss) * mss;
if (!in_sack && new_len < pkt_len) {
new_len += mss;
if (new_len > skb->len)
return 0;
}
pkt_len = new_len;
}
err = tcp_fragment(sk, skb, pkt_len, mss);
if (err < 0)
return err;
}
return in_sack;
}
static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
int *reord, int dup_sack, int fack_count)
{
struct tcp_sock *tp = tcp_sk(sk);
u8 sacked = TCP_SKB_CB(skb)->sacked;
int flag = 0;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
*reord = min(fack_count, *reord);
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
return flag;
if (!(sacked & TCPCB_SACKED_ACKED)) {
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
* that retransmission is still in flight.
*/
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &=
~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb);
}
} else {
if (!(sacked & TCPCB_RETRANS)) {
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
if (before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
*reord = min(fack_count, *reord);
/* SACK enhanced F-RTO (RFC4138; Appendix B) */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
flag |= FLAG_ONLY_ORIG_SACKED;
}
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb);
}
}
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
flag |= FLAG_DATA_SACKED;
tp->sacked_out += tcp_skb_pcount(skb);
fack_count += tcp_skb_pcount(skb);
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += tcp_skb_pcount(skb);
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
tcp_advance_highest_sack(sk, skb);
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
* frames and clear it. undo_retrans is decreased above, L|R frames
* are accounted above as well.
*/
if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
return flag;
}
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
u32 start_seq, u32 end_seq,
int dup_sack_in, int *fack_count,
int *reord, int *flag)
{
tcp_for_write_queue_from(skb, sk) {
int in_sack = 0;
int dup_sack = dup_sack_in;
if (skb == tcp_send_head(sk))
break;
/* queue is in-order => we can short-circuit the walk early */
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
if ((next_dup != NULL) &&
before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
in_sack = tcp_match_skb_to_sack(sk, skb,
next_dup->start_seq,
next_dup->end_seq);
if (in_sack > 0)
dup_sack = 1;
}
if (in_sack <= 0)
in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
end_seq);
if (unlikely(in_sack < 0))
break;
if (in_sack)
*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
*fack_count);
*fack_count += tcp_skb_pcount(skb);
}
return skb;
}
/* Avoid all extra work that is being done by sacktag while walking in
* a normal way
*/
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
u32 skip_to_seq, int *fack_count)
{
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
*fack_count += tcp_skb_pcount(skb);
}
return skb;
}
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
struct sock *sk,
struct tcp_sack_block *next_dup,
u32 skip_to_seq,
int *fack_count, int *reord,
int *flag)
{
if (next_dup == NULL)
return skb;
if (before(next_dup->start_seq, skip_to_seq)) {
skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count);
skb = tcp_sacktag_walk(skb, sk, NULL,
next_dup->start_seq, next_dup->end_seq,
1, fack_count, reord, flag);
}
return skb;
}
static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
{
return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
}
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
u32 prior_snd_una)
const struct inet_connection_sock *icsk = inet_csk(sk);
unsigned char *ptr = (skb_transport_header(ack_skb) +
TCP_SKB_CB(ack_skb)->sacked);
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
int found_dup_sack = 0;
int fack_count;
int i, j;
int first_sack_index;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
tp->fackets_out = 0;
tcp_highest_sack_reset(sk);
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
flag |= FLAG_DSACKING_ACK;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
* contain valid SACK info.
*/
if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
return 0;
if (!tp->packets_out)
goto out;
used_sacks = 0;
first_sack_index = 0;
for (i = 0; i < num_sacks; i++) {
int dup_sack = !i && found_dup_sack;
sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
if (!tcp_is_sackblock_valid(tp, dup_sack,
sp[used_sacks].start_seq,
sp[used_sacks].end_seq)) {
int mib_idx;
if (dup_sack) {
if (!tp->undo_marker)
mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
} else {
/* Don't count olds caused by ACK reordering */
if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
!after(sp[used_sacks].end_seq, tp->snd_una))
continue;
mib_idx = LINUX_MIB_TCPSACKDISCARD;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
continue;
}
/* Ignore very old stuff early */
if (!after(sp[used_sacks].end_seq, prior_snd_una))
continue;
used_sacks++;
}
/* order SACK blocks to allow in order walk of the retrans queue */
for (i = used_sacks - 1; i > 0; i--) {
for (j = 0; j < i; j++) {
if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
struct tcp_sack_block tmp;
tmp = sp[j];
sp[j] = sp[j + 1];
sp[j + 1] = tmp;
/* Track where the first SACK block goes to */
if (j == first_sack_index)
skb = tcp_write_queue_head(sk);
fack_count = 0;
i = 0;
if (!tp->sacked_out) {
/* It's already past, so skip checking against it */
cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
} else {
cache = tp->recv_sack_cache;
/* Skip empty blocks in at head of the cache */
while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
!cache->end_seq)
cache++;
while (i < used_sacks) {
u32 start_seq = sp[i].start_seq;
u32 end_seq = sp[i].end_seq;
int dup_sack = (found_dup_sack && (i == first_sack_index));
struct tcp_sack_block *next_dup = NULL;
if (found_dup_sack && ((i + 1) == first_sack_index))
next_dup = &sp[i + 1];
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST;
/* Skip too early cached blocks */
while (tcp_sack_cache_ok(tp, cache) &&
!before(start_seq, cache->end_seq))
cache++;
/* Can skip some work by looking recv_sack_cache? */
if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
after(end_seq, cache->start_seq)) {
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
skb = tcp_sacktag_skip(skb, sk, start_seq,
&fack_count);
skb = tcp_sacktag_walk(skb, sk, next_dup,
start_seq,
cache->start_seq,
dup_sack, &fack_count,
&reord, &flag);
/* Rest of the block already fully processed? */
if (!after(end_seq, cache->end_seq))
goto advance_sp;
skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
cache->end_seq,
&fack_count, &reord,
&flag);
/* ...tail remains todo... */
if (tcp_highest_sack_seq(tp) == cache->end_seq) {
/* ...but better entrypoint exists! */
skb = tcp_highest_sack(sk);
if (skb == NULL)
break;
fack_count = tp->fackets_out;
cache++;
goto walk;
skb = tcp_sacktag_skip(skb, sk, cache->end_seq,
&fack_count);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
}
if (!before(start_seq, tcp_highest_sack_seq(tp))) {
skb = tcp_highest_sack(sk);
if (skb == NULL)
break;
fack_count = tp->fackets_out;
skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count);
walk:
skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq,
dup_sack, &fack_count, &reord, &flag);
/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
* due to in-order walk
*/
if (after(end_seq, tp->frto_highmark))
flag &= ~FLAG_ONLY_ORIG_SACKED;
/* Clear the head of the cache sack blocks so we can skip it next time */
for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
tp->recv_sack_cache[i].start_seq = 0;
tp->recv_sack_cache[i].end_seq = 0;
}
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
tcp_mark_lost_retrans(sk);
tcp_verify_left_out(tp);
if ((reord < tp->fackets_out) &&
((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
(!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
out:
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
/* Limits sacked_out so that sum with lost_out isn't ever larger than
* packets_out. Returns zero if sacked_out adjustement wasn't necessary.
int tcp_limit_reno_sacked(struct tcp_sock *tp)
{
u32 holes;
holes = max(tp->lost_out, 1U);
holes = min(holes, tp->packets_out);
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
return 0;
}
/* If we receive more dupacks than we expected counting segments
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_limit_reno_sacked(tp))
tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
static void tcp_add_reno_sack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->sacked_out++;
tcp_check_reno_reordering(sk, 0);
tcp_verify_left_out(tp);
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
static void tcp_remove_reno_sacks(struct sock *sk, int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
}
tcp_check_reno_reordering(sk, acked);
tcp_verify_left_out(tp);
}
static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
tp->sacked_out = 0;
}
static int tcp_is_sackfrto(const struct tcp_sock *tp)
{
return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
}
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
int tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
if (!sysctl_tcp_frto)
return 0;
/* MTU probe and F-RTO won't really play nicely along currently */
if (icsk->icsk_mtup.probe_size)
return 0;
if (tcp_is_sackfrto(tp))
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
return 0;
skb = tcp_write_queue_head(sk);
if (tcp_skb_is_last(sk, skb))
return 1;
skb = tcp_write_queue_next(sk, skb); /* Skips head */
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
return 0;
/* Short-circuit when first non-SACKed skb has been checked */
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
break;
}
return 1;
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
* recovery a bit and use heuristics in tcp_process_frto() to detect if
* the RTO was spurious. Only clear SACKED_RETRANS of the head here to
* keep retrans_out counting accurate (with SACK F-RTO, other than head
* may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
* bits are handled if the Loss state is really to be entered (in
* tcp_enter_frto_loss).
*
* Do like tcp_enter_loss() would; when RTO expires the second time it
* does:
* "Reduce ssthresh if it has not yet been made inside this window."
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
!icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
/* Our state is too optimistic in ssthresh() call because cwnd
* is not reduced until tcp_enter_frto_loss() when previous F-RTO
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
* recovery has not yet completed. Pattern would be this: RTO,
* Cumulative ACK, RTO (2xRTO for the same segment does not end
* up here twice).
* RFC4138 should be more specific on what to do, even though
* RTO is quite unlikely to occur after the first Cumulative ACK
* due to back-off and complexity of triggering events ...
*/
if (tp->frto_counter) {
u32 stored_cwnd;
stored_cwnd = tp->snd_cwnd;
tp->snd_cwnd = 2;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->snd_cwnd = stored_cwnd;
} else {
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
}
/* ... in theory, cong.control module could do "any tricks" in
* ssthresh(), which means that ca_state, lost bits and lost_out
* counter would have to be faked before the call occurs. We
* consider that too expensive, unlikely and hacky, so modules
* using these in ssthresh() must deal these incompatibility
* issues if they receives CA_EVENT_FRTO and frto_counter != 0
*/
tcp_ca_event(sk, CA_EVENT_FRTO);
}
tp->undo_marker = tp->snd_una;
tp->undo_retrans = 0;
skb = tcp_write_queue_head(sk);
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
tcp_verify_left_out(tp);
/* Too bad if TCP was application limited */
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
/* Earlier loss recovery underway (see RFC4138; Appendix B).
* The last condition is necessary at least in tp->frto_counter case.
*/
if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
after(tp->high_seq, tp->snd_una)) {
tp->frto_highmark = tp->high_seq;
} else {
tp->frto_highmark = tp->snd_nxt;
}
tcp_set_ca_state(sk, TCP_CA_Disorder);
tp->high_seq = tp->snd_nxt;
tp->frto_counter = 1;
}
/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->lost_out = 0;
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
/*
* Count the retransmission made on RTO correctly (only when
* waiting for the first ACK and did not get it)...
*/
if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
/* For some reason this R-bit might get cleared? */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out += tcp_skb_pcount(skb);
/* ...enter this if branch just for the first segment */
flag |= FLAG_DATA_ACKED;
} else {
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
/* Marking forward transmissions that were made after RTO lost
* can cause unnecessary retransmissions in some scenarios,
* SACK blocks will mitigate that in some but not in all cases.
* We used to not mark them but it was causing break-ups with
* receivers that do only in-order receival.
*
* TODO: we could detect presence of such receiver and select
* different behavior per flow.
*/
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
tcp_verify_left_out(tp);
tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->frto_counter = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
tcp_set_ca_state(sk, TCP_CA_Loss);
tcp_clear_all_retrans_hints(tp);
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
tp->retrans_out = 0;
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = 0;
}
void tcp_clear_retrans(struct tcp_sock *tp)
{
tcp_clear_retrans_partial(tp);
tp->fackets_out = 0;
tp->sacked_out = 0;
}
/* Enter Loss state. If "how" is not zero, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
void tcp_enter_loss(struct sock *sk, int how)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tcp_clear_retrans_partial(tp);
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
if (!how) {
/* Push undo marker, if it was plain RTO and nothing
* was retransmitted. */
tp->sacked_out = 0;
tp->fackets_out = 0;
tcp_clear_all_retrans_hints(tp);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
tcp_verify_left_out(tp);