Newer
Older
/* Clear the rest of the cache sack blocks so they won't match mistakenly. */
for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
tp->recv_sack_cache[i].start_seq = 0;
tp->recv_sack_cache[i].end_seq = 0;
}
first_sack_index = 0;
if (flag)
num_sacks = 1;
else {
int j;
tp->fastpath_skb_hint = NULL;
/* order SACK blocks to allow in order walk of the retrans queue */
for (i = num_sacks-1; i > 0; i--) {
for (j = 0; j < i; j++){
if (after(ntohl(sp[j].start_seq),
ntohl(sp[j+1].start_seq))){
struct tcp_sack_block_wire tmp;
tmp = sp[j];
sp[j] = sp[j+1];
sp[j+1] = tmp;
/* Track where the first SACK block goes to */
if (j == first_sack_index)
first_sack_index = j+1;
}
}
}
}
/* clear flag as used for different purpose in following code */
flag = 0;
/* Use SACK fastpath hint if valid */
cached_skb = tp->fastpath_skb_hint;
cached_fack_count = tp->fastpath_cnt_hint;
if (!cached_skb) {
cached_skb = sk->sk_write_queue.next;
cached_fack_count = 0;
}
for (i=0; i<num_sacks; i++, sp++) {
struct sk_buff *skb;
__u32 start_seq = ntohl(sp->start_seq);
__u32 end_seq = ntohl(sp->end_seq);
int fack_count;
skb = cached_skb;
fack_count = cached_fack_count;
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST;
sk_stream_for_retrans_queue_from(skb, sk) {
int in_sack, pcount;
u8 sacked;
cached_skb = skb;
cached_fack_count = fack_count;
if (i == first_sack_index) {
tp->fastpath_skb_hint = skb;
tp->fastpath_cnt_hint = fack_count;
}
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
pcount = tcp_skb_pcount(skb);
if (pcount > 1 && !in_sack &&
after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
unsigned int pkt_len;
in_sack = !after(start_seq,
TCP_SKB_CB(skb)->seq);
if (!in_sack)
pkt_len = (start_seq -
TCP_SKB_CB(skb)->seq);
else
pkt_len = (end_seq -
TCP_SKB_CB(skb)->seq);
if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
break;
pcount = tcp_skb_pcount(skb);
}
fack_count += pcount;
sacked = TCP_SKB_CB(skb)->sacked;
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
/* Account D-SACK for retransmitted packet. */
if ((dup_sack && in_sack) &&
(sacked & TCPCB_RETRANS) &&
after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
tp->undo_retrans--;
/* The frame is ACKed. */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
if (sacked&TCPCB_RETRANS) {
if ((dup_sack && in_sack) &&
(sacked&TCPCB_SACKED_ACKED))
reord = min(fack_count, reord);
} else {
/* If it was in a hole, we detected reordering. */
if (fack_count < prior_fackets &&
!(sacked&TCPCB_SACKED_ACKED))
reord = min(fack_count, reord);
}
/* Nothing to do; acked frame is about to be dropped. */
continue;
}
if ((sacked&TCPCB_SACKED_RETRANS) &&
after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
(!lost_retrans || after(end_seq, lost_retrans)))
lost_retrans = end_seq;
if (!in_sack)
continue;
if (!(sacked&TCPCB_SACKED_ACKED)) {
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
* that retransmission is still in flight.
*/
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
}
} else {
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
if (!(sacked & TCPCB_RETRANS) &&
fack_count < prior_fackets)
reord = min(fack_count, reord);
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
}
}
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
flag |= FLAG_DATA_SACKED;
tp->sacked_out += tcp_skb_pcount(skb);
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
} else {
if (dup_sack && (sacked&TCPCB_RETRANS))
reord = min(fack_count, reord);
}
/* D-SACK. We can detect redundant retransmission
* in S|R and plain R frames and clear it.
* undo_retrans is decreased above, L|R frames
* are accounted above as well.
*/
if (dup_sack &&
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
}
}
/* Check for lost retransmit. This superb idea is
* borrowed from "ratehalving". Event "C".
* Later note: FACK people cheated me again 8),
* we have to account for reordering! Ugly,
* but should help.
*/
if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
break;
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
continue;
if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
flag |= FLAG_DATA_SACKED;
NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
}
}
}
}
tp->left_out = tp->sacked_out + tp->lost_out;
if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
#endif
return flag;
}
int tcp_use_frto(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* F-RTO must be activated in sysctl and there must be some
* unsent new data, and the advertised window should allow
* sending it.
*/
return (sysctl_tcp_frto && sk->sk_send_head &&
!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
tp->snd_una + tp->snd_wnd));
}
/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
* segments to see from the next ACKs whether any data was really missing.
* If the RTO was spurious, new ACKs should arrive.
*/
void tcp_enter_frto(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->frto_counter = 1;
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
tp->snd_una == tp->high_seq ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_FRTO);
}
/* Have to clear retransmission markers here to keep the bookkeeping
* in shape, even though we are not yet in Loss state.
* If something was really lost, it is eventually caught up
* in tcp_enter_frto_loss.
*/
tp->retrans_out = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = 0;
sk_stream_for_retrans_queue(skb, sk) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tcp_set_ca_state(sk, TCP_CA_Open);
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
tp->frto_highmark = tp->snd_nxt;
}
/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
static void tcp_enter_frto_loss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
tp->sacked_out = 0;
tp->lost_out = 0;
tp->fackets_out = 0;
sk_stream_for_retrans_queue(skb, sk) {
cnt += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
/* Do not mark those segments lost that were
* forward transmitted after RTO
*/
if (!after(TCP_SKB_CB(skb)->end_seq,
tp->frto_highmark)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
}
} else {
tp->sacked_out += tcp_skb_pcount(skb);
tp->fackets_out = cnt;
}
}
tcp_sync_left_out(tp);
tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->undo_marker = 0;
tp->frto_counter = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
}
void tcp_clear_retrans(struct tcp_sock *tp)
{
tp->left_out = 0;
tp->retrans_out = 0;
tp->fackets_out = 0;
tp->sacked_out = 0;
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = 0;
}
/* Enter Loss state. If "how" is not zero, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
void tcp_enter_loss(struct sock *sk, int how)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
tcp_clear_retrans(tp);
/* Push undo marker, if it was plain RTO and nothing
* was retransmitted. */
if (!how)
tp->undo_marker = tp->snd_una;
sk_stream_for_retrans_queue(skb, sk) {
cnt += tcp_skb_pcount(skb);
if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
} else {
tp->sacked_out += tcp_skb_pcount(skb);
tp->fackets_out = cnt;
}
}
tcp_sync_left_out(tp);
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
static int tcp_check_sack_reneging(struct sock *sk)
{
struct sk_buff *skb;
/* If ACK arrived pointing to a remembered SACK,
* it means that our remembered SACKs do not reflect
* real state of receiver i.e.
* receiver _host_ is heavily congested (or buggy).
* Do processing similar to RTO timeout.
*/
if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
tcp_enter_loss(sk, 1);
icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
}
static inline int tcp_fackets_out(struct tcp_sock *tp)
{
return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
}
static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
{
return tp->packets_out &&
tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
}
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
* "Disorder" In all the respects it is "Open",
* but requires a bit more attention. It is entered when
* we see some SACKs or dupacks. It is split of "Open"
* mainly to move some processing from fast path to slow one.
* "CWR" CWND was reduced due to some Congestion Notification event.
* It can be ECN, ICMP source quench, local device congestion.
* "Recovery" CWND was reduced, we are fast-retransmitting.
* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
*
* tcp_fastretrans_alert() is entered:
* - each incoming ACK, if state is not "Open"
* - when arrived ACK is unusual, namely:
* * SACK
* * Duplicate ACK.
* * ECN ECE.
*
* Counting packets in flight is pretty simple.
*
* in_flight = packets_out - left_out + retrans_out
*
* packets_out is SND.NXT-SND.UNA counted in packets.
*
* retrans_out is number of retransmitted segments.
*
* left_out is number of segments left network, but not ACKed yet.
*
* left_out = sacked_out + lost_out
*
* sacked_out: Packets, which arrived to receiver out of order
* and hence not ACKed. With SACKs this number is simply
* amount of SACKed data. Even without SACKs
* it is easy to give pretty reliable estimate of this number,
* counting duplicate ACKs.
*
* lost_out: Packets lost by network. TCP has no explicit
* "loss notification" feedback from network (for now).
* It means that this number can be only _guessed_.
* Actually, it is the heuristics to predict lossage that
* distinguishes different algorithms.
*
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
* Essentially, we have now two algorithms counting
* lost packets.
*
* FACK: It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
* It is absolutely correct estimate, if network does not reorder
* packets. And it loses any connection to reality when reordering
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
* NewReno: when Recovery is entered, we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
* Imagine, that's all! Forget about all this shamanism about CWND inflation
* deflation etc. CWND is real congestion window, never inflated, changes
* only according to classic VJ rules.
*
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
*
* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
* holes, caused by lost packets.
*
* And the most logically complicated part of algorithm is undo
* heuristics. We detect false retransmits due to both too early
* fast retransmit (reordering) and underestimated RTO, analyzing
* timestamps and D-SACKs. When we detect that some segments were
* retransmitted by mistake and CWND reduction was wrong, we undo
* window reduction and abort recovery phase. This logic is hidden
* inside several functions named tcp_try_undo_<something>.
*/
/* This function decides, when we should leave Disordered state
* and enter Recovery phase, reducing congestion window.
*
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
{
__u32 packets_out;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return 1;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_fackets_out(tp) > tp->reordering)
return 1;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_head_timedout(sk, tp))
return 1;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
!tcp_may_send_now(sk, tp)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
return 1;
}
return 0;
}
/* If we receive more dupacks than we expected counting segments
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
struct tcp_sock *tp = tcp_sk(sk);
u32 holes;
holes = max(tp->lost_out, 1U);
holes = min(holes, tp->packets_out);
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
static void tcp_add_reno_sack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_check_reno_reordering(sk, 0);
tcp_sync_left_out(tp);
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
{
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
if (acked-1 >= tp->sacked_out)
tp->sacked_out = 0;
else
tp->sacked_out -= acked-1;
}
tcp_check_reno_reordering(sk, acked);
tcp_sync_left_out(tp);
}
static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
tp->sacked_out = 0;
tp->left_out = tp->lost_out;
}
/* Mark head of queue up as lost. */
static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
int packets, u32 high_seq)
{
struct sk_buff *skb;
BUG_TRAP(packets <= tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
} else {
skb = sk->sk_write_queue.next;
cnt = 0;
}
sk_stream_for_retrans_queue_from(skb, sk) {
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
cnt += tcp_skb_pcount(skb);
if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
/* clear xmit_retransmit_queue hints
* if this is beyond hint */
if(tp->retransmit_skb_hint != NULL &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
tp->retransmit_skb_hint = NULL;
}
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
}
}
tcp_sync_left_out(tp);
}
/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
{
if (IsFack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
} else {
tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
}
/* New heuristics: it is possible only after we switched
* to restart timer each time when something is ACKed.
* Hence, we can detect timed out packets during fast
* retransmit without falling to slow start.
*/
if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
: sk->sk_write_queue.next;
sk_stream_for_retrans_queue_from(skb, sk) {
if (!tcp_skb_timedout(sk, skb))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
/* clear xmit_retrans hint */
if (tp->retransmit_skb_hint &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = NULL;
tp->scoreboard_skb_hint = skb;
tcp_sync_left_out(tp);
}
}
/* CWND moderation, preventing bursts due to too big ACKs
* in dubious situations.
*/
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp)+tcp_max_burst(tp));
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* Lower bound on congestion window is slow start threshold
* unless congestion avoidance choice decides to overide it.
*/
static inline u32 tcp_cwnd_min(const struct sock *sk)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
}
static void tcp_cwnd_down(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int decr = tp->snd_cwnd_cnt + 1;
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline int tcp_packet_delayed(struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
(__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
}
/* Undo procedures. */
#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
{
struct inet_sock *inet = inet_sk(sk);
printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
msg,
NIPQUAD(inet->daddr), ntohs(inet->dport),
tp->snd_cwnd, tp->left_out,
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#else
#define DBGUNDO(x...) do { } while (0)
#endif
static void tcp_undo_cwr(struct sock *sk, const int undo)
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->undo_cwnd)
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
/* There is something screwy going on with the retrans hints after
an undo */
clear_all_retrans_hints(tp);
}
static inline int tcp_may_undo(struct tcp_sock *tp)
{
return tp->undo_marker &&
(!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
{
if (tcp_may_undo(tp)) {
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
else
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
tp->undo_marker = 0;
}
if (tp->snd_una == tp->high_seq && IsReno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
return 1;
}
tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
{
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, tp, "D-SACK");
tcp_undo_cwr(sk, 1);
tp->undo_marker = 0;
NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
}
}
/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
int acked)
{
/* Partial ACK arrived. Force Hoe's retransmit. */
int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
if (tcp_may_undo(tp)) {
/* Plain luck! Hole if filled with delayed
* packet, rather than with a retransmit.
*/
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
tcp_undo_cwr(sk, 0);
NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
/* So... Do not make Hoe's retransmit yet.
* If the first packet was delayed, the rest
* ones are most probably delayed as well.
*/
failed = 0;
}
return failed;
}
/* Undo during loss recovery after partial ACK. */
static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
{
if (tcp_may_undo(tp)) {
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
clear_all_retrans_hints(tp);
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
tcp_undo_cwr(sk, 1);
inet_csk(sk)->icsk_retransmits = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
static inline void tcp_complete_cwr(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
{
tp->left_out = tp->sacked_out;
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
tcp_enter_cwr(sk);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
if (tp->left_out || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
tcp_moderate_cwnd(tp);
} else {
tcp_cwnd_down(sk);
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
static void tcp_mtup_probe_failed(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
}
static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = tp->snd_cwnd *
tcp_mss_to_mtu(sk, tp->mss_cache) /
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->rcv_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
* Besides that it does CWND reduction, when packet loss is detected
* and changes state of machine.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
int prior_packets, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
/* Some technical things:
* 1. Reno does not count dupacks (sacked_out) automatically. */
if (!tp->packets_out)
tp->sacked_out = 0;
/* 2. SACK counts snd_fack in packets inaccurately. */
if (tp->sacked_out == 0)
tp->fackets_out = 0;
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
if (flag&FLAG_ECE)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
if (tp->sacked_out && tcp_check_sack_reneging(sk))
return;
/* C. Process data loss notification, provided it is valid. */
if ((flag&FLAG_DATA_LOST) &&
before(tp->snd_una, tp->high_seq) &&