Newer
Older
icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
}
static inline int tcp_fackets_out(struct tcp_sock *tp)
{
return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
* counter when SACK is enabled (without SACK, sacked_out is used for
* that purpose).
*
* Instead, with FACK TCP uses fackets_out that includes both SACKed
* segments up to the highest received SACK block so far and holes in
* between them.
*
* With reordering, holes may still be in flight, so RFC3517 recovery
* uses pure sacked_out (total number of SACKed segments) even though
* it violates the RFC that uses duplicate ACKs, often these are equal
* but when e.g. out-of-window ACKs or packet duplication occurs,
* they differ. Since neither occurs due to loss, TCP should really
* ignore them.
*/
static inline int tcp_dupack_heurestics(struct tcp_sock *tp)
{
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
static inline int tcp_head_timedout(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_skb_timedout(sk, tcp_write_queue_head(sk));
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
}
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
* "Disorder" In all the respects it is "Open",
* but requires a bit more attention. It is entered when
* we see some SACKs or dupacks. It is split of "Open"
* mainly to move some processing from fast path to slow one.
* "CWR" CWND was reduced due to some Congestion Notification event.
* It can be ECN, ICMP source quench, local device congestion.
* "Recovery" CWND was reduced, we are fast-retransmitting.
* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
*
* tcp_fastretrans_alert() is entered:
* - each incoming ACK, if state is not "Open"
* - when arrived ACK is unusual, namely:
* * SACK
* * Duplicate ACK.
* * ECN ECE.
*
* Counting packets in flight is pretty simple.
*
* in_flight = packets_out - left_out + retrans_out
*
* packets_out is SND.NXT-SND.UNA counted in packets.
*
* retrans_out is number of retransmitted segments.
*
* left_out is number of segments left network, but not ACKed yet.
*
* left_out = sacked_out + lost_out
*
* sacked_out: Packets, which arrived to receiver out of order
* and hence not ACKed. With SACKs this number is simply
* amount of SACKed data. Even without SACKs
* it is easy to give pretty reliable estimate of this number,
* counting duplicate ACKs.
*
* lost_out: Packets lost by network. TCP has no explicit
* "loss notification" feedback from network (for now).
* It means that this number can be only _guessed_.
* Actually, it is the heuristics to predict lossage that
* distinguishes different algorithms.
*
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
* Essentially, we have now two algorithms counting
* lost packets.
*
* FACK: It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
* It is absolutely correct estimate, if network does not reorder
* packets. And it loses any connection to reality when reordering
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
* NewReno: when Recovery is entered, we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
* Imagine, that's all! Forget about all this shamanism about CWND inflation
* deflation etc. CWND is real congestion window, never inflated, changes
* only according to classic VJ rules.
*
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
*
* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
* holes, caused by lost packets.
*
* And the most logically complicated part of algorithm is undo
* heuristics. We detect false retransmits due to both too early
* fast retransmit (reordering) and underestimated RTO, analyzing
* timestamps and D-SACKs. When we detect that some segments were
* retransmitted by mistake and CWND reduction was wrong, we undo
* window reduction and abort recovery phase. This logic is hidden
* inside several functions named tcp_try_undo_<something>.
*/
/* This function decides, when we should leave Disordered state
* and enter Recovery phase, reducing congestion window.
*
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
static int tcp_time_to_recover(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
/* Do not perform any recovery during F-RTO algorithm */
if (tp->frto_counter)
return 0;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return 1;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_dupack_heurestics(tp) > tp->reordering)
return 1;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_is_fack(tp) && tcp_head_timedout(sk))
return 1;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
return 1;
}
return 0;
}
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
static void tcp_mark_head_lost(struct sock *sk, int packets)
struct tcp_sock *tp = tcp_sk(sk);
int cnt, oldcnt;
int err;
unsigned int mss;
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
} else {
skb = tcp_write_queue_head(sk);
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
break;
oldcnt = cnt;
if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
if (tcp_is_sack(tp) || (oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
if (err < 0)
break;
cnt = packets;
}
tcp_verify_left_out(tp);
}
/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
tcp_mark_head_lost(sk, 1);
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
tcp_mark_head_lost(sk, lost);
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto < fast_rexmit)
sacked_upto = fast_rexmit;
tcp_mark_head_lost(sk, sacked_upto);
}
/* New heuristics: it is possible only after we switched
* to restart timer each time when something is ACKed.
* Hence, we can detect timed out packets during fast
* retransmit without falling to slow start.
*/
if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
: tcp_write_queue_head(sk);
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (!tcp_skb_timedout(sk, skb))
break;
tp->scoreboard_skb_hint = skb;
tcp_verify_left_out(tp);
}
}
/* CWND moderation, preventing bursts due to too big ACKs
* in dubious situations.
*/
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp) + tcp_max_burst(tp));
/* Lower bound on congestion window is slow start threshold
* unless congestion avoidance choice decides to overide it.
*/
static inline u32 tcp_cwnd_min(const struct sock *sk)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
}
static void tcp_cwnd_down(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
(tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
tp->snd_cwnd_cnt = decr & 1;
if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
}
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline int tcp_packet_delayed(struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
}
/* Undo procedures. */
#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, const char *msg)
struct tcp_sock *tp = tcp_sk(sk);
if (sk->sk_family == AF_INET) {
printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n",
msg,
NIPQUAD(inet->daddr), ntohs(inet->dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n",
msg,
NIP6(np->daddr), ntohs(inet->dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#endif
}
#else
#define DBGUNDO(x...) do { } while (0)
#endif
static void tcp_undo_cwr(struct sock *sk, const int undo)
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->undo_cwnd)
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
static inline int tcp_may_undo(struct tcp_sock *tp)
{
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
static int tcp_try_undo_recovery(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPLOSSUNDO;
mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
return 1;
}
tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
DBGUNDO(sk, "D-SACK");
tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
}
}
/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, int acked)
struct tcp_sock *tp = tcp_sk(sk);
int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
if (tcp_may_undo(tp)) {
/* Plain luck! Hole if filled with delayed
* packet, rather than with a retransmit.
*/
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
DBGUNDO(sk, "Hoe");
tcp_undo_cwr(sk, 0);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
/* So... Do not make Hoe's retransmit yet.
* If the first packet was delayed, the rest
* ones are most probably delayed as well.
*/
failed = 0;
}
return failed;
}
/* Undo during loss recovery after partial ACK. */
static int tcp_try_undo_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
tcp_clear_all_retrans_hints(tp);
DBGUNDO(sk, "partial loss");
tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
inet_csk(sk)->icsk_retransmits = 0;
if (tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
static inline void tcp_complete_cwr(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
static void tcp_try_keep_open(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int state = TCP_CA_Open;
if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
}
static void tcp_try_to_open(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
tcp_verify_left_out(tp);
if (!tp->frto_counter && tp->retrans_out == 0)
tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
tcp_cwnd_down(sk, flag);
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
static void tcp_mtup_probe_failed(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
}
static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = tp->snd_cwnd *
tcp_mss_to_mtu(sk, tp->mss_cache) /
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->rcv_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
* Besides that it does CWND reduction, when packet loss is detected
* and changes state of machine.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
int fast_rexmit = 0, mib_idx;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
if (WARN_ON(!tp->sacked_out && tp->fackets_out))
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
if (tcp_check_sack_reneging(sk, flag))
return;
/* C. Process data loss notification, provided it is valid. */
if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
/* D. Check consistency of the current state. */
tcp_verify_left_out(tp);
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk))
return;
break;
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
tcp_complete_cwr(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_try_undo_dsack(sk);
if (!tp->undo_marker ||
/* For SACK case do not Open to allow to undo
* catching for all duplicate ACKs. */
tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
tcp_set_ca_state(sk, TCP_CA_Open);
if (tcp_is_reno(tp))
if (tcp_try_undo_recovery(sk))
tcp_complete_cwr(sk);
break;
}
}
/* F. Process state. */
switch (icsk->icsk_ca_state) {
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else
do_lost = tcp_try_undo_partial(sk, pkts_acked);
icsk->icsk_retransmits = 0;
if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
if (!tcp_try_undo_loss(sk)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_add_reno_sack(sk);
if (icsk->icsk_ca_state == TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk)) {
tcp_try_to_open(sk, flag);
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);
return;
}
if (tcp_is_reno(tp))
mib_idx = LINUX_MIB_TCPRENORECOVERY;
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->high_seq = tp->snd_nxt;
tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_set_ca_state(sk, TCP_CA_Recovery);
fast_rexmit = 1;
if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
tcp_update_scoreboard(sk, fast_rexmit);
tcp_cwnd_down(sk, flag);
tcp_xmit_retransmit_queue(sk);
}
/* Read draft-ietf-tcplw-high-performance before mucking
static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
*
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
struct tcp_sock *tp = tcp_sk(sk);
const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
* rtt estimates. Also, we must not reset the
* backoff for rto until we get a non-retransmitted
* packet. This allows us to deal with a situation
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tcp_ack_saw_tstamp(sk, flag);
tcp_ack_no_tstamp(sk, seq_rtt, flag);
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
static void tcp_rearm_rto(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
u32 packets_acked;
BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
BUG_ON(tcp_skb_pcount(skb) == 0);
BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
return packets_acked;
/* Remove acknowledged frames from the retransmission queue. If our packet
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
int fully_acked = 1;
u32 reord = tp->packets_out;
u32 prior_sacked = tp->sacked_out;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
/* Determine how many packets and what bytes were acked, tso and else */
if (tcp_skb_pcount(skb) == 1 ||
!after(tp->snd_una, scb->seq))
break;
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
fully_acked = 0;
end_seq = tp->snd_una;
} else {
acked_pcount = tcp_skb_pcount(skb);
end_seq = scb->end_seq;
if (fully_acked && icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
tcp_mtup_probe_success(sk, skb);
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = -1;
seq_rtt = -1;
if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
flag |= FLAG_NONHEAD_RETRANS_ACKED;
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up)))
tp->urg_mode = 0;
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
* this ACK acks real data, we will erroneously exit
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
if (!(scb->flags & TCPCB_FLAG_SYN)) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
if (!fully_acked)
break;
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
tp->scoreboard_skb_hint = NULL;
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
tp->lost_skb_hint = NULL;
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
tcp_ack_update_rtt(sk, flag, seq_rtt);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
/* No need to care for underflows here because
* the lost_skb_hint gets NULLed if we're past it
* (or something non-trivial happened)
*/
if (tcp_is_fack(tp))
tp->lost_cnt_hint -= pkts_acked;
else
tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
/* Is the ACK triggering packet unambiguous? */
if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
/* High resolution needed and available? */
if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
!ktime_equal(last_ackt,
net_invalid_timestamp()))
rtt_us = ktime_us_delta(ktime_get_real(),
last_ackt);
else if (ca_seq_rtt > 0)
rtt_us = jiffies_to_usecs(ca_seq_rtt);
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
if (!tp->packets_out && tcp_is_sack(tp)) {
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {