static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
{
tp->left_out = tp->sacked_out;
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
if (tp->left_out || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
tcp_moderate_cwnd(tp);
} else {
tcp_cwnd_down(sk);
}
}
static void tcp_mtup_probe_failed(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
}
static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
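/* Rescale cwnd, which is counted in packets, from the old MSS to the
* larger probed MSS so the amount of data in flight in bytes stays
* roughly unchanged.
*/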
tp->snd_cwnd = tp->snd_cwnd *
tcp_mss_to_mtu(sk, tp->mss_cache) /
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->rcv_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
* Besides that it does CWND reduction, when packet loss is detected
* and changes state of machine.
*
* It does _not_ decide what to send; that is done in
* tcp_xmit_retransmit_queue().
*/
static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
int prior_packets, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
/* Some technical things:
* 1. Reno does not count dupacks (sacked_out) automatically. */
if (!tp->packets_out)
tp->sacked_out = 0;
/* 2. SACK counts snd_fack in packets inaccurately. */
if (tp->sacked_out == 0)
tp->fackets_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
if (flag&FLAG_ECE)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
if (tp->sacked_out && tcp_check_sack_reneging(sk))
return;
/* C. Process data loss notification, provided it is valid. */
if ((flag&FLAG_DATA_LOST) &&
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
}
/* D. Synchronize left_out to current state. */
tcp_sync_left_out(tp);
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
BUG_TRAP(tp->retrans_out == 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
case TCP_CA_Loss:
icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk, tp))
return;
break;
case TCP_CA_CWR:
/* CWR is to be held until something *above* high_seq
* is ACKed, so that the CWR bit can reach the receiver. */
if (tp->snd_una != tp->high_seq) {
tcp_complete_cwr(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Disorder:
tcp_try_undo_dsack(sk, tp);
if (!tp->undo_marker ||
/* For SACK case do not Open to allow to undo
* catching for all duplicate ACKs. */
IsReno(tp) || tp->snd_una != tp->high_seq) {
tp->undo_marker = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
if (IsReno(tp))
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk, tp))
return;
tcp_complete_cwr(sk);
break;
}
}
/* F. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (prior_snd_una == tp->snd_una) {
if (IsReno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
int acked = prior_packets - tp->packets_out;
if (IsReno(tp))
tcp_remove_reno_sacks(sk, tp, acked);
is_dupack = tcp_try_undo_partial(sk, tp, acked);
}
break;
case TCP_CA_Loss:
if (flag&FLAG_DATA_ACKED)
icsk->icsk_retransmits = 0;
if (!tcp_try_undo_loss(sk, tp)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
if (IsReno(tp)) {
if (tp->snd_una != prior_snd_una)
tcp_reset_reno_sack(tp);
if (is_dupack)
tcp_add_reno_sack(sk);
}
if (icsk->icsk_ca_state == TCP_CA_Disorder)
tcp_try_undo_dsack(sk, tp);
if (!tcp_time_to_recover(sk, tp)) {
tcp_try_to_open(sk, tp, flag);
return;
}
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);
return;
}
/* Otherwise enter Recovery state */
if (IsReno(tp))
NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
tp->high_seq = tp->snd_nxt;
tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
if (is_dupack || tcp_head_timedout(sk, tp))
tcp_update_scoreboard(sk, tp);
tcp_cwnd_down(sk);
tcp_xmit_retransmit_queue(sk);
}
}

/* Read draft-ietf-tcplw-high-performance before mucking
 * with this code.
 */
static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
*
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
struct tcp_sock *tp = tcp_sk(sk);
const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
* rtt estimates. Also, we must not reset the
* backoff for rto until we get a non-retransmitted
* packet. This allows us to deal with a situation
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}

static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
const s32 seq_rtt)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tcp_ack_saw_tstamp(sk, flag);
else if (seq_rtt >= 0)
tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int good)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
}
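/* Handle an ACK that covers only part of a TSO super-packet: trim the
* acknowledged bytes off the head of the skb and charge the freed
* sub-packets against the SACK, loss and retransmit counters.
*/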
static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
__u32 now, __s32 *seq_rtt)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
__u32 seq = tp->snd_una;
__u32 packets_acked;
int acked = 0;
/* If we get here, the whole TSO packet has not been
* acked.
*/
BUG_ON(!after(scb->end_seq, seq));
packets_acked = tcp_skb_pcount(skb);
if (tcp_trim_head(sk, skb, seq - scb->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
__u8 sacked = scb->sacked;
acked |= FLAG_DATA_ACKED;
if (sacked) {
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= packets_acked;
acked |= FLAG_RETRANS_DATA_ACKED;
*seq_rtt = -1;
} else if (*seq_rtt < 0)
*seq_rtt = now - scb->when;
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= packets_acked;
if (sacked & TCPCB_LOST)
tp->lost_out -= packets_acked;
if (sacked & TCPCB_URG) {
if (tp->urg_mode &&
!before(seq, tp->snd_up))
tp->urg_mode = 0;
}
} else if (*seq_rtt < 0)
*seq_rtt = now - scb->when;
if (tp->fackets_out) {
__u32 dval = min(tp->fackets_out, packets_acked);
tp->fackets_out -= dval;
}
tp->packets_out -= packets_acked;
BUG_ON(tcp_skb_pcount(skb) == 0);
BUG_ON(!before(scb->seq, scb->end_seq));
}
return acked;
}
/* Microseconds elapsed since the timestamp in *tv. */
static u32 tcp_usrtt(struct timeval *tv)
{
struct timeval now;
do_gettimeofday(&now);
return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec);
}
/* Remove acknowledged frames from the retransmission queue. */
static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
__u32 now = tcp_time_stamp;
int acked = 0;
__s32 seq_rtt = -1;
u32 pkts_acked = 0;
void (*rtt_sample)(struct sock *sk, u32 usrtt)
= icsk->icsk_ca_ops->rtt_sample;
struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };
while ((skb = skb_peek(&sk->sk_write_queue)) &&
skb != sk->sk_send_head) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
__u8 sacked = scb->sacked;
/* If our packet is before the ack sequence we can
* discard it as it's confirmed to have arrived at
* the other end.
*/
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) > 1 &&
after(tp->snd_una, scb->seq))
acked |= tcp_tso_acked(sk, skb,
now, &seq_rtt);
break;
}
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
* this ACK acks real data, we will erroneously exit
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
if (!(scb->flags & TCPCB_FLAG_SYN)) {
acked |= FLAG_DATA_ACKED;
++pkts_acked;
} else {
acked |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
/* MTU probing checks */
if (icsk->icsk_mtup.probe_size) {
if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
tcp_mtup_probe_success(sk, skb);
}
}
if (sacked) {
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= tcp_skb_pcount(skb);
acked |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1;
} else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
skb_get_timestamp(skb, &tv);
}
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_LOST)
tp->lost_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_URG) {
if (tp->urg_mode &&
!before(scb->end_seq, tp->snd_up))
tp->urg_mode = 0;
}
} else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
skb_get_timestamp(skb, &tv);
}
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
__skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
clear_all_retrans_hints(tp);
}
if (acked&FLAG_ACKED) {
tcp_ack_update_rtt(sk, acked, seq_rtt);
tcp_ack_packets_out(sk, tp);
if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED))
(*rtt_sample)(sk, tcp_usrtt(&tv));
if (icsk->icsk_ca_ops->pkts_acked)
icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
}
#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
if (!tp->packets_out && tp->rx_opt.sack_ok) {
const struct inet_connection_sock *icsk = inet_csk(sk);
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
printk(KERN_DEBUG "Leak r=%u %d\n",
tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
#endif
*seq_rtt_p = seq_rtt;
return acked;
}
static void tcp_ack_probe(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* Was it a usable window open? */
if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
tp->snd_una + tp->snd_wnd)) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* The socket must be woken up by a subsequent tcp_data_snd_check();
* this function is not for arbitrary use!
*/
} else {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
TCP_RTO_MAX);
}
}
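/* An ACK is "dubious" if it is a duplicate, carries a congestion signal
* (SACK or ECE), or arrives while we are not in the Open state; such ACKs
* are run through tcp_fastretrans_alert() instead of the plain
* cwnd-growth path.
*/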
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
}

static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
const u32 ack_seq, const u32 nwin)
{
return (after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
}
/* Update our send window.
*
* The window update algorithm described in RFC793/RFC1122 (and used in
* linux-2.2 and FreeBSD; NetBSD's is even worse) is wrong.
*/
static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
struct sk_buff *skb, u32 ack, u32 ack_seq)
{
int flag = 0;
u32 nwin = ntohs(skb->h.th->window);
if (likely(!skb->h.th->syn))
nwin <<= tp->rx_opt.snd_wscale;
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
tcp_update_wl(tp, ack, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tp->pred_flags = 0;
tcp_fast_path_check(sk, tp);
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
tp->snd_una = ack;
return flag;
}
/* A very conservative spurious RTO response algorithm: reduce cwnd and
* continue in congestion avoidance.
*/
static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_cnt = 0;
tcp_moderate_cwnd(tp);
}
/* A conservative spurious RTO response algorithm: reduce cwnd using
* rate halving and continue in congestion avoidance.
*/
static void tcp_ratehalving_spur_to_response(struct sock *sk)
{
tcp_enter_cwr(sk, 0);
}
static void tcp_undo_spur_to_response(struct sock *sk, int flag)
{
if (flag&FLAG_ECE)
tcp_ratehalving_spur_to_response(sk);
else
tcp_undo_cwr(sk, 1);
}
/* F-RTO spurious RTO detection algorithm (RFC4138)
*
* F-RTO is in effect during the two new ACKs following an RTO (well, almost, see inline
* comments). State (ACK number) is kept in frto_counter. When ACK advances
* window (but not to or beyond highest sequence sent before RTO):
* On First ACK, send two new segments out.
* On Second ACK, RTO was likely spurious. Do spurious response (response
* algorithm is not part of the F-RTO detection algorithm
* given in RFC4138 but can be selected separately).
* Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
* and TCP falls back to conventional RTO recovery.
*
* Rationale: if the RTO was spurious, new ACKs should arrive from the
* original window even after we transmit two new data segments.
*
* SACK version:
* on first step, wait until first cumulative ACK arrives, then move to
* the second step. In second step, the next ACK decides.
*
* F-RTO is implemented (mainly) in four functions:
* - tcp_use_frto() is used to determine if TCP can use F-RTO
* - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
* called when tcp_use_frto() showed green light
* - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
* - tcp_enter_frto_loss() is called if there is not enough evidence
* to prove that the RTO is indeed spurious. It transfers the control
* from F-RTO to the conventional RTO recovery
*/
static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);

/* Duplicate the behavior from Loss state (fastretrans_alert) */
if (flag&FLAG_DATA_ACKED)
inet_csk(sk)->icsk_retransmits = 0;
if (!before(tp->snd_una, tp->frto_highmark)) {
tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
return 1;
}
if (!IsSackFrto() || IsReno(tp)) {
/* RFC4138 shortcoming in step 2; should also have case c):
* ACK isn't duplicate nor advances window, e.g., opposite dir
* data, winupdate
*/
if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
!(flag&FLAG_FORWARD_PROGRESS))
return 1;
if (!(flag&FLAG_DATA_ACKED)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
flag);
return 1;
}
} else {
if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
return 1;
}
if ((tp->frto_counter == 2) &&
(!(flag&FLAG_FORWARD_PROGRESS) ||
((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
/* RFC4138 shortcoming (see comment above) */
if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP))
return 1;
tcp_enter_frto_loss(sk, 3, flag);
return 1;
}
}
if (tp->frto_counter == 1) {
tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
tp->frto_counter = 2;
return 1;
} else /* frto_counter == 2 */ {
switch (sysctl_tcp_frto_response) {
case 2:
tcp_undo_spur_to_response(sk, flag);
break;
case 1:
tcp_conservative_spur_to_response(tp);
break;
default:
tcp_ratehalving_spur_to_response(sk);
break;
};
tp->frto_counter = 0;
}
return 0;
}
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
s32 seq_rtt;
int prior_packets;
int frto_cwnd = 0;
/* If the ack is newer than sent or older than previous acks
* then we can probably ignore it.
*/
if (after(ack, tp->snd_nxt))
goto uninteresting_ack;
if (before(ack, prior_snd_una))
goto old_ack;
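/* Appropriate Byte Counting (RFC 3465): accumulate newly acked bytes so
* that congestion avoidance can grow cwnd per byte acked rather than per
* ACK received.
*/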
if (sysctl_tcp_abc) {
if (icsk->icsk_ca_state < TCP_CA_CWR)
tp->bytes_acked += ack - prior_snd_una;
else if (icsk->icsk_ca_state == TCP_CA_Loss)
/* we assume just one segment left the network */
tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
}
if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
tcp_ca_event(sk, CA_EVENT_FAST_ACK);
NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
} else {
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
flag |= FLAG_ECE;
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->sk_err_soft = 0;
tp->rcv_tstamp = tcp_time_stamp;
prior_packets = tp->packets_out;
if (!prior_packets)
goto no_queue;
prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
if (tp->frto_counter)
frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag);
if (tcp_ack_is_dubious(sk, flag)) {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
} else {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
dst_confirm(sk->sk_dst_cache);
return 1;
no_queue:
icsk->icsk_probes_out = 0;
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (sk->sk_send_head)
tcp_ack_probe(sk);
return 1;
old_ack:
if (TCP_SKB_CB(skb)->sacked)
tcp_sacktag_write_queue(sk, skb, prior_snd_una);
uninteresting_ack:
SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
{
unsigned char *ptr;
struct tcphdr *th = skb->h.th;
int length=(th->doff*4)-sizeof(struct tcphdr);
ptr = (unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
while(length>0) {
int opcode=*ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize=*ptr++;
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
return; /* don't parse partial options */
switch(opcode) {
case TCPOPT_MSS:
if(opsize==TCPOLEN_MSS && th->syn && !estab) {
u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
if (in_mss) {
if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
in_mss = opt_rx->user_mss;
opt_rx->mss_clamp = in_mss;
}
}
break;
case TCPOPT_WINDOW:
if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
if (sysctl_tcp_window_scaling) {
__u8 snd_wscale = *(__u8 *) ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
if(net_ratelimit())
printk(KERN_INFO "tcp_parse_options: Illegal window "
"scaling value %d >14 received.\n",
snd_wscale);
snd_wscale = 14;
}
opt_rx->snd_wscale = snd_wscale;
}
break;
case TCPOPT_TIMESTAMP:
if(opsize==TCPOLEN_TIMESTAMP) {
if ((estab && opt_rx->tstamp_ok) ||
(!estab && sysctl_tcp_timestamps)) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
}
}
break;
case TCPOPT_SACK_PERM:
if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
if (sysctl_tcp_sack) {
opt_rx->sack_ok = 1;
tcp_sack_reset(opt_rx);
}
}
break;
case TCPOPT_SACK:
if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
opt_rx->sack_ok) {
TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
}
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
/*
* The MD5 Hash has already been
* checked (see tcp_v{4,6}_do_rcv()).
*/
break;
#endif
};
ptr+=opsize-2;
length-=opsize;
};
}
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
struct tcp_sock *tp)
{
if (th->doff == sizeof(struct tcphdr)>>2) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
__be32 *ptr = (__be32 *)(th + 1);
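/* Match the standard-aligned timestamp layout recommended by RFC 1323
* Appendix A: NOP, NOP, TIMESTAMP, length 10.
*/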
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
tp->rx_opt.rcv_tsecr = ntohl(*ptr);
return 1;
}
}
tcp_parse_options(skb, &tp->rx_opt, 1);
return 1;
}
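/* ts_recent is the timestamp value we echo back in TSecr and the
* reference for PAWS checks; it is only replaced from segments that do
* not start beyond rcv_wup.
*/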
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
tp->rx_opt.ts_recent_stamp = get_seconds();
}
static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
* for pure ACK frames. -DaveM
*
* Not only that; it also occurs for expired timestamps.
*/
if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
tcp_store_ts_recent(tp);
}
}
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
* it can pass through stack. So, the following predicate verifies that
* this segment is not used for anything but congestion avoidance or
* fast retransmit. Moreover, we are even able to eliminate most such
* second order effects, if we apply some small "replay" window (~RTO)
* to timestamp space.
*
* All these measures still do not guarantee that we reject wrapped ACKs
* on networks with high bandwidth, when sequence space is recycled quickly,
* but it guarantees that such events will be very rare and do not affect
* connection seriously. This doesn't look nice, but alas, PAWS is really
* buggy extension.
*
* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
* states that events when retransmit arrives after original data are rare.
* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
* the biggest problem on large power networks even with minor reordering.
* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
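/* Recognize pure duplicate ACKs that cannot change connection state, so
* that a PAWS failure on them can be tolerated (see the note above).
*/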
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th = skb->h.th;
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
return (/* 1. Pure ACK with correct sequence number. */
(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
/* 2. ... and duplicate ACK. */
ack == tp->snd_una &&
/* 3. ... and does not update window. */
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}

static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);