Skip to content
Snippets Groups Projects
tcp_input.c 139 KiB
Newer Older
  • Learn to ignore specific revisions
  • static int tcp_try_undo_loss(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (tcp_may_undo(tp)) {
    		struct sk_buff *skb;
    
    		tcp_for_write_queue(skb, sk) {
    			if (skb == tcp_send_head(sk))
    				break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
    		}
    
    
    		clear_all_retrans_hints(tp);
    
    
    		DBGUNDO(sk, "partial loss");
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->lost_out = 0;
    		tp->left_out = tp->sacked_out;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
    
    		inet_csk(sk)->icsk_retransmits = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->undo_marker = 0;
    		if (!IsReno(tp))
    
    			tcp_set_ca_state(sk, TCP_CA_Open);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return 1;
    	}
    	return 0;
    }
    
    
    static inline void tcp_complete_cwr(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    
    	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
    
    static void tcp_try_to_open(struct sock *sk, int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	tcp_sync_left_out(tp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (tp->retrans_out == 0)
    		tp->retrans_stamp = 0;
    
    	if (flag&FLAG_ECE)
    
    		tcp_enter_cwr(sk, 1);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		int state = TCP_CA_Open;
    
    		if (tp->left_out || tp->retrans_out || tp->undo_marker)
    			state = TCP_CA_Disorder;
    
    
    		if (inet_csk(sk)->icsk_ca_state != state) {
    			tcp_set_ca_state(sk, state);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tp->high_seq = tp->snd_nxt;
    		}
    		tcp_moderate_cwnd(tp);
    	} else {
    
    John Heffner's avatar
    John Heffner committed
    /* Record a failed MTU probe: the upper search bound becomes one less than
     * the size that was probed, and probe_size is reset to zero.
     */
    static void tcp_mtup_probe_failed(struct sock *sk)
    {
    	struct inet_connection_sock *csk = inet_csk(sk);

    	/* The size that just failed is excluded from the search range. */
    	csk->icsk_mtup.search_high = csk->icsk_mtup.probe_size - 1;

    	/* A zero probe_size makes the probe_size tests in this file false. */
    	csk->icsk_mtup.probe_size = 0;
    }
    
    /* Record a successful MTU probe.
     *
     * The congestion window is rescaled by (MTU for the current mss_cache) /
     * (probed size), the lower search bound is raised to the probed size,
     * probe_size is reset and the MSS is resynchronised from the cached PMTU.
     *
     * @skb is not used by this function.
     */
    static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);

    	/* FIXME: breaks with very large cwnd (presumably the product below
    	 * can wrap before the division is applied).
    	 */
    	tp->prior_ssthresh = tcp_current_ssthresh(sk);
    	tp->snd_cwnd = tp->snd_cwnd *
    		       tcp_mss_to_mtu(sk, tp->mss_cache) /
    		       icsk->icsk_mtup.probe_size;
    	tp->snd_cwnd_cnt = 0;
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    	/* tcp_current_ssthresh() yields the sender-side slow start threshold,
    	 * so it belongs in snd_ssthresh; the previous code stored it in the
    	 * receive-side rcv_ssthresh field.
    	 */
    	tp->snd_ssthresh = tcp_current_ssthresh(sk);

    	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
    	icsk->icsk_mtup.probe_size = 0;
    	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
    }
    
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* Process an event, which can update packets-in-flight not trivially.
     * Main goal of this function is to calculate new estimate for left_out,
     * taking into account both packets sitting in receiver's buffer and
     * packets lost by network.
     *
     * Besides that it does CWND reduction, when packet loss is detected
     * and changes state of machine.
     *
     * It does _not_ decide what to send, it is made in function
     * tcp_xmit_retransmit_queue().
     *
     * Parameters as used below:
     *   prior_snd_una - snd_una before this ACK was processed; compared with
     *                   the current snd_una to detect duplicate ACKs.
     *   prior_packets - packets_out before this ACK; the difference to the
     *                   current packets_out is the number of packets acked.
     *   flag          - FLAG_* bits describing the ACK.
     *
     * NOTE(review): this copy of the function is interleaved with blame-view
     * residue (author and commit lines) and several statements appear to have
     * been lost: some conditionals in section F have no body and the final
     * switch is never closed. Restore the function from a pristine tree
     * before relying on it; the code below is left exactly as found.
     */
    static void
    tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
    		      int prior_packets, int flag)
    {
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct tcp_sock *tp = tcp_sk(sk);
    	int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
    
    	/* Some technical things:
    	 * 1. Reno does not count dupacks (sacked_out) automatically. */
    	if (!tp->packets_out)
    		tp->sacked_out = 0;
    
    	/* 2. SACK counts snd_fack in packets inaccurately. */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (tp->sacked_out == 0)
    		tp->fackets_out = 0;
    
    
    	/* Now state machine starts.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
    	if (flag&FLAG_ECE)
    		tp->prior_ssthresh = 0;
    
    	/* B. In all the states check for reneging SACKs. */
    
    	if (tp->sacked_out && tcp_check_sack_reneging(sk))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return;
    
    	/* C. Process data loss notification, provided it is valid. */
    	if ((flag&FLAG_DATA_LOST) &&
    	    before(tp->snd_una, tp->high_seq) &&
    
    	    icsk->icsk_ca_state != TCP_CA_Open &&
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	    tp->fackets_out > tp->reordering) {
    
    		tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
    	}
    
    	/* D. Synchronize left_out to current state. */
    	tcp_sync_left_out(tp);
    
    	/* E. Check state exit conditions. State can be terminated
    	 *    when high_seq is ACKed. */
    
    	if (icsk->icsk_ca_state == TCP_CA_Open) {
    
    		BUG_TRAP(tp->retrans_out == 0);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->retrans_stamp = 0;
    	} else if (!before(tp->snd_una, tp->high_seq)) {
    
    		switch (icsk->icsk_ca_state) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		case TCP_CA_Loss:
    
    			if (tcp_try_undo_recovery(sk))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				return;
    			break;
    
    		case TCP_CA_CWR:
    			/* CWR is to be held something *above* high_seq
    			 * is ACKed for CWR bit to reach receiver. */
    			if (tp->snd_una != tp->high_seq) {
    
    				tcp_complete_cwr(sk);
    				tcp_set_ca_state(sk, TCP_CA_Open);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			}
    			break;
    
    		case TCP_CA_Disorder:
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			if (!tp->undo_marker ||
    			    /* For SACK case do not Open to allow to undo
    			     * catching for all duplicate ACKs. */
    			    IsReno(tp) || tp->snd_una != tp->high_seq) {
    				tp->undo_marker = 0;
    
    				tcp_set_ca_state(sk, TCP_CA_Open);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			}
    			break;
    
    		case TCP_CA_Recovery:
    			if (IsReno(tp))
    				tcp_reset_reno_sack(tp);
    
    			if (tcp_try_undo_recovery(sk))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				return;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    		}
    	}
    
    	/* F. Process state. */
    
    	switch (icsk->icsk_ca_state) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	case TCP_CA_Recovery:
    		if (prior_snd_una == tp->snd_una) {
    			if (IsReno(tp) && is_dupack)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		} else {
    			int acked = prior_packets - tp->packets_out;
    			if (IsReno(tp))
    
    				tcp_remove_reno_sacks(sk, acked);
    			is_dupack = tcp_try_undo_partial(sk, acked);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    		break;
    	case TCP_CA_Loss:
    		if (flag&FLAG_DATA_ACKED)
    
    		if (!tcp_try_undo_loss(sk)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tcp_moderate_cwnd(tp);
    			tcp_xmit_retransmit_queue(sk);
    			return;
    		}
    
    		if (icsk->icsk_ca_state != TCP_CA_Open)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return;
    		/* Loss is undone; fall through to processing in Open state. */
    	default:
    		if (IsReno(tp)) {
    			if (tp->snd_una != prior_snd_una)
    				tcp_reset_reno_sack(tp);
    			if (is_dupack)
    
    		if (icsk->icsk_ca_state == TCP_CA_Disorder)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		if (!tcp_time_to_recover(sk)) {
    			tcp_try_to_open(sk, flag);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return;
    		}
    
    
    John Heffner's avatar
    John Heffner committed
    		/* MTU probe failure: don't reduce cwnd */
    		if (icsk->icsk_ca_state < TCP_CA_CWR &&
    		    icsk->icsk_mtup.probe_size &&
    
    		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
    
    John Heffner's avatar
    John Heffner committed
    			tcp_mtup_probe_failed(sk);
    			/* Restores the reduction we did in tcp_mtup_probe() */
    			tp->snd_cwnd++;
    			tcp_simple_retransmit(sk);
    			return;
    		}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Otherwise enter Recovery state */
    
    		if (IsReno(tp))
    			NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
    		else
    			NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
    
    		tp->high_seq = tp->snd_nxt;
    		tp->prior_ssthresh = 0;
    		tp->undo_marker = tp->snd_una;
    		tp->undo_retrans = tp->retrans_out;
    
    
    		if (icsk->icsk_ca_state < TCP_CA_CWR) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			if (!(flag&FLAG_ECE))
    
    				tp->prior_ssthresh = tcp_current_ssthresh(sk);
    			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			TCP_ECN_queue_cwr(tp);
    		}
    
    
    		tp->bytes_acked = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->snd_cwnd_cnt = 0;
    
    		tcp_set_ca_state(sk, TCP_CA_Recovery);
    
    	if (is_dupack || tcp_head_timedout(sk))
    		tcp_update_scoreboard(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tcp_xmit_retransmit_queue(sk);
    }
    
    /* Read draft-ietf-tcplw-high-performance before mucking
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
     * with this code. (Supersedes RFC1323)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     */
    
    static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	/* RTTM Rule: A TSecr value received in a segment is used to
    	 * update the averaged RTT measurement only if the segment
    	 * acknowledges some new data, i.e., only if it advances the
    	 * left edge of the send window.
    	 *
    	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
    	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
    	 *
    	 * Changed: reset backoff as soon as we see the first valid sample.
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	 * If we do not, we get strongly overestimated rto. With timestamps
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 * samples are accepted even from very old segments: f.e., when rtt=1
    	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
    	 * answer arrives rto becomes 120 seconds! If at least one of segments
    	 * in window is lost... Voila.	 			--ANK (010210)
    	 */
    
    	struct tcp_sock *tp = tcp_sk(sk);
    	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
    
    	tcp_rtt_estimator(sk, seq_rtt);
    
    	tcp_set_rto(sk);
    	inet_csk(sk)->icsk_backoff = 0;
    	tcp_bound_rto(sk);
    
    /* Update the RTT estimate from a sequence-based sample when no usable
     * timestamp echo is available.
     *
     * @seq_rtt: the RTT sample handed to tcp_rtt_estimator().
     * @flag:    FLAG_* bits of the ACK; nothing is updated when
     *           FLAG_RETRANS_DATA_ACKED is set.
     *
     * NOTE(review): the closing brace was absent from this copy and has been
     * restored; verify that no other statement was lost.
     */
    static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
    {
    	/* We don't have a timestamp. Can only use
    	 * packets that are not retransmitted to determine
    	 * rtt estimates. Also, we must not reset the
    	 * backoff for rto until we get a non-retransmitted
    	 * packet. This allows us to deal with a situation
    	 * where the network delay has increased suddenly.
    	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
    	 */
    	if (flag & FLAG_RETRANS_DATA_ACKED)
    		return;

    	tcp_rtt_estimator(sk, seq_rtt);
    	tcp_set_rto(sk);
    	inet_csk(sk)->icsk_backoff = 0;
    	tcp_bound_rto(sk);
    }
    
    static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
    
    				      const s32 seq_rtt)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	const struct tcp_sock *tp = tcp_sk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
    	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
    
    		tcp_ack_saw_tstamp(sk, flag);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	else if (seq_rtt >= 0)
    
    		tcp_ack_no_tstamp(sk, seq_rtt, flag);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    /* Invoke the cong_avoid hook of the congestion-control module attached to
     * the socket, passing all arguments through unchanged, and then refresh
     * snd_cwnd_stamp with the current tcp_time_stamp.
     */
    static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
    			   u32 in_flight, int good)
    {
    	const struct inet_connection_sock *icsk = inet_csk(sk);

    	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
    	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
    }
    
    /* Restart timer after forward progress on connection.
     * RFC2988 recommends to restart timer to now+rto.
     */
    
    
    static void tcp_ack_packets_out(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!tp->packets_out) {
    
    		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	} else {
    
    		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    }
    
    /* Account for a multi-segment skb that is only partly acknowledged.
     *
     * The acknowledged head (everything before snd_una) is trimmed off the skb
     * and the difference in segment count before and after trimming is
     * subtracted from the packet counters that apply to the skb.
     *
     * @now:     timestamp used to compute an RTT sample.
     * @seq_rtt: in/out RTT sample; forced to -1 when the skb had been
     *           retransmitted, otherwise filled with now - scb->when if it
     *           was still negative.
     *
     * Returns FLAG_DATA_ACKED (plus FLAG_RETRANS_DATA_ACKED when the skb had
     * been retransmitted) if at least one segment was acknowledged, and 0 if
     * none was or if tcp_trim_head() reported failure.
     */
    static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
    			 __u32 now, __s32 *seq_rtt)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
    	__u32 seq = tp->snd_una;
    	__u32 packets_acked;
    	int acked = 0;

    	/* If we get here, the whole TSO packet has not been
    	 * acked.
    	 */
    	BUG_ON(!after(scb->end_seq, seq));

    	/* Segment count before minus segment count after the trim. */
    	packets_acked = tcp_skb_pcount(skb);
    	if (tcp_trim_head(sk, skb, seq - scb->seq))
    		return 0;
    	packets_acked -= tcp_skb_pcount(skb);

    	if (packets_acked) {
    		__u8 sacked = scb->sacked;

    		acked |= FLAG_DATA_ACKED;
    		if (sacked) {
    			if (sacked & TCPCB_RETRANS) {
    				if (sacked & TCPCB_SACKED_RETRANS)
    					tp->retrans_out -= packets_acked;
    				acked |= FLAG_RETRANS_DATA_ACKED;
    				*seq_rtt = -1;
    			} else if (*seq_rtt < 0)
    				*seq_rtt = now - scb->when;
    			if (sacked & TCPCB_SACKED_ACKED)
    				tp->sacked_out -= packets_acked;
    			if (sacked & TCPCB_LOST)
    				tp->lost_out -= packets_acked;
    			if (sacked & TCPCB_URG) {
    				if (tp->urg_mode &&
    				    !before(seq, tp->snd_up))
    					tp->urg_mode = 0;
    			}
    		} else if (*seq_rtt < 0)
    			*seq_rtt = now - scb->when;

    		/* fackets_out is never reduced below zero. */
    		if (tp->fackets_out) {
    			__u32 dval = min(tp->fackets_out, packets_acked);
    			tp->fackets_out -= dval;
    		}
    		tp->packets_out -= packets_acked;

    		BUG_ON(tcp_skb_pcount(skb) == 0);
    		BUG_ON(!before(scb->seq, scb->end_seq));
    	}

    	return acked;
    }
    
    /* Remove acknowledged frames from the retransmission queue.
     *
     * Walks the write queue from its head up to, but not including, the send
     * head. Every skb whose end_seq is not after snd_una is unlinked and
     * freed, with the retrans/sacked/lost/fackets/packets counters adjusted.
     * The walk stops at the first skb that is not fully acknowledged; if that
     * skb holds several segments and is partly acknowledged it is passed to
     * tcp_tso_acked().
     *
     * @seq_rtt_p: receives the RTT sample gathered during the walk, or -1
     *             when no usable sample was found.
     *
     * Returns the accumulated FLAG_* bits describing what was acknowledged.
     *
     * NOTE(review): this copy had lost the closing braces of the two
     * "else if (seq_rtt < 0) {" arms; they have been restored following the
     * indentation. tcp_ack_packets_out() is defined in this file but no call
     * to it is visible; verify against a pristine tree whether a call was
     * lost from this function.
     */
    static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	const struct inet_connection_sock *icsk = inet_csk(sk);
    	struct sk_buff *skb;
    	__u32 now = tcp_time_stamp;
    	int acked = 0;
    	int prior_packets = tp->packets_out;
    	__s32 seq_rtt = -1;
    	ktime_t last_ackt = net_invalid_timestamp();

    	while ((skb = tcp_write_queue_head(sk)) &&
    	       skb != tcp_send_head(sk)) {
    		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
    		__u8 sacked = scb->sacked;

    		/* If our packet is before the ack sequence we can
    		 * discard it as it's confirmed to have arrived at
    		 * the other end.
    		 */
    		if (after(scb->end_seq, tp->snd_una)) {
    			if (tcp_skb_pcount(skb) > 1 &&
    			    after(tp->snd_una, scb->seq))
    				acked |= tcp_tso_acked(sk, skb,
    						       now, &seq_rtt);
    			break;
    		}

    		/* Initial outgoing SYN's get put onto the write_queue
    		 * just like anything else we transmit.  It is not
    		 * true data, and if we misinform our callers that
    		 * this ACK acks real data, we will erroneously exit
    		 * connection startup slow start one packet too
    		 * quickly.  This is severely frowned upon behavior.
    		 */
    		if (!(scb->flags & TCPCB_FLAG_SYN)) {
    			acked |= FLAG_DATA_ACKED;
    		} else {
    			acked |= FLAG_SYN_ACKED;
    			tp->retrans_stamp = 0;
    		}

    		/* MTU probing checks */
    		if (icsk->icsk_mtup.probe_size) {
    			if (!after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
    				tcp_mtup_probe_success(sk, skb);
    			}
    		}

    		if (sacked) {
    			if (sacked & TCPCB_RETRANS) {
    				if (sacked & TCPCB_SACKED_RETRANS)
    					tp->retrans_out -= tcp_skb_pcount(skb);
    				acked |= FLAG_RETRANS_DATA_ACKED;
    				/* A retransmitted skb invalidates the sample. */
    				seq_rtt = -1;
    			} else if (seq_rtt < 0) {
    				seq_rtt = now - scb->when;
    				last_ackt = skb->tstamp;
    			}
    			if (sacked & TCPCB_SACKED_ACKED)
    				tp->sacked_out -= tcp_skb_pcount(skb);
    			if (sacked & TCPCB_LOST)
    				tp->lost_out -= tcp_skb_pcount(skb);
    			if (sacked & TCPCB_URG) {
    				if (tp->urg_mode &&
    				    !before(scb->end_seq, tp->snd_up))
    					tp->urg_mode = 0;
    			}
    		} else if (seq_rtt < 0) {
    			seq_rtt = now - scb->when;
    			last_ackt = skb->tstamp;
    		}
    		tcp_dec_pcount_approx(&tp->fackets_out, skb);
    		tcp_packets_out_dec(tp, skb);
    		tcp_unlink_write_queue(skb, sk);
    		sk_stream_free_skb(sk, skb);
    		clear_all_retrans_hints(tp);
    	}

    	if (acked&FLAG_ACKED) {
    		u32 pkts_acked = prior_packets - tp->packets_out;
    		const struct tcp_congestion_ops *ca_ops = icsk->icsk_ca_ops;

    		tcp_ack_update_rtt(sk, acked, seq_rtt);

    		/* Is the ACK triggering packet unambiguous? */
    		if (acked & FLAG_RETRANS_DATA_ACKED)
    			last_ackt = net_invalid_timestamp();

    		if (ca_ops->pkts_acked)
    			ca_ops->pkts_acked(sk, pkts_acked, last_ackt);
    	}

    #if FASTRETRANS_DEBUG > 0
    	BUG_TRAP((int)tp->sacked_out >= 0);
    	BUG_TRAP((int)tp->lost_out >= 0);
    	BUG_TRAP((int)tp->retrans_out >= 0);
    	/* With nothing outstanding, any non-zero counter is reported and
    	 * then zeroed.
    	 */
    	if (!tp->packets_out && tp->rx_opt.sack_ok) {
    		if (tp->lost_out) {
    			printk(KERN_DEBUG "Leak l=%u %d\n",
    			       tp->lost_out, icsk->icsk_ca_state);
    			tp->lost_out = 0;
    		}
    		if (tp->sacked_out) {
    			printk(KERN_DEBUG "Leak s=%u %d\n",
    			       tp->sacked_out, icsk->icsk_ca_state);
    			tp->sacked_out = 0;
    		}
    		if (tp->retrans_out) {
    			printk(KERN_DEBUG "Leak r=%u %d\n",
    			       tp->retrans_out, icsk->icsk_ca_state);
    			tp->retrans_out = 0;
    		}
    	}
    #endif
    	*seq_rtt_p = seq_rtt;
    	return acked;
    }
    
    static void tcp_ack_probe(struct sock *sk)
    {
    
    	const struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* Was it a usable window open? */
    
    
    	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		   tp->snd_una + tp->snd_wnd)) {
    
    		icsk->icsk_backoff = 0;
    		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Socket must be waked up by subsequent tcp_data_snd_check().
    		 * This function is not for random using!
    		 */
    	} else {
    
    		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
    
    					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
    					  TCP_RTO_MAX);
    
    static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
    
    		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
    
    static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	const struct tcp_sock *tp = tcp_sk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
    
    		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* Check that window update is acceptable.
     * The function assumes that snd_una<=ack<=snd_next.
     */
    
    static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
    					const u32 ack_seq, const u32 nwin)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	return (after(ack, tp->snd_una) ||
    		after(ack_seq, tp->snd_wl1) ||
    		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
    }
    
    /* Update our send window.
     *
     * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
     * and in FreeBSD. NetBSD's one is even worse.) is wrong.
     *
     * The advertised window is read from the TCP header of @skb and, unless
     * the segment is a SYN, scaled by snd_wscale. When the update is
     * acceptable, snd_wl1 tracking, snd_wnd and max_window are refreshed.
     * snd_una is set to @ack in every case.
     *
     * Returns FLAG_WIN_UPDATE if the window update was accepted, else 0.
     */
    static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
    				 u32 ack_seq)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int flag = 0;
    	u32 nwin = ntohs(tcp_hdr(skb)->window);

    	if (likely(!tcp_hdr(skb)->syn))
    		nwin <<= tp->rx_opt.snd_wscale;

    	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
    		flag |= FLAG_WIN_UPDATE;
    		tcp_update_wl(tp, ack, ack_seq);

    		if (tp->snd_wnd != nwin) {
    			tp->snd_wnd = nwin;

    			/* Note, it is the only place, where
    			 * fast path is recovered for sending TCP.
    			 *
    			 * NOTE(review): only the clearing of pred_flags is
    			 * visible in this copy; a follow-up statement that
    			 * re-enables the fast path may have been lost.
    			 */
    			tp->pred_flags = 0;

    			if (nwin > tp->max_window) {
    				tp->max_window = nwin;
    				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
    			}
    		}
    	}

    	tp->snd_una = ack;

    	return flag;
    }
    
    
    /* A very conservative spurious RTO response algorithm: reduce cwnd and
     * continue in congestion avoidance.
     *
     * NOTE(review): the closing brace was absent from this copy and has been
     * restored; verify that no other statement was lost.
     */
    static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
    {
    	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
    }
    
    /* A conservative spurious RTO response algorithm: reduce cwnd using
     * rate halving and continue in congestion avoidance.
     *
     * Delegates entirely to tcp_enter_cwr(sk, 0).
     */
    static void tcp_ratehalving_spur_to_response(struct sock *sk)
    {
    	tcp_enter_cwr(sk, 0);
    }
    
    
    static void tcp_undo_spur_to_response(struct sock *sk, int flag)
    
    	if (flag&FLAG_ECE)
    		tcp_ratehalving_spur_to_response(sk);
    	else
    		tcp_undo_cwr(sk, 1);
    
    /* F-RTO spurious RTO detection algorithm (RFC4138)
     *
     * F-RTO affects during two new ACKs following RTO (well, almost, see inline
     * comments). State (ACK number) is kept in frto_counter. When ACK advances
     * window (but not to or beyond highest sequence sent before RTO):
     *   On First ACK,  send two new segments out.
     *   On Second ACK, RTO was likely spurious. Do spurious response (response
     *                  algorithm is not part of the F-RTO detection algorithm
     *                  given in RFC4138 but can be selected separately).
     * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
     * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
     * of Nagle, this is done using frto_counter states 2 and 3, when a new data
     * segment of any size sent during F-RTO, state 2 is upgraded to 3.
     *
     * Rationale: if the RTO was spurious, new ACKs should arrive from the
     * original window even after we transmit two new data segments.
     *
     * SACK version:
     *   on first step, wait until first cumulative ACK arrives, then move to
     *   the second step. In second step, the next ACK decides.
     *
     * F-RTO is implemented (mainly) in four functions:
     *   - tcp_use_frto() is used to determine if TCP can use F-RTO
     *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
     *     called when tcp_use_frto() showed green light
     *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
     *   - tcp_enter_frto_loss() is called if there is not enough evidence
     *     to prove that the RTO is indeed spurious. It transfers the control
     *     from F-RTO to the conventional RTO recovery
     *
     * NOTE(review): this copy of tcp_process_frto() is interleaved with
     * blame-view residue (author and commit lines) and statements appear to
     * have been lost: several blocks are never closed, one condition has no
     * "if" head, and no final return is visible. Restore the function from a
     * pristine tree before relying on it; the code below is left exactly as
     * found.
     */
    
    static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tcp_sync_left_out(tp);
    
    	/* Duplicate the behavior from Loss state (fastretrans_alert) */
    	if (flag&FLAG_DATA_ACKED)
    		inet_csk(sk)->icsk_retransmits = 0;
    
    
    	if (!before(tp->snd_una, tp->frto_highmark)) {
    
    		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
    
    	if (!IsSackFrto() || IsReno(tp)) {
    		/* RFC4138 shortcoming in step 2; should also have case c):
    		 * ACK isn't duplicate nor advances window, e.g., opposite dir
    		 * data, winupdate
    		 */
    		if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
    		    !(flag&FLAG_FORWARD_PROGRESS))
    			return 1;
    
    		if (!(flag&FLAG_DATA_ACKED)) {
    			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
    					    flag);
    			return 1;
    		}
    	} else {
    		if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
    			/* Prevent sending of new data. */
    			tp->snd_cwnd = min(tp->snd_cwnd,
    					   tcp_packets_in_flight(tp));
    			return 1;
    		}
    
    		    (!(flag&FLAG_FORWARD_PROGRESS) ||
    		     ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
    			/* RFC4138 shortcoming (see comment above) */
    			if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP))
    				return 1;
    
    			tcp_enter_frto_loss(sk, 3, flag);
    			return 1;
    		}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	if (tp->frto_counter == 1) {
    
    		/* Sending of the next skb must be allowed or no FRTO */
    		if (!tcp_send_head(sk) ||
    		    after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
    				     tp->snd_una + tp->snd_wnd)) {
    
    			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3),
    					    flag);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
    
    		switch (sysctl_tcp_frto_response) {
    		case 2:
    
    			tcp_undo_spur_to_response(sk, flag);
    
    			break;
    		case 1:
    			tcp_conservative_spur_to_response(tp);
    			break;
    		default:
    			tcp_ratehalving_spur_to_response(sk);
    			break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* This routine deals with incoming acks, but not outgoing ones. */
    static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
    {
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 prior_snd_una = tp->snd_una;
    	u32 ack_seq = TCP_SKB_CB(skb)->seq;
    	u32 ack = TCP_SKB_CB(skb)->ack_seq;
    	u32 prior_in_flight;
    	s32 seq_rtt;
    	int prior_packets;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* If the ack is newer than sent or older than previous acks
    	 * then we can probably ignore it.
    	 */
    	if (after(ack, tp->snd_nxt))
    		goto uninteresting_ack;
    
    	if (before(ack, prior_snd_una))
    		goto old_ack;
    
    
    	if (sysctl_tcp_abc) {
    		if (icsk->icsk_ca_state < TCP_CA_CWR)
    			tp->bytes_acked += ack - prior_snd_una;
    		else if (icsk->icsk_ca_state == TCP_CA_Loss)
    			/* we assume just one segment left network */
    			tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
    		/* Window is constant, pure forward advance.
    		 * No more checks are required.
    		 * Note, we use the fact that SND.UNA>=SND.WL2.
    		 */
    		tcp_update_wl(tp, ack, ack_seq);
    		tp->snd_una = ack;
    		flag |= FLAG_WIN_UPDATE;
    
    
    		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
    	} else {
    		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
    			flag |= FLAG_DATA;
    		else
    			NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
    
    
    		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		if (TCP_SKB_CB(skb)->sacked)
    			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
    
    
    		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			flag |= FLAG_ECE;
    
    
    		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	/* We passed data and got it acked, remove any soft error
    	 * log. Something worked...
    	 */
    	sk->sk_err_soft = 0;
    	tp->rcv_tstamp = tcp_time_stamp;
    	prior_packets = tp->packets_out;
    	if (!prior_packets)
    		goto no_queue;
    
    	prior_in_flight = tcp_packets_in_flight(tp);
    
    	/* See if we can take anything off of the retransmit queue. */
    
    	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (tp->frto_counter)
    
    		frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (tcp_ack_is_dubious(sk, flag)) {
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		/* Advance CWND, if state allows this. */
    
    		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
    		    tcp_may_raise_cwnd(sk, flag))
    
    			tcp_cong_avoid(sk, ack,  seq_rtt, prior_in_flight, 0);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
    	} else {
    
    		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
    
    			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
    		dst_confirm(sk->sk_dst_cache);
    
    	return 1;
    
    no_queue:
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* If this ack opens up a zero window, clear backoff.  It was
    	 * being used to time the probes, and is probably far higher than
    	 * it needs to be for normal retransmission.
    	 */
    
    	if (tcp_send_head(sk))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_ack_probe(sk);
    	return 1;
    
    old_ack:
    	if (TCP_SKB_CB(skb)->sacked)
    		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
    
    uninteresting_ack:
    	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
    	return 0;
    }
    
    
    /* Look for tcp options. Normally only called on SYN and SYNACK packets.
     * But, this can also be called on packets in the established flow when
     * the fast version below fails.
     */
    void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
    {
    	unsigned char *ptr;
    
    	struct tcphdr *th = tcp_hdr(skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	int length=(th->doff*4)-sizeof(struct tcphdr);
    
    	ptr = (unsigned char *)(th + 1);
    	opt_rx->saw_tstamp = 0;
    
    
    	while (length > 0) {
    
    		int opcode=*ptr++;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		int opsize;
    
    		switch (opcode) {
    			case TCPOPT_EOL:
    				return;
    			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
    				length--;
    				continue;
    			default:
    				opsize=*ptr++;
    				if (opsize < 2) /* "silly options" */
    					return;
    				if (opsize > length)
    					return;	/* don't parse partial options */
    
    				switch (opcode) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				case TCPOPT_MSS:
    
    					if (opsize==TCPOLEN_MSS && th->syn && !estab) {
    
    						u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    						if (in_mss) {
    							if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
    								in_mss = opt_rx->user_mss;
    							opt_rx->mss_clamp = in_mss;
    						}
    					}
    					break;
    				case TCPOPT_WINDOW:
    
    					if (opsize==TCPOLEN_WINDOW && th->syn && !estab)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    						if (sysctl_tcp_window_scaling) {
    							__u8 snd_wscale = *(__u8 *) ptr;
    							opt_rx->wscale_ok = 1;
    							if (snd_wscale > 14) {
    
    								if (net_ratelimit())
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    									printk(KERN_INFO "tcp_parse_options: Illegal window "
    									       "scaling value %d >14 received.\n",
    									       snd_wscale);
    								snd_wscale = 14;
    							}
    							opt_rx->snd_wscale = snd_wscale;
    						}
    					break;
    				case TCPOPT_TIMESTAMP:
    
    					if (opsize==TCPOLEN_TIMESTAMP) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    						if ((estab && opt_rx->tstamp_ok) ||
    						    (!estab && sysctl_tcp_timestamps)) {
    							opt_rx->saw_tstamp = 1;
    
    							opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
    							opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    						}
    					}
    					break;
    				case TCPOPT_SACK_PERM:
    
    					if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    						if (sysctl_tcp_sack) {
    							opt_rx->sack_ok = 1;
    							tcp_sack_reset(opt_rx);
    						}
    					}
    					break;
    
    				case TCPOPT_SACK:
    
    					if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
    					   opt_rx->sack_ok) {
    						TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
    					}
    
    #ifdef CONFIG_TCP_MD5SIG
    				case TCPOPT_MD5SIG:
    					/*
    					 * The MD5 Hash has already been
    					 * checked (see tcp_v{4,6}_do_rcv()).
    					 */
    					break;
    #endif
    
    				ptr+=opsize-2;
    				length-=opsize;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    }
    
    /* Fast parse options. This hopes to only see timestamps.
     * If it is wrong it falls back on tcp_parse_options().
     */
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
    				  struct tcp_sock *tp)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	if (th->doff == sizeof(struct tcphdr)>>2) {
    		tp->rx_opt.saw_tstamp = 0;
    		return 0;
    	} else if (tp->rx_opt.tstamp_ok &&
    		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
    
    		__be32 *ptr = (__be32 *)(th + 1);
    		if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
    			tp->rx_opt.saw_tstamp = 1;
    			++ptr;
    			tp->rx_opt.rcv_tsval = ntohl(*ptr);
    			++ptr;
    			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
    			return 1;
    		}
    	}
    	tcp_parse_options(skb, &tp->rx_opt, 1);
    	return 1;
    }
    
    static inline void tcp_store_ts_recent(struct tcp_sock *tp)
    {
    	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
    
    	tp->rx_opt.ts_recent_stamp = get_seconds();
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
    {
    	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
    		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
    		 * extra check below makes sure this can only happen
    		 * for pure ACK frames.  -DaveM
    		 *
    		 * Not only, also it occurs for expired timestamps.
    		 */
    
    
    		if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
    
    		   get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)