Skip to content
Snippets Groups Projects
tcp_input.c 132 KiB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
			BUG_TRAP(tp->retrans_out == 0);
		tp->retrans_stamp = 0;
	} else if (!before(tp->snd_una, tp->high_seq)) {
		switch (icsk->icsk_ca_state) {
Linus Torvalds's avatar
Linus Torvalds committed
		case TCP_CA_Loss:
Linus Torvalds's avatar
Linus Torvalds committed
			if (tcp_try_undo_recovery(sk, tp))
				return;
			break;

		case TCP_CA_CWR:
			/* CWR is to be held something *above* high_seq
			 * is ACKed for CWR bit to reach receiver. */
			if (tp->snd_una != tp->high_seq) {
				tcp_complete_cwr(sk);
				tcp_set_ca_state(sk, TCP_CA_Open);
Linus Torvalds's avatar
Linus Torvalds committed
			}
			break;

		case TCP_CA_Disorder:
			tcp_try_undo_dsack(sk, tp);
			if (!tp->undo_marker ||
			    /* For SACK case do not Open to allow to undo
			     * catching for all duplicate ACKs. */
			    IsReno(tp) || tp->snd_una != tp->high_seq) {
				tp->undo_marker = 0;
				tcp_set_ca_state(sk, TCP_CA_Open);
Linus Torvalds's avatar
Linus Torvalds committed
			}
			break;

		case TCP_CA_Recovery:
			if (IsReno(tp))
				tcp_reset_reno_sack(tp);
			if (tcp_try_undo_recovery(sk, tp))
				return;
Linus Torvalds's avatar
Linus Torvalds committed
			break;
		}
	}

	/* F. Process state. */
	switch (icsk->icsk_ca_state) {
Linus Torvalds's avatar
Linus Torvalds committed
	case TCP_CA_Recovery:
		if (prior_snd_una == tp->snd_una) {
			if (IsReno(tp) && is_dupack)
Linus Torvalds's avatar
Linus Torvalds committed
		} else {
			int acked = prior_packets - tp->packets_out;
			if (IsReno(tp))
				tcp_remove_reno_sacks(sk, tp, acked);
			is_dupack = tcp_try_undo_partial(sk, tp, acked);
		}
		break;
	case TCP_CA_Loss:
		if (flag&FLAG_DATA_ACKED)
Linus Torvalds's avatar
Linus Torvalds committed
		if (!tcp_try_undo_loss(sk, tp)) {
			tcp_moderate_cwnd(tp);
			tcp_xmit_retransmit_queue(sk);
			return;
		}
		if (icsk->icsk_ca_state != TCP_CA_Open)
Linus Torvalds's avatar
Linus Torvalds committed
			return;
		/* Loss is undone; fall through to processing in Open state. */
	default:
		if (IsReno(tp)) {
			if (tp->snd_una != prior_snd_una)
				tcp_reset_reno_sack(tp);
			if (is_dupack)
		if (icsk->icsk_ca_state == TCP_CA_Disorder)
Linus Torvalds's avatar
Linus Torvalds committed
			tcp_try_undo_dsack(sk, tp);

		if (!tcp_time_to_recover(sk, tp)) {
			tcp_try_to_open(sk, tp, flag);
			return;
		}

John Heffner's avatar
John Heffner committed
		/* MTU probe failure: don't reduce cwnd */
		if (icsk->icsk_ca_state < TCP_CA_CWR &&
		    icsk->icsk_mtup.probe_size &&
		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
John Heffner's avatar
John Heffner committed
			tcp_mtup_probe_failed(sk);
			/* Restores the reduction we did in tcp_mtup_probe() */
			tp->snd_cwnd++;
			tcp_simple_retransmit(sk);
			return;
		}

Linus Torvalds's avatar
Linus Torvalds committed
		/* Otherwise enter Recovery state */

		if (IsReno(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);

		tp->high_seq = tp->snd_nxt;
		tp->prior_ssthresh = 0;
		tp->undo_marker = tp->snd_una;
		tp->undo_retrans = tp->retrans_out;

		if (icsk->icsk_ca_state < TCP_CA_CWR) {
Linus Torvalds's avatar
Linus Torvalds committed
			if (!(flag&FLAG_ECE))
				tp->prior_ssthresh = tcp_current_ssthresh(sk);
			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
Linus Torvalds's avatar
Linus Torvalds committed
			TCP_ECN_queue_cwr(tp);
		}

		tp->bytes_acked = 0;
Linus Torvalds's avatar
Linus Torvalds committed
		tp->snd_cwnd_cnt = 0;
		tcp_set_ca_state(sk, TCP_CA_Recovery);
Linus Torvalds's avatar
Linus Torvalds committed
	}

	if (is_dupack || tcp_head_timedout(sk, tp))
		tcp_update_scoreboard(sk, tp);
Linus Torvalds's avatar
Linus Torvalds committed
	tcp_xmit_retransmit_queue(sk);
}

/* Read draft-ietf-tcplw-high-performance before mucking
Stephen Hemminger's avatar
Stephen Hemminger committed
 * with this code. (Supersedes RFC1323)
Linus Torvalds's avatar
Linus Torvalds committed
 */
static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
Linus Torvalds's avatar
Linus Torvalds committed
{
	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
	 * acknowledges some new data, i.e., only if it advances the
	 * left edge of the send window.
	 *
	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
	 *
	 * Changed: reset backoff as soon as we see the first valid sample.
Stephen Hemminger's avatar
Stephen Hemminger committed
	 * If we do not, we get strongly overestimated rto. With timestamps
Linus Torvalds's avatar
Linus Torvalds committed
	 * samples are accepted even from very old segments: f.e., when rtt=1
	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
	 * answer arrives rto becomes 120 seconds! If at least one of segments
	 * in window is lost... Voila.	 			--ANK (010210)
	 */
	struct tcp_sock *tp = tcp_sk(sk);
	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
	tcp_rtt_estimator(sk, seq_rtt);
	tcp_set_rto(sk);
	inet_csk(sk)->icsk_backoff = 0;
	tcp_bound_rto(sk);
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
Linus Torvalds's avatar
Linus Torvalds committed
{
	/* We don't have a timestamp. Can only use
	 * packets that are not retransmitted to determine
	 * rtt estimates. Also, we must not reset the
	 * backoff for rto until we get a non-retransmitted
	 * packet. This allows us to deal with a situation
	 * where the network delay has increased suddenly.
	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
	 */

	if (flag & FLAG_RETRANS_DATA_ACKED)
		return;

	tcp_rtt_estimator(sk, seq_rtt);
	tcp_set_rto(sk);
	inet_csk(sk)->icsk_backoff = 0;
	tcp_bound_rto(sk);
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
				      const s32 seq_rtt)
Linus Torvalds's avatar
Linus Torvalds committed
{
	const struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
		tcp_ack_saw_tstamp(sk, flag);
Linus Torvalds's avatar
Linus Torvalds committed
	else if (seq_rtt >= 0)
		tcp_ack_no_tstamp(sk, seq_rtt, flag);
Stephen Hemminger's avatar
Stephen Hemminger committed
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
			   u32 in_flight, int good)
Linus Torvalds's avatar
Linus Torvalds committed
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
Linus Torvalds's avatar
Linus Torvalds committed
}

/* Restart timer after forward progress on connection.
 * RFC2988 recommends to restart timer to now+rto.
 */

Stephen Hemminger's avatar
Stephen Hemminger committed
static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (!tp->packets_out) {
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
Linus Torvalds's avatar
Linus Torvalds committed
	} else {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
Linus Torvalds's avatar
Linus Torvalds committed
	}
}

static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
			 __u32 now, __s32 *seq_rtt)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 
	__u32 seq = tp->snd_una;
	__u32 packets_acked;
	int acked = 0;

	/* If we get here, the whole TSO packet has not been
	 * acked.
	 */
	BUG_ON(!after(scb->end_seq, seq));

	packets_acked = tcp_skb_pcount(skb);
	if (tcp_trim_head(sk, skb, seq - scb->seq))
		return 0;
	packets_acked -= tcp_skb_pcount(skb);

	if (packets_acked) {
		__u8 sacked = scb->sacked;

		acked |= FLAG_DATA_ACKED;
		if (sacked) {
			if (sacked & TCPCB_RETRANS) {
				if (sacked & TCPCB_SACKED_RETRANS)
					tp->retrans_out -= packets_acked;
				acked |= FLAG_RETRANS_DATA_ACKED;
				*seq_rtt = -1;
			} else if (*seq_rtt < 0)
				*seq_rtt = now - scb->when;
			if (sacked & TCPCB_SACKED_ACKED)
				tp->sacked_out -= packets_acked;
			if (sacked & TCPCB_LOST)
				tp->lost_out -= packets_acked;
			if (sacked & TCPCB_URG) {
				if (tp->urg_mode &&
				    !before(seq, tp->snd_up))
					tp->urg_mode = 0;
			}
		} else if (*seq_rtt < 0)
			*seq_rtt = now - scb->when;

		if (tp->fackets_out) {
			__u32 dval = min(tp->fackets_out, packets_acked);
			tp->fackets_out -= dval;
		}
		tp->packets_out -= packets_acked;

		BUG_ON(tcp_skb_pcount(skb) == 0);
		BUG_ON(!before(scb->seq, scb->end_seq));
	}

	return acked;
}

static u32 tcp_usrtt(struct timeval *tv)
	struct timeval now;

	do_gettimeofday(&now);
	return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec);
Linus Torvalds's avatar
Linus Torvalds committed

/* Remove acknowledged frames from the retransmission queue. */
static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
	struct sk_buff *skb;
	__u32 now = tcp_time_stamp;
	int acked = 0;
	__s32 seq_rtt = -1;
	void (*rtt_sample)(struct sock *sk, u32 usrtt)
		= icsk->icsk_ca_ops->rtt_sample;
	struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };
Linus Torvalds's avatar
Linus Torvalds committed

	while ((skb = skb_peek(&sk->sk_write_queue)) &&
	       skb != sk->sk_send_head) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 
		__u8 sacked = scb->sacked;

		/* If our packet is before the ack sequence we can
		 * discard it as it's confirmed to have arrived at
		 * the other end.
		 */
		if (after(scb->end_seq, tp->snd_una)) {
			if (tcp_skb_pcount(skb) > 1 &&
			    after(tp->snd_una, scb->seq))
Linus Torvalds's avatar
Linus Torvalds committed
				acked |= tcp_tso_acked(sk, skb,
						       now, &seq_rtt);
			break;
		}

		/* Initial outgoing SYN's get put onto the write_queue
		 * just like anything else we transmit.  It is not
		 * true data, and if we misinform our callers that
		 * this ACK acks real data, we will erroneously exit
		 * connection startup slow start one packet too
		 * quickly.  This is severely frowned upon behavior.
		 */
		if (!(scb->flags & TCPCB_FLAG_SYN)) {
			acked |= FLAG_DATA_ACKED;
Linus Torvalds's avatar
Linus Torvalds committed
		} else {
			acked |= FLAG_SYN_ACKED;
			tp->retrans_stamp = 0;
		}

John Heffner's avatar
John Heffner committed
		/* MTU probing checks */
		if (icsk->icsk_mtup.probe_size) {
			if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
John Heffner's avatar
John Heffner committed
				tcp_mtup_probe_success(sk, skb);
			}
		}

Linus Torvalds's avatar
Linus Torvalds committed
		if (sacked) {
			if (sacked & TCPCB_RETRANS) {
				if(sacked & TCPCB_SACKED_RETRANS)
					tp->retrans_out -= tcp_skb_pcount(skb);
				acked |= FLAG_RETRANS_DATA_ACKED;
				seq_rtt = -1;
			} else if (seq_rtt < 0) {
Linus Torvalds's avatar
Linus Torvalds committed
				seq_rtt = now - scb->when;
				skb_get_timestamp(skb, &tv);
Linus Torvalds's avatar
Linus Torvalds committed
			if (sacked & TCPCB_SACKED_ACKED)
				tp->sacked_out -= tcp_skb_pcount(skb);
			if (sacked & TCPCB_LOST)
				tp->lost_out -= tcp_skb_pcount(skb);
			if (sacked & TCPCB_URG) {
				if (tp->urg_mode &&
				    !before(scb->end_seq, tp->snd_up))
					tp->urg_mode = 0;
			}
		} else if (seq_rtt < 0) {
Linus Torvalds's avatar
Linus Torvalds committed
			seq_rtt = now - scb->when;
			skb_get_timestamp(skb, &tv);
Linus Torvalds's avatar
Linus Torvalds committed
		tcp_dec_pcount_approx(&tp->fackets_out, skb);
		tcp_packets_out_dec(tp, skb);
David S. Miller's avatar
David S. Miller committed
		__skb_unlink(skb, &sk->sk_write_queue);
Linus Torvalds's avatar
Linus Torvalds committed
		sk_stream_free_skb(sk, skb);
		clear_all_retrans_hints(tp);
Linus Torvalds's avatar
Linus Torvalds committed
	}

	if (acked&FLAG_ACKED) {
		tcp_ack_update_rtt(sk, acked, seq_rtt);
Linus Torvalds's avatar
Linus Torvalds committed
		tcp_ack_packets_out(sk, tp);
		if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED))
			(*rtt_sample)(sk, tcp_usrtt(&tv));
		if (icsk->icsk_ca_ops->pkts_acked)
			icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
Linus Torvalds's avatar
Linus Torvalds committed
	}

#if FASTRETRANS_DEBUG > 0
	BUG_TRAP((int)tp->sacked_out >= 0);
	BUG_TRAP((int)tp->lost_out >= 0);
	BUG_TRAP((int)tp->retrans_out >= 0);
	if (!tp->packets_out && tp->rx_opt.sack_ok) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
		if (tp->lost_out) {
			printk(KERN_DEBUG "Leak l=%u %d\n",
			       tp->lost_out, icsk->icsk_ca_state);
Linus Torvalds's avatar
Linus Torvalds committed
			tp->lost_out = 0;
		}
		if (tp->sacked_out) {
			printk(KERN_DEBUG "Leak s=%u %d\n",
			       tp->sacked_out, icsk->icsk_ca_state);
Linus Torvalds's avatar
Linus Torvalds committed
			tp->sacked_out = 0;
		}
		if (tp->retrans_out) {
			printk(KERN_DEBUG "Leak r=%u %d\n",
			       tp->retrans_out, icsk->icsk_ca_state);
Linus Torvalds's avatar
Linus Torvalds committed
			tp->retrans_out = 0;
		}
	}
#endif
	*seq_rtt_p = seq_rtt;
	return acked;
}

static void tcp_ack_probe(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds's avatar
Linus Torvalds committed

	/* Was it a usable window open? */

	if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
		   tp->snd_una + tp->snd_wnd)) {
		icsk->icsk_backoff = 0;
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
Linus Torvalds's avatar
Linus Torvalds committed
		/* Socket must be waked up by subsequent tcp_data_snd_check().
		 * This function is not for random using!
		 */
	} else {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
					  TCP_RTO_MAX);
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
Linus Torvalds's avatar
Linus Torvalds committed
{
	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
Linus Torvalds's avatar
Linus Torvalds committed
{
	const struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
Linus Torvalds's avatar
Linus Torvalds committed
}

/* Check that window update is acceptable.
 * The function assumes that snd_una<=ack<=snd_next.
 */
static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
					const u32 ack_seq, const u32 nwin)
Linus Torvalds's avatar
Linus Torvalds committed
{
	return (after(ack, tp->snd_una) ||
		after(ack_seq, tp->snd_wl1) ||
		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
}

/* Update our send window.
 *
 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
 */
static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
				 struct sk_buff *skb, u32 ack, u32 ack_seq)
{
	int flag = 0;
	u32 nwin = ntohs(skb->h.th->window);

	if (likely(!skb->h.th->syn))
		nwin <<= tp->rx_opt.snd_wscale;

	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
		flag |= FLAG_WIN_UPDATE;
		tcp_update_wl(tp, ack, ack_seq);

		if (tp->snd_wnd != nwin) {
			tp->snd_wnd = nwin;

			/* Note, it is the only place, where
			 * fast path is recovered for sending TCP.
			 */
			tp->pred_flags = 0;
Linus Torvalds's avatar
Linus Torvalds committed
			tcp_fast_path_check(sk, tp);

			if (nwin > tp->max_window) {
				tp->max_window = nwin;
				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
Linus Torvalds's avatar
Linus Torvalds committed
			}
		}
	}

	tp->snd_una = ack;

	return flag;
}

static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
{
	struct tcp_sock *tp = tcp_sk(sk);
	
	tcp_sync_left_out(tp);
	
	if (tp->snd_una == prior_snd_una ||
	    !before(tp->snd_una, tp->frto_highmark)) {
		/* RTO was caused by loss, start retransmitting in
		 * go-back-N slow start
		 */
		tcp_enter_frto_loss(sk);
		return;
	}

	if (tp->frto_counter == 1) {
		/* First ACK after RTO advances the window: allow two new
		 * segments out.
		 */
		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
	} else {
		/* Also the second ACK after RTO advances the window.
		 * The RTO was likely spurious. Reduce cwnd and continue
		 * in congestion avoidance
		 */
		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
		tcp_moderate_cwnd(tp);
	}

	/* F-RTO affects on two new ACKs following RTO.
Stephen Hemminger's avatar
Stephen Hemminger committed
	 * At latest on third ACK the TCP behavior is back to normal.
Linus Torvalds's avatar
Linus Torvalds committed
	 */
	tp->frto_counter = (tp->frto_counter + 1) % 3;
}

/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_snd_una = tp->snd_una;
	u32 ack_seq = TCP_SKB_CB(skb)->seq;
	u32 ack = TCP_SKB_CB(skb)->ack_seq;
	u32 prior_in_flight;
	s32 seq_rtt;
	int prior_packets;

	/* If the ack is newer than sent or older than previous acks
	 * then we can probably ignore it.
	 */
	if (after(ack, tp->snd_nxt))
		goto uninteresting_ack;

	if (before(ack, prior_snd_una))
		goto old_ack;

	if (sysctl_tcp_abc) {
		if (icsk->icsk_ca_state < TCP_CA_CWR)
			tp->bytes_acked += ack - prior_snd_una;
		else if (icsk->icsk_ca_state == TCP_CA_Loss)
			/* we assume just one segment left network */
			tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
	}
Linus Torvalds's avatar
Linus Torvalds committed
	if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
		/* Window is constant, pure forward advance.
		 * No more checks are required.
		 * Note, we use the fact that SND.UNA>=SND.WL2.
		 */
		tcp_update_wl(tp, ack, ack_seq);
		tp->snd_una = ack;
		flag |= FLAG_WIN_UPDATE;

		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
Linus Torvalds's avatar
Linus Torvalds committed
		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
	} else {
		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
			flag |= FLAG_DATA;
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);

		flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);

		if (TCP_SKB_CB(skb)->sacked)
			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);

		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
			flag |= FLAG_ECE;

		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
Linus Torvalds's avatar
Linus Torvalds committed
	}

	/* We passed data and got it acked, remove any soft error
	 * log. Something worked...
	 */
	sk->sk_err_soft = 0;
	tp->rcv_tstamp = tcp_time_stamp;
	prior_packets = tp->packets_out;
	if (!prior_packets)
		goto no_queue;

	prior_in_flight = tcp_packets_in_flight(tp);

	/* See if we can take anything off of the retransmit queue. */
	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
Linus Torvalds's avatar
Linus Torvalds committed

	if (tp->frto_counter)
		tcp_process_frto(sk, prior_snd_una);

	if (tcp_ack_is_dubious(sk, flag)) {
Stephen Hemminger's avatar
Stephen Hemminger committed
		/* Advance CWND, if state allows this. */
		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
			tcp_cong_avoid(sk, ack,  seq_rtt, prior_in_flight, 0);
Linus Torvalds's avatar
Linus Torvalds committed
		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
	} else {
			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
Linus Torvalds's avatar
Linus Torvalds committed
	}

	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
		dst_confirm(sk->sk_dst_cache);

	return 1;

no_queue:
Linus Torvalds's avatar
Linus Torvalds committed

	/* If this ack opens up a zero window, clear backoff.  It was
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */
	if (sk->sk_send_head)
		tcp_ack_probe(sk);
	return 1;

old_ack:
	if (TCP_SKB_CB(skb)->sacked)
		tcp_sacktag_write_queue(sk, skb, prior_snd_una);

uninteresting_ack:
	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
	return 0;
}


/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 */
void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
{
	unsigned char *ptr;
	struct tcphdr *th = skb->h.th;
	int length=(th->doff*4)-sizeof(struct tcphdr);

	ptr = (unsigned char *)(th + 1);
	opt_rx->saw_tstamp = 0;

	while(length>0) {
	  	int opcode=*ptr++;
		int opsize;

		switch (opcode) {
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				continue;
			default:
				opsize=*ptr++;
				if (opsize < 2) /* "silly options" */
					return;
				if (opsize > length)
					return;	/* don't parse partial options */
	  			switch(opcode) {
				case TCPOPT_MSS:
					if(opsize==TCPOLEN_MSS && th->syn && !estab) {
						u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
Linus Torvalds's avatar
Linus Torvalds committed
						if (in_mss) {
							if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
								in_mss = opt_rx->user_mss;
							opt_rx->mss_clamp = in_mss;
						}
					}
					break;
				case TCPOPT_WINDOW:
					if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
						if (sysctl_tcp_window_scaling) {
							__u8 snd_wscale = *(__u8 *) ptr;
							opt_rx->wscale_ok = 1;
							if (snd_wscale > 14) {
								if(net_ratelimit())
									printk(KERN_INFO "tcp_parse_options: Illegal window "
									       "scaling value %d >14 received.\n",
									       snd_wscale);
								snd_wscale = 14;
							}
							opt_rx->snd_wscale = snd_wscale;
						}
					break;
				case TCPOPT_TIMESTAMP:
					if(opsize==TCPOLEN_TIMESTAMP) {
						if ((estab && opt_rx->tstamp_ok) ||
						    (!estab && sysctl_tcp_timestamps)) {
							opt_rx->saw_tstamp = 1;
							opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
							opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
Linus Torvalds's avatar
Linus Torvalds committed
						}
					}
					break;
				case TCPOPT_SACK_PERM:
					if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
						if (sysctl_tcp_sack) {
							opt_rx->sack_ok = 1;
							tcp_sack_reset(opt_rx);
						}
					}
					break;

				case TCPOPT_SACK:
					if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
					   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
					   opt_rx->sack_ok) {
						TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
					}
#ifdef CONFIG_TCP_MD5SIG
				case TCPOPT_MD5SIG:
					/*
					 * The MD5 Hash has already been
					 * checked (see tcp_v{4,6}_do_rcv()).
					 */
					break;
#endif
Linus Torvalds's avatar
Linus Torvalds committed
	  			};
	  			ptr+=opsize-2;
	  			length-=opsize;
	  	};
	}
}

/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 */
Stephen Hemminger's avatar
Stephen Hemminger committed
static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
				  struct tcp_sock *tp)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (th->doff == sizeof(struct tcphdr)>>2) {
		tp->rx_opt.saw_tstamp = 0;
		return 0;
	} else if (tp->rx_opt.tstamp_ok &&
		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
		__be32 *ptr = (__be32 *)(th + 1);
		if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
Linus Torvalds's avatar
Linus Torvalds committed
				  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
			tp->rx_opt.saw_tstamp = 1;
			++ptr;
			tp->rx_opt.rcv_tsval = ntohl(*ptr);
			++ptr;
			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
			return 1;
		}
	}
	tcp_parse_options(skb, &tp->rx_opt, 1);
	return 1;
}

static inline void tcp_store_ts_recent(struct tcp_sock *tp)
{
	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
	tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
}

static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
		 * extra check below makes sure this can only happen
		 * for pure ACK frames.  -DaveM
		 *
		 * Not only, also it occurs for expired timestamps.
		 */

		if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
		   xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
			tcp_store_ts_recent(tp);
	}
}

/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 *
 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
 * it can pass through stack. So, the following predicate verifies that
 * this segment is not used for anything but congestion avoidance or
 * fast retransmit. Moreover, we even are able to eliminate most of such
 * second order effects, if we apply some small "replay" window (~RTO)
 * to timestamp space.
 *
 * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled fastly,
 * but it guarantees that such events will be very rare and do not affect
 * connection seriously. This doesn't look nice, but alas, PAWS is really
 * buggy extension.
 *
 * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
 * states that events when retransmit arrives after original data are rare.
 * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
 * the biggest problem on large power networks even with minor reordering.
 * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
 * up to bandwidth of 18Gigabit/sec. 8) ]
 */

static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
	struct tcphdr *th = skb->h.th;
	u32 seq = TCP_SKB_CB(skb)->seq;
	u32 ack = TCP_SKB_CB(skb)->ack_seq;

	return (/* 1. Pure ACK with correct sequence number. */
		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&

		/* 2. ... and duplicate ACK. */
		ack == tp->snd_una &&

		/* 3. ... and does not update window. */
		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&

		/* 4. ... and sits in replay window. */
		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
{
	const struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
	return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
		xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
		!tcp_disordered_ack(sk, skb));
Linus Torvalds's avatar
Linus Torvalds committed
}

/* Check segment sequence number for validity.
 *
 * Segment controls are considered valid, if the segment
 * fits to the window after truncation to the window. Acceptability
 * of data (and SYN, FIN, of course) is checked separately.
 * See tcp_data_queue(), for example.
 *
 * Also, controls (RST is main one) are accepted using RCV.WUP instead
 * of RCV.NXT. Peer still did not advance his SND.UNA when we
 * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
 * (borrowed from freebsd)
 */

static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
{
	return	!before(end_seq, tp->rcv_wup) &&
		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}

/* When we get a reset we do this. */
static void tcp_reset(struct sock *sk)
{
	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
		case TCP_SYN_SENT:
			sk->sk_err = ECONNREFUSED;
			break;
		case TCP_CLOSE_WAIT:
			sk->sk_err = EPIPE;
			break;
		case TCP_CLOSE:
			return;
		default:
			sk->sk_err = ECONNRESET;
	}

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_error_report(sk);

	tcp_done(sk);
}

/*
 * 	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */
static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	struct tcp_sock *tp = tcp_sk(sk);

	inet_csk_schedule_ack(sk);
Linus Torvalds's avatar
Linus Torvalds committed

	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(sk, SOCK_DONE);

	switch (sk->sk_state) {
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:
			/* Move to CLOSE_WAIT */
			tcp_set_state(sk, TCP_CLOSE_WAIT);
			inet_csk(sk)->icsk_ack.pingpong = 1;
Linus Torvalds's avatar
Linus Torvalds committed
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/* Received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_LAST_ACK:
			/* RFC793: Remain in the LAST-ACK state. */
			break;

		case TCP_FIN_WAIT1:
			/* This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 */
			tcp_send_ack(sk);
			tcp_set_state(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/* Received a FIN -- send ACK and enter TIME_WAIT. */
			tcp_send_ack(sk);
			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
			break;
		default:
			/* Only TCP_LISTEN and TCP_CLOSE are left, in these
			 * cases we should never reach this piece of code.
			 */
			printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
			       __FUNCTION__, sk->sk_state);
			break;
	};

	/* It _is_ possible, that we have something out-of-order _after_ FIN.
	 * Probably, we should reset in this case. For now drop them.
	 */
	__skb_queue_purge(&tp->out_of_order_queue);
	if (tp->rx_opt.sack_ok)
		tcp_sack_reset(&tp->rx_opt);
	sk_stream_mem_reclaim(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, 1, POLL_HUP);
		else
			sk_wake_async(sk, 1, POLL_IN);
	}
}

Stephen Hemminger's avatar
Stephen Hemminger committed
static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
		if (before(seq, sp->start_seq))
			sp->start_seq = seq;
		if (after(end_seq, sp->end_seq))
			sp->end_seq = end_seq;
		return 1;
	}
	return 0;
}

Stephen Hemminger's avatar
Stephen Hemminger committed
static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
		if (before(seq, tp->rcv_nxt))
			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);

		tp->rx_opt.dsack = 1;
		tp->duplicate_sack[0].start_seq = seq;
		tp->duplicate_sack[0].end_seq = end_seq;
		tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
	}
}

Stephen Hemminger's avatar
Stephen Hemminger committed
static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (!tp->rx_opt.dsack)
		tcp_dsack_set(tp, seq, end_seq);
	else
		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}

static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
		tcp_enter_quickack_mode(sk);
Linus Torvalds's avatar
Linus Torvalds committed

		if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
			u32 end_seq = TCP_SKB_CB(skb)->end_seq;

			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				end_seq = tp->rcv_nxt;
			tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
		}
	}

	tcp_send_ack(sk);
}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
	int this_sack;
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	struct tcp_sack_block *swalk = sp+1;

	/* See if the recent change to the first SACK eats into