Skip to content
Snippets Groups Projects
tcp_input.c 156 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    		if (tp->retrans_out) {
    			printk(KERN_DEBUG "Leak r=%u %d\n",
    
    			       tp->retrans_out, icsk->icsk_ca_state);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tp->retrans_out = 0;
    		}
    	}
    #endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    static void tcp_ack_probe(struct sock *sk)
    {
    
    	const struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* Was it a usable window open? */
    
    
    	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
    
    		icsk->icsk_backoff = 0;
    		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Socket must be waked up by subsequent tcp_data_snd_check().
    		 * This function is not for random using!
    		 */
    	} else {
    
    		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
    
    					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
    					  TCP_RTO_MAX);
    
    static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
    
    		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
    
    static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	const struct tcp_sock *tp = tcp_sk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
    
    		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* Check that window update is acceptable.
     * The function assumes that snd_una<=ack<=snd_next.
     */
    
    static inline int tcp_may_update_window(const struct tcp_sock *tp,
    					const u32 ack, const u32 ack_seq,
    					const u32 nwin)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	return (after(ack, tp->snd_una) ||
    		after(ack_seq, tp->snd_wl1) ||
    		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
    }
    
    /* Update our send window.
     *
     * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
     * and in FreeBSD. NetBSD's one is even worse.) is wrong.
     */
    
    static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
    				 u32 ack_seq)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 nwin = ntohs(tcp_hdr(skb)->window);
    	int flag = 0;

    	/* The advertised window of a SYN segment is used unscaled. */
    	if (likely(!tcp_hdr(skb)->syn))
    		nwin <<= tp->rx_opt.snd_wscale;

    	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
    		flag = FLAG_WIN_UPDATE;
    		tcp_update_wl(tp, ack, ack_seq);

    		if (nwin != tp->snd_wnd) {
    			tp->snd_wnd = nwin;

    			/* Note, it is the only place, where
    			 * fast path is recovered for sending TCP.
    			 *
    			 * NOTE(review): only pred_flags is cleared here; no
    			 * call that re-enables the fast path is visible --
    			 * confirm none was lost.
    			 */
    			tp->pred_flags = 0;

    			/* Track the largest window seen and resync the MSS
    			 * whenever a new maximum is recorded.
    			 */
    			if (tp->max_window < nwin) {
    				tp->max_window = nwin;
    				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
    			}
    		}
    	}

    	/* snd_una follows the ACK whether or not the window changed. */
    	tp->snd_una = ack;

    	return flag;
    }
    
    
    /* A very conservative spurious RTO response algorithm: reduce cwnd and
     * continue in congestion avoidance.
     */
    static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
    {
    	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
    
    /* A conservative spurious RTO response: reduce cwnd using rate halving
     * and continue in congestion avoidance.
     *
     * Implemented as a thin wrapper that calls tcp_enter_cwr() on @sk with a
     * second argument of 0.
     */
    static void tcp_ratehalving_spur_to_response(struct sock *sk)
    {
    	tcp_enter_cwr(sk, 0);
    }
    
    
    static void tcp_undo_spur_to_response(struct sock *sk, int flag)
    
    	if (flag & FLAG_ECE)
    
    		tcp_ratehalving_spur_to_response(sk);
    	else
    		tcp_undo_cwr(sk, 1);
    
    /* F-RTO spurious RTO detection algorithm (RFC4138)
     *
    
     * F-RTO affects during two new ACKs following RTO (well, almost, see inline
     * comments). State (ACK number) is kept in frto_counter. When ACK advances
     * window (but not to or beyond highest sequence sent before RTO):
    
     *   On First ACK,  send two new segments out.
     *   On Second ACK, RTO was likely spurious. Do spurious response (response
     *                  algorithm is not part of the F-RTO detection algorithm
     *                  given in RFC4138 but can be selected separately).
     * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
    
     * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
     * of Nagle, this is done using frto_counter states 2 and 3, when a new data
     * segment of any size sent during F-RTO, state 2 is upgraded to 3.
    
     *
     * Rationale: if the RTO was spurious, new ACKs should arrive from the
     * original window even after we transmit two new data segments.
     *
    
     * SACK version:
     *   on first step, wait until first cumulative ACK arrives, then move to
     *   the second step. In second step, the next ACK decides.
     *
    
     * F-RTO is implemented (mainly) in four functions:
     *   - tcp_use_frto() is used to determine whether TCP can use F-RTO
     *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
     *     called when tcp_use_frto() showed green light
     *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
     *   - tcp_enter_frto_loss() is called if there is not enough evidence
     *     to prove that the RTO is indeed spurious. It transfers the control
     *     from F-RTO to the conventional RTO recovery
     */
    
    static int tcp_process_frto(struct sock *sk, int flag)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* Duplicate the behavior from Loss state (fastretrans_alert) */
    
    	if (flag & FLAG_DATA_ACKED)
    
    		inet_csk(sk)->icsk_retransmits = 0;
    
    
    	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
    	    ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
    		tp->undo_marker = 0;
    
    
    	if (!before(tp->snd_una, tp->frto_highmark)) {
    
    		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
    
    	if (!tcp_is_sackfrto(tp)) {
    
    		/* RFC4138 shortcoming in step 2; should also have case c):
    		 * ACK isn't duplicate nor advances window, e.g., opposite dir
    		 * data, winupdate
    		 */
    
    		if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
    
    		if (!(flag & FLAG_DATA_ACKED)) {
    
    			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
    					    flag);
    			return 1;
    		}
    	} else {
    
    		if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
    
    			/* Prevent sending of new data. */
    			tp->snd_cwnd = min(tp->snd_cwnd,
    					   tcp_packets_in_flight(tp));
    			return 1;
    		}
    
    		    (!(flag & FLAG_FORWARD_PROGRESS) ||
    		     ((flag & FLAG_DATA_SACKED) &&
    		      !(flag & FLAG_ONLY_ORIG_SACKED)))) {
    
    			/* RFC4138 shortcoming (see comment above) */
    
    			if (!(flag & FLAG_FORWARD_PROGRESS) &&
    			    (flag & FLAG_NOT_DUP))
    
    				return 1;
    
    			tcp_enter_frto_loss(sk, 3, flag);
    			return 1;
    		}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	if (tp->frto_counter == 1) {
    
    		/* tcp_may_send_now needs to see updated state */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
    
    
    		if (!tcp_may_send_now(sk))
    			tcp_enter_frto_loss(sk, 2, flag);
    
    
    		switch (sysctl_tcp_frto_response) {
    		case 2:
    
    			tcp_undo_spur_to_response(sk, flag);
    
    			break;
    		case 1:
    			tcp_conservative_spur_to_response(tp);
    			break;
    		default:
    			tcp_ratehalving_spur_to_response(sk);
    			break;
    
    		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* This routine deals with incoming acks, but not outgoing ones. */
    static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
    {
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 prior_snd_una = tp->snd_una;
    	u32 ack_seq = TCP_SKB_CB(skb)->seq;
    	u32 ack = TCP_SKB_CB(skb)->ack_seq;
    	u32 prior_in_flight;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	int prior_packets;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* If the ack is newer than sent or older than previous acks
    	 * then we can probably ignore it.
    	 */
    	if (after(ack, tp->snd_nxt))
    		goto uninteresting_ack;
    
    	if (before(ack, prior_snd_una))
    		goto old_ack;
    
    
    	if (after(ack, prior_snd_una))
    		flag |= FLAG_SND_UNA_ADVANCED;
    
    
    	if (sysctl_tcp_abc) {
    		if (icsk->icsk_ca_state < TCP_CA_CWR)
    			tp->bytes_acked += ack - prior_snd_una;
    		else if (icsk->icsk_ca_state == TCP_CA_Loss)
    			/* we assume just one segment left network */
    
    			tp->bytes_acked += min(ack - prior_snd_una,
    					       tp->mss_cache);
    
    	prior_fackets = tp->fackets_out;
    
    	prior_in_flight = tcp_packets_in_flight(tp);
    
    	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Window is constant, pure forward advance.
    		 * No more checks are required.
    		 * Note, we use the fact that SND.UNA>=SND.WL2.
    		 */
    		tcp_update_wl(tp, ack, ack_seq);
    		tp->snd_una = ack;
    		flag |= FLAG_WIN_UPDATE;
    
    
    		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
    
    		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	} else {
    		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
    			flag |= FLAG_DATA;
    		else
    
    			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		if (TCP_SKB_CB(skb)->sacked)
    			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
    
    
    		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			flag |= FLAG_ECE;
    
    
    		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	/* We passed data and got it acked, remove any soft error
    	 * log. Something worked...
    	 */
    	sk->sk_err_soft = 0;
    
    	icsk->icsk_probes_out = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tp->rcv_tstamp = tcp_time_stamp;
    	prior_packets = tp->packets_out;
    	if (!prior_packets)
    		goto no_queue;
    
    	/* See if we can take anything off of the retransmit queue. */
    
    	flag |= tcp_clean_rtx_queue(sk, prior_fackets);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (tp->frto_counter)
    		frto_cwnd = tcp_process_frto(sk, flag);
    
    	/* Guarantee sacktag reordering detection against wrap-arounds */
    	if (before(tp->frto_highmark, tp->snd_una))
    		tp->frto_highmark = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (tcp_ack_is_dubious(sk, flag)) {
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		/* Advance CWND, if state allows this. */
    
    		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
    		    tcp_may_raise_cwnd(sk, flag))
    
    			tcp_cong_avoid(sk, ack, prior_in_flight);
    
    		tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
    				      flag);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	} else {
    
    		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
    
    			tcp_cong_avoid(sk, ack, prior_in_flight);
    
    	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		dst_confirm(sk->sk_dst_cache);
    
    	return 1;
    
    no_queue:
    	/* If this ack opens up a zero window, clear backoff.  It was
    	 * being used to time the probes, and is probably far higher than
    	 * it needs to be for normal retransmission.
    	 */
    
    	if (tcp_send_head(sk))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_ack_probe(sk);
    	return 1;
    
    old_ack:
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
    
    		if (icsk->icsk_ca_state == TCP_CA_Open)
    			tcp_try_keep_open(sk);
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    uninteresting_ack:
    	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
    	return 0;
    }
    
    /* Look for tcp options. Normally only called on SYN and SYNACK packets.
     * But, this can also be called on packets in the established flow when
     * the fast version below fails.
     */
    
    void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
    		       int estab)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	unsigned char *ptr;
    
    	struct tcphdr *th = tcp_hdr(skb);
    
    	int length = (th->doff * 4) - sizeof(struct tcphdr);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	ptr = (unsigned char *)(th + 1);
    	opt_rx->saw_tstamp = 0;
    
    
    	while (length > 0) {
    
    		int opcode = *ptr++;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		int opsize;
    
    		switch (opcode) {
    
    		case TCPOPT_EOL:
    			return;
    		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
    			length--;
    			continue;
    		default:
    			opsize = *ptr++;
    			if (opsize < 2) /* "silly options" */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				return;
    
    			if (opsize > length)
    				return;	/* don't parse partial options */
    			switch (opcode) {
    			case TCPOPT_MSS:
    				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
    
    					u16 in_mss = get_unaligned_be16(ptr);
    
    					if (in_mss) {
    						if (opt_rx->user_mss &&
    						    opt_rx->user_mss < in_mss)
    							in_mss = opt_rx->user_mss;
    						opt_rx->mss_clamp = in_mss;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					}
    
    				}
    				break;
    			case TCPOPT_WINDOW:
    				if (opsize == TCPOLEN_WINDOW && th->syn &&
    				    !estab && sysctl_tcp_window_scaling) {
    					__u8 snd_wscale = *(__u8 *)ptr;
    					opt_rx->wscale_ok = 1;
    					if (snd_wscale > 14) {
    						if (net_ratelimit())
    							printk(KERN_INFO "tcp_parse_options: Illegal window "
    							       "scaling value %d >14 received.\n",
    							       snd_wscale);
    						snd_wscale = 14;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					}
    
    					opt_rx->snd_wscale = snd_wscale;
    				}
    				break;
    			case TCPOPT_TIMESTAMP:
    				if ((opsize == TCPOLEN_TIMESTAMP) &&
    				    ((estab && opt_rx->tstamp_ok) ||
    				     (!estab && sysctl_tcp_timestamps))) {
    					opt_rx->saw_tstamp = 1;
    
    					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
    					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
    
    				}
    				break;
    			case TCPOPT_SACK_PERM:
    				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
    				    !estab && sysctl_tcp_sack) {
    					opt_rx->sack_ok = 1;
    					tcp_sack_reset(opt_rx);
    				}
    				break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    			case TCPOPT_SACK:
    				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
    				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
    				   opt_rx->sack_ok) {
    					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
    				}
    				break;
    
    #ifdef CONFIG_TCP_MD5SIG
    
    			case TCPOPT_MD5SIG:
    				/*
    				 * The MD5 Hash has already been
    				 * checked (see tcp_v{4,6}_do_rcv()).
    				 */
    				break;
    
    			ptr += opsize-2;
    			length -= opsize;
    
    static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
    {
    	__be32 *ptr = (__be32 *)(th + 1);
    
    	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
    			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
    		tp->rx_opt.saw_tstamp = 1;
    		++ptr;
    		tp->rx_opt.rcv_tsval = ntohl(*ptr);
    		++ptr;
    		tp->rx_opt.rcv_tsecr = ntohl(*ptr);
    		return 1;
    	}
    	return 0;
    }
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* Fast parse options. This hopes to only see timestamps.
     * If it is wrong it falls back on tcp_parse_options().
     */
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
    				  struct tcp_sock *tp)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	if (th->doff == sizeof(struct tcphdr) >> 2) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->rx_opt.saw_tstamp = 0;
    		return 0;
    	} else if (tp->rx_opt.tstamp_ok &&
    		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
    
    		if (tcp_parse_aligned_timestamp(tp, th))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return 1;
    	}
    	tcp_parse_options(skb, &tp->rx_opt, 1);
    	return 1;
    }
    
    
    #ifdef CONFIG_TCP_MD5SIG
    /*
     * Parse MD5 Signature option
     *
     * Walks the option area that follows @th, i.e. the
     * (th->doff * 4 - sizeof(*th)) bytes after the fixed header.
     *
     * Returns a pointer to the byte just after the kind/length pair of the
     * first MD5 signature option, or NULL when no such option is found, when
     * EOL is reached first, or when an option (including the MD5 option
     * itself) has an invalid length.
     */
    u8 *tcp_parse_md5sig_option(struct tcphdr *th)
    {
    	int length = (th->doff << 2) - sizeof(*th);
    	u8 *ptr = (u8 *)(th + 1);

    	/* Stop once the remaining bytes cannot hold a complete MD5 option.
    	 * This also guarantees that the length byte read below still lies
    	 * inside the option area.
    	 */
    	while (length >= TCPOLEN_MD5SIG) {
    		int opcode = *ptr++;
    		int opsize;

    		switch (opcode) {
    		case TCPOPT_EOL:
    			return NULL;
    		case TCPOPT_NOP:
    			length--;
    			continue;
    		default:
    			opsize = *ptr++;
    			if (opsize < 2 || opsize > length)
    				return NULL;
    			/* Only hand back a full-sized option; a shorter one
    			 * would presumably let the caller read signature
    			 * bytes past the end of the option.
    			 */
    			if (opcode == TCPOPT_MD5SIG)
    				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
    		}
    		ptr += opsize - 2;
    		length -= opsize;
    	}
    	return NULL;
    }
    #endif
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static inline void tcp_store_ts_recent(struct tcp_sock *tp)
    {
    	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
    
    	tp->rx_opt.ts_recent_stamp = get_seconds();
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* Refresh ts_recent from the timestamp just received, provided the segment
     * carried a timestamp and starts at or before rcv_wup.
     */
    static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
    {
    	if (!tp->rx_opt.saw_tstamp)
    		return;

    	if (after(seq, tp->rcv_wup))
    		return;

    	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
    	 * extra check below makes sure this can only happen
    	 * for pure ACK frames.  -DaveM
    	 *
    	 * Not only, also it occurs for expired timestamps.
    	 *
    	 * So store the value only when it is not older than ts_recent,
    	 * or when ts_recent itself is at least TCP_PAWS_24DAYS old.
    	 */
    	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
    	    get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
    		tcp_store_ts_recent(tp);
    }
    
    /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
     *
     * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
     * it can pass through stack. So, the following predicate verifies that
     * this segment is not used for anything but congestion avoidance or
     * fast retransmit. Moreover, we even are able to eliminate most of such
     * second order effects, if we apply some small "replay" window (~RTO)
     * to timestamp space.
     *
     * All these measures still do not guarantee that we reject wrapped ACKs
     * on networks with high bandwidth, when sequence space is recycled quickly,
     * but it guarantees that such events will be very rare and do not affect
     * connection seriously. This doesn't look nice, but alas, PAWS is really
     * buggy extension.
     *
     * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
     * states that events when retransmit arrives after original data are rare.
     * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
     * the biggest problem on large power networks even with minor reordering.
     * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
     * up to bandwidth of 18Gigabit/sec. 8) ]
     */
    
    
    static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	struct tcphdr *th = tcp_hdr(skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	u32 seq = TCP_SKB_CB(skb)->seq;
    	u32 ack = TCP_SKB_CB(skb)->ack_seq;
    
    	return (/* 1. Pure ACK with correct sequence number. */
    		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
    
    		/* 2. ... and duplicate ACK. */
    		ack == tp->snd_una &&
    
    		/* 3. ... and does not update window. */
    		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
    
    		/* 4. ... and sits in replay window. */
    
    		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
    
    static inline int tcp_paws_discard(const struct sock *sk,
    				   const struct sk_buff *skb)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	const struct tcp_sock *tp = tcp_sk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
    
    		get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
    
    		!tcp_disordered_ack(sk, skb));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* Check segment sequence number for validity.
     *
     * Segment controls are considered valid, if the segment
     * fits to the window after truncation to the window. Acceptability
     * of data (and SYN, FIN, of course) is checked separately.
     * See tcp_data_queue(), for example.
     *
     * Also, controls (RST is main one) are accepted using RCV.WUP instead
     * of RCV.NXT. Peer still did not advance his SND.UNA when we
     * delayed ACK, so that his SND.UNA <= our RCV.WUP.
     * (borrowed from freebsd)
     */
    
    static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
    {
    	/* Segment ends before RCV.WUP: reject. */
    	if (before(end_seq, tp->rcv_wup))
    		return 0;

    	/* Segment starts beyond RCV.NXT plus the receive window: reject. */
    	if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
    		return 0;

    	return 1;
    }
    
    /* When we get a reset we do this. */
    static void tcp_reset(struct sock *sk)
    {
    	int err;

    	/* We want the right error as BSD sees it (and indeed as we do). */
    	switch (sk->sk_state) {
    	case TCP_CLOSE:
    		/* Nothing is reported or torn down in this state. */
    		return;
    	case TCP_SYN_SENT:
    		err = ECONNREFUSED;
    		break;
    	case TCP_CLOSE_WAIT:
    		err = EPIPE;
    		break;
    	default:
    		err = ECONNRESET;
    		break;
    	}
    	sk->sk_err = err;

    	/* Report the error unless the socket is flagged SOCK_DEAD. */
    	if (!sock_flag(sk, SOCK_DEAD))
    		sk->sk_error_report(sk);

    	tcp_done(sk);
    }
    
    /*
     * 	Process the FIN bit. This now behaves as it is supposed to work
     *	and the FIN takes effect when it is validly part of sequence
     *	space. Not before when we get holes.
     *
     *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
     *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
     *	TIME-WAIT)
     *
     *	If we are in FINWAIT-1, a received FIN indicates simultaneous
     *	close and we go into CLOSING (and later onto TIME-WAIT)
     *
     *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
     */
    static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	inet_csk_schedule_ack(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	sk->sk_shutdown |= RCV_SHUTDOWN;
    	sock_set_flag(sk, SOCK_DONE);
    
    	switch (sk->sk_state) {
    
    	case TCP_SYN_RECV:
    	case TCP_ESTABLISHED:
    		/* Move to CLOSE_WAIT */
    		tcp_set_state(sk, TCP_CLOSE_WAIT);
    		inet_csk(sk)->icsk_ack.pingpong = 1;
    		break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case TCP_CLOSE_WAIT:
    	case TCP_CLOSING:
    		/* Received a retransmission of the FIN, do
    		 * nothing.
    		 */
    		break;
    	case TCP_LAST_ACK:
    		/* RFC793: Remain in the LAST-ACK state. */
    		break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case TCP_FIN_WAIT1:
    		/* This case occurs when a simultaneous close
    		 * happens, we must ack the received FIN and
    		 * enter the CLOSING state.
    		 */
    		tcp_send_ack(sk);
    		tcp_set_state(sk, TCP_CLOSING);
    		break;
    	case TCP_FIN_WAIT2:
    		/* Received a FIN -- send ACK and enter TIME_WAIT. */
    		tcp_send_ack(sk);
    		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
    		break;
    	default:
    		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
    		 * cases we should never reach this piece of code.
    		 */
    		printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
    
    		       __func__, sk->sk_state);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* It _is_ possible, that we have something out-of-order _after_ FIN.
    	 * Probably, we should reset in this case. For now drop them.
    	 */
    	__skb_queue_purge(&tp->out_of_order_queue);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_sack_reset(&tp->rx_opt);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (!sock_flag(sk, SOCK_DEAD)) {
    		sk->sk_state_change(sk);
    
    		/* Do not send POLL_HUP for half duplex close. */
    		if (sk->sk_shutdown == SHUTDOWN_MASK ||
    		    sk->sk_state == TCP_CLOSE)
    
    			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		else
    
    			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    
    /* Grow SACK block @sp so that it also covers [seq, end_seq), provided the
     * two ranges overlap or touch.
     *
     * Returns 1 when @sp absorbed the range (possibly unchanged), 0 when the
     * ranges are disjoint and @sp was left alone.
     */
    static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
    				  u32 end_seq)
    {
    	/* New range lies entirely after, or entirely before, the block. */
    	if (after(seq, sp->end_seq) || after(sp->start_seq, end_seq))
    		return 0;

    	if (before(seq, sp->start_seq))
    		sp->start_seq = seq;
    	if (after(end_seq, sp->end_seq))
    		sp->end_seq = end_seq;

    	return 1;
    }
    
    
    static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (before(seq, tp->rcv_nxt))
    
    			mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		else
    
    			mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
    
    
    		NET_INC_STATS_BH(sock_net(sk), mib_idx);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		tp->rx_opt.dsack = 1;
    		tp->duplicate_sack[0].start_seq = seq;
    		tp->duplicate_sack[0].end_seq = end_seq;
    
    		tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1;
    
    /* Account [seq, end_seq) as duplicate data: widen the pending D-SACK block
     * if one is already set, otherwise create it.
     */
    static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
    {
    	struct tcp_sock *tp = tcp_sk(sk);

    	if (tp->rx_opt.dsack) {
    		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
    		return;
    	}

    	tcp_dsack_set(sk, seq, end_seq);
    }
    
    static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
    	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    
    		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
    
    		tcp_enter_quickack_mode(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    
    			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
    				end_seq = tp->rcv_nxt;
    
    			tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    	}
    
    	tcp_send_ack(sk);
    }
    
    /* These routines update the SACK block as out-of-order packets arrive or
     * in-order packets close up the sequence space.
     */
    static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
    {
    	struct tcp_sack_block *head = &tp->selective_acks[0];
    	int idx = 1;

    	/* The first SACK block may just have grown; fold into it every other
    	 * block it now overlaps or touches.
    	 */
    	while (idx < tp->rx_opt.num_sacks) {
    		struct tcp_sack_block *other = &head[idx];
    		int k;

    		if (!tcp_sack_extend(head, other->start_seq, other->end_seq)) {
    			idx++;
    			continue;
    		}

    		/* Block idx was absorbed: drop it by shifting the later
    		 * blocks up one slot, then re-examine the same index.
    		 */
    		tp->rx_opt.num_sacks--;
    		tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
    				       tp->rx_opt.dsack;
    		for (k = idx; k < tp->rx_opt.num_sacks; k++)
    			head[k] = head[k + 1];
    	}
    }
    
    
    static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
    				 struct tcp_sack_block *sack2)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	__u32 tmp;
    
    	tmp = sack1->start_seq;
    	sack1->start_seq = sack2->start_seq;
    	sack2->start_seq = tmp;
    
    	tmp = sack1->end_seq;
    	sack1->end_seq = sack2->end_seq;
    	sack2->end_seq = tmp;
    }
    
    /* Record [seq, end_seq) in the SACK block array: merge it into an existing
     * block when it overlaps or touches one, otherwise insert it as a new block.
     * Either way the affected block ends up in slot 0.
     */
    static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct tcp_sack_block *blk = &tp->selective_acks[0];
    	int nr_sacks = tp->rx_opt.num_sacks;
    	int idx;

    	if (nr_sacks) {
    		for (idx = 0; idx < nr_sacks; idx++, blk++) {
    			if (!tcp_sack_extend(blk, seq, end_seq))
    				continue;

    			/* Rotate the extended block to the first slot. */
    			while (idx > 0) {
    				tcp_sack_swap(blk, blk - 1);
    				idx--;
    				blk--;
    			}

    			if (nr_sacks > 1)
    				tcp_sack_maybe_coalesce(tp);
    			return;
    		}

    		/* No existing block could take the range, so a new one goes
    		 * in front. If the array is full, forget about the last one.
    		 */
    		if (idx >= TCP_NUM_SACKS) {
    			idx--;
    			tp->rx_opt.num_sacks--;
    			blk--;
    		}

    		/* Shift every remaining block down one slot. */
    		while (idx > 0) {
    			*blk = *(blk - 1);
    			idx--;
    			blk--;
    		}
    	}

    	/* Build the new head SACK, and we're done. */
    	blk->start_seq = seq;
    	blk->end_seq = end_seq;
    	tp->rx_opt.num_sacks++;
    	tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
    }
    
    /* RCV.NXT advances, some SACKs should be eaten. */
    
    static void tcp_sack_remove(struct tcp_sock *tp)
    {
    	struct tcp_sack_block *sp = &tp->selective_acks[0];
    	int num_sacks = tp->rx_opt.num_sacks;
    	int this_sack;
    
    	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
    
    	if (skb_queue_empty(&tp->out_of_order_queue)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tp->rx_opt.num_sacks = 0;
    		tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
    		return;
    	}
    
    
    	for (this_sack = 0; this_sack < num_sacks;) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Check if the start of the sack is covered by RCV.NXT. */
    		if (!before(tp->rcv_nxt, sp->start_seq)) {
    			int i;
    
    			/* RCV.NXT must cover all the block! */
    
    			WARN_ON(before(tp->rcv_nxt, sp->end_seq));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    			/* Zap this SACK, by moving forward any other SACKS. */
    			for (i=this_sack+1; i < num_sacks; i++)
    				tp->selective_acks[i-1] = tp->selective_acks[i];
    			num_sacks--;
    			continue;
    		}
    		this_sack++;
    		sp++;
    	}
    	if (num_sacks != tp->rx_opt.num_sacks) {
    		tp->rx_opt.num_sacks = num_sacks;
    
    		tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
    				       tp->rx_opt.dsack;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    }
    
    /* This one checks to see if we can put data from the
     * out_of_order queue into the receive_queue.
     *
     * The head of tp->out_of_order_queue is examined repeatedly for as long
     * as it does not start after tp->rcv_nxt:
     *  - an skb that ends at or before tp->rcv_nxt is unlinked and freed;
     *  - any other skb is unlinked, appended to sk->sk_receive_queue and
     *    tp->rcv_nxt is set to its end_seq; if its TCP header has FIN set,
     *    tcp_fin() is called.
     * Before either step, an skb starting before dsack_high is reported
     * through tcp_dsack_extend().
     *
     * NOTE(review): the remainder of this function is not visible in this
     * excerpt; confirm the loop and function tail against the full file.
     */
    static void tcp_ofo_queue(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	/* Starts at the rcv_nxt value seen on entry; only ever lowered below. */
    	__u32 dsack_high = tp->rcv_nxt;
    	struct sk_buff *skb;
    
    	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
    		/* Head skb begins beyond rcv_nxt: nothing more to move. */
    		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
    			break;
    
    		/* The skb begins before dsack_high.  The range passed to
    		 * tcp_dsack_extend() ends at the old dsack_high; dsack_high
    		 * itself is lowered to end_seq when the skb ends before it.
    		 */
    		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
    			__u32 dsack = dsack_high;
    			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
    				dsack_high = TCP_SKB_CB(skb)->end_seq;
    
    			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
    			SOCK_DEBUG(sk, "ofo packet was already received \n");
    
    David S. Miller's avatar
    David S. Miller committed
    			__skb_unlink(skb, &tp->out_of_order_queue);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			__kfree_skb(skb);
    			continue;
    		}
    		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
    			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
    			   TCP_SKB_CB(skb)->end_seq);
    
    
    David S. Miller's avatar
    David S. Miller committed
    		__skb_unlink(skb, &tp->out_of_order_queue);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		__skb_queue_tail(&sk->sk_receive_queue, skb);
    		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    
    		if (tcp_hdr(skb)->fin)
    			tcp_fin(skb, sk, tcp_hdr(skb));
    
    /* Forward declarations: both are called by tcp_try_rmem_schedule()
     * before their definitions appear.
     */
    static int tcp_prune_ofo_queue(struct sock *sk);
    static int tcp_prune_queue(struct sock *sk);
    
    
    /* Try to get sk_rmem_schedule() to accept @size for @sk, pruning queues
     * if needed.
     *
     * Returns 0 when sk_rmem_schedule() returns non-zero, either directly
     * (with sk_rmem_alloc not above sk_rcvbuf) or after tcp_prune_queue()
     * and, as a last resort, tcp_prune_ofo_queue().  Returns -1 when
     * tcp_prune_queue() returns a negative value, when tcp_prune_ofo_queue()
     * returns zero, or when the final sk_rmem_schedule() attempt still
     * returns zero.
     */
    static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
    {
    	/* No pruning needed: within sk_rcvbuf and the request went through. */
    	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
    	    sk_rmem_schedule(sk, size))
    		return 0;

    	/* First resort: the general prune. */
    	if (tcp_prune_queue(sk) < 0)
    		return -1;
    	if (sk_rmem_schedule(sk, size))
    		return 0;

    	/* Last resort: prune the out-of-order queue, then retry once. */
    	if (!tcp_prune_ofo_queue(sk))
    		return -1;

    	return sk_rmem_schedule(sk, size) ? 0 : -1;
    }
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
    {
    
    	struct tcphdr *th = tcp_hdr(skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct tcp_sock *tp = tcp_sk(sk);
    	int eaten = -1;
    
    	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
    		goto drop;
    
    
    	__skb_pull(skb, th->doff * 4);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	TCP_ECN_accept_cwr(tp, skb);
    
    	if (tp->rx_opt.dsack) {
    		tp->rx_opt.dsack = 0;
    
    		tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	/*  Queue data for delivery to the user.
    	 *  Packets in sequence go to the receive queue.
    	 *  Out of sequence packets to the out_of_order_queue.
    	 */
    	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {