/*
 * net/ipv4/tcp_input.c (excerpt) -- TCP input processing.
 *
 * NOTE(review): the lines above/below in the original capture were web
 * repository-viewer chrome ("blame" attribution rows), not source code.
 * This chunk begins mid-function; the enclosing function's head is not
 * part of the visible excerpt.
 */
    	/* Collapsing did not help, destructive actions follow.
    	 * This must not ever occur. */
    
    	/* First, purge the out_of_order queue. */
    
    	if (!skb_queue_empty(&tp->out_of_order_queue)) {
    		NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		__skb_queue_purge(&tp->out_of_order_queue);
    
    		/* Reset SACK state.  A conforming SACK implementation will
    		 * do the same at a timeout based retransmit.  When a connection
    		 * is in a sad state like this, we care only about integrity
    		 * of the connection not performance.
    		 */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tcp_sack_reset(&tp->rx_opt);
    		sk_stream_mem_reclaim(sk);
    	}
    
    	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    		return 0;
    
    	/* If we are really being abused, tell the caller to silently
    	 * drop receive data on the floor.  It will get retransmitted
    	 * and hopefully then we'll have sufficient space.
    	 */
    	NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
    
    	/* Massive buffer overcommit. */
    	tp->pred_flags = 0;
    	return -1;
    }
    
    
    /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
     * As additional protections, we do not touch cwnd in retransmission phases,
     * and if application hit its sndbuf limit recently.
     */
    void tcp_cwnd_application_limited(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
    		/* Limited by application or receiver window. */
    
    		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
    		u32 win_used = max(tp->snd_cwnd_used, init_win);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (win_used < tp->snd_cwnd) {
    
    			tp->snd_ssthresh = tcp_current_ssthresh(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
    		}
    		tp->snd_cwnd_used = 0;
    	}
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    }
    
    
    static int tcp_should_expand_sndbuf(struct sock *sk)
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	/* If the user specified a specific send buffer setting, do
    	 * not modify it.
    	 */
    	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
    		return 0;
    
    	/* If we are under global TCP memory pressure, do not expand.  */
    	if (tcp_memory_pressure)
    		return 0;
    
    	/* If we are under soft global TCP memory pressure, do not expand.  */
    	if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
    		return 0;
    
    	/* If we filled the congestion window, do not expand.  */
    	if (tp->packets_out >= tp->snd_cwnd)
    		return 0;
    
    	return 1;
    }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* When incoming ACK allowed to free some skb from write_queue,
     * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
     * on the exit from tcp input handler.
     *
     * PROBLEM: sndbuf expansion does not work well with largesend.
     */
    static void tcp_new_space(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	if (tcp_should_expand_sndbuf(sk)) {
    
    		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
    		    demanded = max_t(unsigned int, tp->snd_cwnd,
    						   tp->reordering + 1);
    		sndmem *= 2*demanded;
    		if (sndmem > sk->sk_sndbuf)
    			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
    		tp->snd_cwnd_stamp = tcp_time_stamp;
    	}
    
    	sk->sk_write_space(sk);
    }
    
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static void tcp_check_space(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
    		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
    		if (sk->sk_socket &&
    		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
    			tcp_new_space(sk);
    	}
    }
    
    
    static inline void tcp_data_snd_check(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	tcp_push_pending_frames(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tcp_check_space(sk);
    }
    
    /*
     * Check if sending an ack is needed.
     */
    static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	    /* More than one full frame received... */
    
    	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	     /* ... and right edge of window advances far enough.
    	      * (tcp_recvmsg() will send ACK otherwise). Or...
    	      */
    	     && __tcp_select_window(sk) >= tp->rcv_wnd) ||
    	    /* We ACK each frame or... */
    
    	    tcp_in_quickack_mode(sk) ||
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	    /* We have out of order data. */
    	    (ofo_possible &&
    	     skb_peek(&tp->out_of_order_queue))) {
    		/* Then ack it now */
    		tcp_send_ack(sk);
    	} else {
    		/* Else, send delayed ack. */
    		tcp_send_delayed_ack(sk);
    	}
    }
    
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static inline void tcp_ack_snd_check(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	if (!inet_csk_ack_scheduled(sk)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* We sent a data segment already. */
    		return;
    	}
    	__tcp_ack_snd_check(sk, 1);
    }
    
    /*
     *	This routine is only called when we have urgent data
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
     *	signaled. Its the 'slow' part of tcp_urg. It could be
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *	moved inline now as tcp_urg is only called from one
     *	place. We handle URGent data wrong. We have to - as
     *	BSD still doesn't use the correction from RFC961.
     *	For 1003.1g we should support a new option TCP_STDURG to permit
     *	either form (or just set the sysctl tcp_stdurg).
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 ptr = ntohs(th->urg_ptr);
    
    	if (ptr && !sysctl_tcp_stdurg)
    		ptr--;
    	ptr += ntohl(th->seq);
    
    	/* Ignore urgent data that we've already seen and read. */
    	if (after(tp->copied_seq, ptr))
    		return;
    
    	/* Do not replay urg ptr.
    	 *
    	 * NOTE: interesting situation not covered by specs.
    	 * Misbehaving sender may send urg ptr, pointing to segment,
    	 * which we already have in ofo queue. We are not able to fetch
    	 * such data and will stay in TCP_URG_NOTYET until will be eaten
    	 * by recvmsg(). Seems, we are not obliged to handle such wicked
    	 * situations. But it is worth to think about possibility of some
    	 * DoSes using some hypothetical application level deadlock.
    	 */
    	if (before(ptr, tp->rcv_nxt))
    		return;
    
    	/* Do we already have a newer (or duplicate) urgent pointer? */
    	if (tp->urg_data && !after(ptr, tp->urg_seq))
    		return;
    
    	/* Tell the world about our new urgent pointer. */
    	sk_send_sigurg(sk);
    
    	/* We may be adding urgent data when the last byte read was
    	 * urgent. To do this requires some care. We cannot just ignore
    	 * tp->copied_seq since we would read the last urgent byte again
    	 * as data, nor can we alter copied_seq until this data arrives
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 *
    	 * NOTE. Double Dutch. Rendering to plain English: author of comment
    	 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
    	 * and expect that both A and B disappear from stream. This is _wrong_.
    	 * Though this happens in BSD with high probability, this is occasional.
    	 * Any application relying on this is buggy. Note also, that fix "works"
    	 * only in this artificial test. Insert some normal data between A and B and we will
    	 * decline of BSD again. Verdict: it is better to remove to trap
    	 * buggy users.
    	 */
    	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
    	    !sock_flag(sk, SOCK_URGINLINE) &&
    	    tp->copied_seq != tp->rcv_nxt) {
    		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
    		tp->copied_seq++;
    		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
    
    David S. Miller's avatar
    David S. Miller committed
    			__skb_unlink(skb, &sk->sk_receive_queue);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			__kfree_skb(skb);
    		}
    	}
    
    	tp->urg_data   = TCP_URG_NOTYET;
    	tp->urg_seq    = ptr;
    
    	/* Disable header prediction. */
    	tp->pred_flags = 0;
    }
    
    /* This is the 'fast' part of urgent handling. */
    static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* Check if we get a new urgent pointer - normally not. */
    	if (th->urg)
    		tcp_check_urg(sk,th);
    
    	/* Do we wait for any urgent data? - normally not... */
    	if (tp->urg_data == TCP_URG_NOTYET) {
    		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
    			  th->syn;
    
    
    		/* Is the urgent pointer pointing into this packet? */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (ptr < skb->len) {
    			u8 tmp;
    			if (skb_copy_bits(skb, ptr, &tmp, 1))
    				BUG();
    			tp->urg_data = TCP_URG_VALID | tmp;
    			if (!sock_flag(sk, SOCK_DEAD))
    				sk->sk_data_ready(sk, 0);
    		}
    	}
    }
    
    static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int chunk = skb->len - hlen;
    	int err;
    
    	local_bh_enable();
    
    	if (skb_csum_unnecessary(skb))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
    	else
    		err = skb_copy_and_csum_datagram_iovec(skb, hlen,
    						       tp->ucopy.iov);
    
    	if (!err) {
    		tp->ucopy.len -= chunk;
    		tp->copied_seq += chunk;
    		tcp_rcv_space_adjust(sk);
    	}
    
    	local_bh_disable();
    	return err;
    }
    
    
    static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (sock_owned_by_user(sk)) {
    		local_bh_enable();
    		result = __tcp_checksum_complete(skb);
    		local_bh_disable();
    	} else {
    		result = __tcp_checksum_complete(skb);
    	}
    	return result;
    }
    
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	return !skb_csum_unnecessary(skb) &&
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		__tcp_checksum_complete_user(sk, skb);
    }
    
    
    #ifdef CONFIG_NET_DMA
    static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int chunk = skb->len - hlen;
    	int dma_cookie;
    	int copied_early = 0;
    
    	if (tp->ucopy.wakeup)
    
    
    	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
    		tp->ucopy.dma_chan = get_softnet_dma();
    
    
    	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
    
    
    		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
    			skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
    
    		if (dma_cookie < 0)
    			goto out;
    
    		tp->ucopy.dma_cookie = dma_cookie;
    		copied_early = 1;
    
    		tp->ucopy.len -= chunk;
    		tp->copied_seq += chunk;
    		tcp_rcv_space_adjust(sk);
    
    		if ((tp->ucopy.len == 0) ||
    
    		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
    
    		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
    			tp->ucopy.wakeup = 1;
    			sk->sk_data_ready(sk, 0);
    		}
    	} else if (chunk > 0) {
    		tp->ucopy.wakeup = 1;
    		sk->sk_data_ready(sk, 0);
    	}
    out:
    	return copied_early;
    }
    #endif /* CONFIG_NET_DMA */
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
    
     *	TCP receive function for the ESTABLISHED state.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *
    
     *	It is split into a fast path and a slow path. The fast path is
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     * 	disabled when:
     *	- A zero window was announced from us - zero window probing
    
     *        is only handled properly in the slow path.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *	- Out of order segments arrived.
     *	- Urgent data is expected.
     *	- There is no buffer space left
     *	- Unexpected TCP flags/window values/header lengths are received
    
     *	  (detected by checking the TCP header against pred_flags)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *	- Data is sent in both directions. Fast path only supports pure senders
     *	  or pure receivers (this means either the sequence number or the ack
     *	  value must stay constant)
     *	- Unexpected TCP option.
     *
    
     *	When these conditions are not satisfied it drops into a standard
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *	receive procedure patterned after RFC793 to handle all cases.
     *	The first three cases are guaranteed by proper pred_flags setting,
    
     *	the rest is checked inline. Fast processing is turned on in
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *	tcp_data_queue when everything is OK.
     */
    int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
    			struct tcphdr *th, unsigned len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/*
    	 *	Header prediction.
    
    	 *	The code loosely follows the one in the famous
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 *	"30 instruction TCP receive" Van Jacobson mail.
    
    	 *
    	 *	Van's trick is to deposit buffers into socket queue
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 *	on a device interrupt, to call tcp_recv function
    	 *	on the receive process context and checksum and copy
    	 *	the buffer to user space. smart...
    	 *
    
    	 *	Our current scheme is not silly either but we take the
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 *	extra cost of the net_bh soft interrupt processing...
    	 *	We do checksum and copy also but from device to kernel.
    	 */
    
    	tp->rx_opt.saw_tstamp = 0;
    
    	/*	pred_flags is 0xS?10 << 16 + snd_wnd
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	 *	if header_prediction is to be made
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 *	'S' will always be tp->tcp_header_len >> 2
    	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
    
    	 *  turn it off	(when there are holes in the receive
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 *	 space for instance)
    	 *	PSH flag is ignored.
    	 */
    
    	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
    		TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
    		int tcp_header_len = tp->tcp_header_len;
    
    		/* Timestamp header prediction: tcp_header_len
    		 * is automatically equal to th->doff*4 due to pred_flags
    		 * match.
    		 */
    
    		/* Check timestamp */
    		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
    
    			__be32 *ptr = (__be32 *)(th + 1);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    			/* No? Slow path! */
    
    			if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
    				goto slow_path;
    
    			tp->rx_opt.saw_tstamp = 1;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tp->rx_opt.rcv_tsval = ntohl(*ptr);
    			++ptr;
    			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
    
    			/* If PAWS failed, check it more carefully in slow path */
    			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
    				goto slow_path;
    
    			/* DO NOT update ts_recent here, if checksum fails
    			 * and timestamp was corrupted part, it will result
    			 * in a hung connection since we will drop all
    			 * future packets due to the PAWS test.
    			 */
    		}
    
    		if (len <= tcp_header_len) {
    			/* Bulk data transfer: sender */
    			if (len == tcp_header_len) {
    				/* Predicted packet is in window by definition.
    				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    				 * Hence, check seq<=rcv_wup reduces to:
    				 */
    				if (tcp_header_len ==
    				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
    				    tp->rcv_nxt == tp->rcv_wup)
    					tcp_store_ts_recent(tp);
    
    				/* We know that such packets are checksummed
    				 * on entry.
    				 */
    				tcp_ack(sk, skb, 0);
    
    				__kfree_skb(skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				return 0;
    			} else { /* Header too small */
    				TCP_INC_STATS_BH(TCP_MIB_INERRS);
    				goto discard;
    			}
    		} else {
    			int eaten = 0;
    
    			int copied_early = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    			if (tp->copied_seq == tp->rcv_nxt &&
    			    len - tcp_header_len <= tp->ucopy.len) {
    #ifdef CONFIG_NET_DMA
    				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
    					copied_early = 1;
    					eaten = 1;
    				}
    #endif
    				if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
    					__set_current_state(TASK_RUNNING);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    					if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
    						eaten = 1;
    				}
    				if (eaten) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					/* Predicted packet is in window by definition.
    					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    					 * Hence, check seq<=rcv_wup reduces to:
    					 */
    					if (tcp_header_len ==
    					    (sizeof(struct tcphdr) +
    					     TCPOLEN_TSTAMP_ALIGNED) &&
    					    tp->rcv_nxt == tp->rcv_wup)
    						tcp_store_ts_recent(tp);
    
    
    					tcp_rcv_rtt_measure_ts(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    					__skb_pull(skb, tcp_header_len);
    					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    					NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
    				}
    
    				if (copied_early)
    					tcp_cleanup_rbuf(sk, skb->len);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			}
    			if (!eaten) {
    				if (tcp_checksum_complete_user(sk, skb))
    					goto csum_error;
    
    				/* Predicted packet is in window by definition.
    				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    				 * Hence, check seq<=rcv_wup reduces to:
    				 */
    				if (tcp_header_len ==
    				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
    				    tp->rcv_nxt == tp->rcv_wup)
    					tcp_store_ts_recent(tp);
    
    
    				tcp_rcv_rtt_measure_ts(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    				if ((int)skb->truesize > sk->sk_forward_alloc)
    					goto step5;
    
    				NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
    
    				/* Bulk data transfer: receiver */
    				__skb_pull(skb,tcp_header_len);
    				__skb_queue_tail(&sk->sk_receive_queue, skb);
    				sk_stream_set_owner_r(skb, sk);
    				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    			}
    
    
    			tcp_event_data_recv(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
    				/* Well, only one small jumplet in fast path... */
    				tcp_ack(sk, skb, FLAG_DATA);
    
    				if (!inet_csk_ack_scheduled(sk))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					goto no_ack;
    			}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    no_ack:
    
    #ifdef CONFIG_NET_DMA
    			if (copied_early)
    				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
    			else
    #endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			if (eaten)
    				__kfree_skb(skb);
    			else
    				sk->sk_data_ready(sk, 0);
    			return 0;
    		}
    	}
    
    slow_path:
    	if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
    		goto csum_error;
    
    	/*
    	 * RFC1323: H1. Apply PAWS check first.
    	 */
    	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
    
    	    tcp_paws_discard(sk, skb)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!th->rst) {
    			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
    			tcp_send_dupack(sk, skb);
    			goto discard;
    		}
    		/* Resets are accepted even if PAWS failed.
    
    		   ts_recent update must be made after we are sure
    		   that the packet is in window.
    		 */
    	}
    
    	/*
    	 *	Standard slow path.
    	 */
    
    	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
    		/* RFC793, page 37: "In all states except SYN-SENT, all reset
    		 * (RST) segments are validated by checking their SEQ-fields."
    		 * And page 69: "If an incoming segment is not acceptable,
    		 * an acknowledgment should be sent in reply (unless the RST bit
    		 * is set, if so drop the segment and return)".
    		 */
    		if (!th->rst)
    			tcp_send_dupack(sk, skb);
    		goto discard;
    	}
    
    
    	if (th->rst) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_reset(sk);
    		goto discard;
    	}
    
    	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
    
    	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    		TCP_INC_STATS_BH(TCP_MIB_INERRS);
    		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
    		tcp_reset(sk);
    		return 1;
    	}
    
    step5:
    
    	if (th->ack)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_ack(sk, skb, FLAG_SLOWPATH);
    
    
    	tcp_rcv_rtt_measure_ts(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* Process urgent data. */
    	tcp_urg(sk, skb, th);
    
    	/* step 7: process the segment text */
    	tcp_data_queue(sk, skb);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tcp_ack_snd_check(sk);
    	return 0;
    
    csum_error:
    	TCP_INC_STATS_BH(TCP_MIB_INERRS);
    
    discard:
    	__kfree_skb(skb);
    	return 0;
    }
    
    static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
    					 struct tcphdr *th, unsigned len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	int saved_clamp = tp->rx_opt.mss_clamp;
    
    	tcp_parse_options(skb, &tp->rx_opt, 0);
    
    	if (th->ack) {
    		/* rfc793:
    		 * "If the state is SYN-SENT then
    		 *    first check the ACK bit
    		 *      If the ACK bit is set
    		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
    		 *        a reset (unless the RST bit is set, if so drop
    		 *        the segment and return)"
    		 *
    		 *  We do not send data with SYN, so that RFC-correct
    		 *  test reduces to:
    		 */
    		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
    			goto reset_and_undo;
    
    		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
    		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
    			     tcp_time_stamp)) {
    			NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED);
    			goto reset_and_undo;
    		}
    
    		/* Now ACK is acceptable.
    		 *
    		 * "If the RST bit is set
    		 *    If the ACK was acceptable then signal the user "error:
    		 *    connection reset", drop the segment, enter CLOSED state,
    		 *    delete TCB, and return."
    		 */
    
    		if (th->rst) {
    			tcp_reset(sk);
    			goto discard;
    		}
    
    		/* rfc793:
    		 *   "fifth, if neither of the SYN or RST bits is set then
    		 *    drop the segment and return."
    		 *
    		 *    See note below!
    		 *                                        --ANK(990513)
    		 */
    		if (!th->syn)
    			goto discard_and_undo;
    
    		/* rfc793:
    		 *   "If the SYN bit is on ...
    		 *    are acceptable then ...
    		 *    (our SYN has been ACKed), change the connection
    		 *    state to ESTABLISHED..."
    		 */
    
    		TCP_ECN_rcv_synack(tp, th);
    
    		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
    		tcp_ack(sk, skb, FLAG_SLOWPATH);
    
    		/* Ok.. it's good. Set up sequence numbers and
    		 * move to established.
    		 */
    		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
    		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
    
    		/* RFC1323: The window in SYN & SYN/ACK segments is
    		 * never scaled.
    		 */
    		tp->snd_wnd = ntohs(th->window);
    		tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
    
    		if (!tp->rx_opt.wscale_ok) {
    			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
    			tp->window_clamp = min(tp->window_clamp, 65535U);
    		}
    
    		if (tp->rx_opt.saw_tstamp) {
    			tp->rx_opt.tstamp_ok	   = 1;
    			tp->tcp_header_len =
    				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
    			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
    			tcp_store_ts_recent(tp);
    		} else {
    			tp->tcp_header_len = sizeof(struct tcphdr);
    		}
    
    
    		if (tcp_is_sack(tp) && sysctl_tcp_fack)
    			tcp_enable_fack(tp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    John Heffner's avatar
    John Heffner committed
    		tcp_mtup_init(sk);
    
    		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_initialize_rcv_mss(sk);
    
    		/* Remember, tcp_poll() does not lock socket!
    		 * Change state from SYN-SENT only after copied_seq
    		 * is initialized. */
    		tp->copied_seq = tp->rcv_nxt;
    
    		smp_mb();
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_set_state(sk, TCP_ESTABLISHED);
    
    
    		security_inet_conn_established(sk, skb);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Make sure socket is routed, for correct metrics.  */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		tcp_init_metrics(sk);
    
    
    		tcp_init_congestion_control(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Prevent spurious tcp_cwnd_restart() on first data
    		 * packet.
    		 */
    		tp->lsndtime = tcp_time_stamp;
    
    		tcp_init_buffer_space(sk);
    
    		if (sock_flag(sk, SOCK_KEEPOPEN))
    
    			inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		if (!tp->rx_opt.snd_wscale)
    			__tcp_fast_path_on(tp, tp->snd_wnd);
    		else
    			tp->pred_flags = 0;
    
    		if (!sock_flag(sk, SOCK_DEAD)) {
    			sk->sk_state_change(sk);
    			sk_wake_async(sk, 0, POLL_OUT);
    		}
    
    
    		if (sk->sk_write_pending ||
    		    icsk->icsk_accept_queue.rskq_defer_accept ||
    		    icsk->icsk_ack.pingpong) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			/* Save one ACK. Data will be ready after
    			 * several ticks, if write_pending is set.
    			 *
    			 * It may be deleted, but with this feature tcpdumps
    			 * look so _wonderfully_ clever, that I was not able
    			 * to stand against the temptation 8)     --ANK
    			 */
    
    			inet_csk_schedule_ack(sk);
    
    			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
    			icsk->icsk_ack.ato	 = TCP_ATO_MIN;
    
    			tcp_incr_quickack(sk);
    			tcp_enter_quickack_mode(sk);
    
    			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
    						  TCP_DELACK_MAX, TCP_RTO_MAX);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    discard:
    			__kfree_skb(skb);
    			return 0;
    		} else {
    			tcp_send_ack(sk);
    		}
    		return -1;
    	}
    
    	/* No ACK in the segment */
    
    	if (th->rst) {
    		/* rfc793:
    		 * "If the RST bit is set
    		 *
    		 *      Otherwise (no ACK) drop the segment and return."
    		 */
    
    		goto discard_and_undo;
    	}
    
    	/* PAWS check. */
    	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
    		goto discard_and_undo;
    
    	if (th->syn) {
    		/* We see SYN without ACK. It is attempt of
    		 * simultaneous connect with crossed SYNs.
    		 * Particularly, it can be connect to self.
    		 */
    		tcp_set_state(sk, TCP_SYN_RECV);
    
    		if (tp->rx_opt.saw_tstamp) {
    			tp->rx_opt.tstamp_ok = 1;
    			tcp_store_ts_recent(tp);
    			tp->tcp_header_len =
    				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
    		} else {
    			tp->tcp_header_len = sizeof(struct tcphdr);
    		}
    
    		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
    		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
    
    		/* RFC1323: The window in SYN & SYN/ACK segments is
    		 * never scaled.
    		 */
    		tp->snd_wnd    = ntohs(th->window);
    		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
    		tp->max_window = tp->snd_wnd;
    
    		TCP_ECN_rcv_syn(tp, th);
    
    
    John Heffner's avatar
    John Heffner committed
    		tcp_mtup_init(sk);
    
    		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tcp_initialize_rcv_mss(sk);
    
    
    		tcp_send_synack(sk);
    #if 0
    		/* Note, we could accept data and URG from this segment.
    		 * There are no obstacles to make this.
    		 *
    		 * However, if we ignore data in ACKless segments sometimes,
    		 * we have no reasons to accept it sometimes.
    		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
    		 * is not flawless. So, discard packet for sanity.
    		 * Uncomment this return to process the data.
    		 */
    		return -1;
    #else
    		goto discard;
    #endif
    	}
    	/* "fifth, if neither of the SYN or RST bits is set then
    	 * drop the segment and return."
    	 */
    
    discard_and_undo:
    	tcp_clear_options(&tp->rx_opt);
    	tp->rx_opt.mss_clamp = saved_clamp;
    	goto discard;
    
    reset_and_undo:
    	tcp_clear_options(&tp->rx_opt);
    	tp->rx_opt.mss_clamp = saved_clamp;
    	return 1;
    }
    
    
    /*
     *	This function implements the receiving procedure of RFC 793 for
    
     *	all states except ESTABLISHED and TIME_WAIT.
    
     *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
     *	address independent.
     */
    
    int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
    			  struct tcphdr *th, unsigned len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    	int queued = 0;
    
    	tp->rx_opt.saw_tstamp = 0;
    
    	switch (sk->sk_state) {
    	case TCP_CLOSE:
    		goto discard;
    
    	case TCP_LISTEN:
    
    		if (th->ack)
    
    			return 1;
    
    
    		if (th->rst)
    
    			goto discard;
    
    
    		if (th->syn) {
    
    			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
    
    				return 1;
    
    
    			/* Now we have several options: In theory there is
    			 * nothing else in the frame. KA9Q has an option to
    
    			 * send data with the syn, BSD accepts data with the
    
    			 * syn up to the [to be] advertised window and
    			 * Solaris 2.1 gives you a protocol error. For now
    			 * we just ignore it, that fits the spec precisely
    
    			 * and avoids incompatibilities. It would be nice in
    			 * future to drop through and process the data.
    			 *
    
    			 * Now that TTCP is starting to be used we ought to
    
    			 * queue this data.
    			 * But, this leaves one open to an easy denial of
    
    			 * service attack, and SYN cookies can't defend
    
    			 * against this problem. So, we drop the data
    
    			 * in the interest of security over speed unless
    			 * it's still in use.
    
    			 */
    
    			kfree_skb(skb);
    			return 0;
    
    		}
    		goto discard;
    
    	case TCP_SYN_SENT:
    		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
    		if (queued >= 0)
    			return queued;
    
    		/* Do step6 onward by hand. */
    		tcp_urg(sk, skb, th);
    		__kfree_skb(skb);
    
    		return 0;
    	}
    
    	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
    
    	    tcp_paws_discard(sk, skb)) {
    
    		if (!th->rst) {
    			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
    			tcp_send_dupack(sk, skb);
    			goto discard;
    		}
    		/* Reset is accepted even if it did not pass PAWS. */
    	}
    
    	/* step 1: check sequence number */
    	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
    		if (!th->rst)
    			tcp_send_dupack(sk, skb);
    		goto discard;
    	}
    
    	/* step 2: check RST bit */
    
    	if (th->rst) {
    
    		tcp_reset(sk);
    		goto discard;
    	}
    
    	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
    
    	/* step 3: check security and precedence [ignored] */
    
    	/*	step 4:
    	 *
    	 *	Check for a SYN in window.
    	 */
    	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
    		tcp_reset(sk);
    		return 1;
    	}
    
    	/* step 5: check the ACK field */
    	if (th->ack) {
    		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
    
    
    		switch (sk->sk_state) {
    
    		case TCP_SYN_RECV:
    			if (acceptable) {
    				tp->copied_seq = tp->rcv_nxt;
    
    				smp_mb();
    
    				tcp_set_state(sk, TCP_ESTABLISHED);
    				sk->sk_state_change(sk);
    
    				/* Note, that this wakeup is only for marginal
    				 * crossed SYN case. Passively open sockets
    				 * are not waked up, because sk->sk_sleep ==
    				 * NULL and sk->sk_socket == NULL.
    				 */
    				if (sk->sk_socket) {
    					sk_wake_async(sk,0,POLL_OUT);
    				}
    
    				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
    				tp->snd_wnd = ntohs(th->window) <<
    					      tp->rx_opt.snd_wscale;
    				tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
    					    TCP_SKB_CB(skb)->seq);
    
    				/* tcp_ack considers this ACK as duplicate
    				 * and does not calculate rtt.
    				 * Fix it at least with timestamps.
    				 */
    				if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
    				    !tp->srtt)