					break;
    				case TCPOLEN_COOKIE_PAIR:
    					/* not yet implemented */
    					break;
    				case TCPOLEN_COOKIE_MIN+0:
    				case TCPOLEN_COOKIE_MIN+2:
    				case TCPOLEN_COOKIE_MIN+4:
    				case TCPOLEN_COOKIE_MIN+6:
    				case TCPOLEN_COOKIE_MAX:
    					/* 16-bit multiple */
    					opt_rx->cookie_plus = opsize;
					*hvpp = ptr;
					break;

    				default:
    					/* ignore option */
					break;
				}
				break;
			}

    			ptr += opsize-2;
    			length -= opsize;
    
    	}
    }
    
    EXPORT_SYMBOL(tcp_parse_options);
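
/* For reference: tcp_parse_aligned_timestamp() below matches, with a single
 * 32-bit compare, the option layout recommended by RFC 1323 Appendix A when
 * timestamps are the only option in use:
 *
 *	byte 0    byte 1    byte 2          byte 3
 *	NOP (1)   NOP (1)   TIMESTAMP (8)   length (10)
 *	TSval (32 bits)
 *	TSecr (32 bits)
 *
 * i.e. the first word equals
 *	htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 *	      (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
 */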
    
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
	const __be32 *ptr = (const __be32 *)(th + 1);

    	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
    			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
    		tp->rx_opt.saw_tstamp = 1;
    		++ptr;
    		tp->rx_opt.rcv_tsval = ntohl(*ptr);
    		++ptr;
    		tp->rx_opt.rcv_tsecr = ntohl(*ptr);
    		return 1;
    	}
    	return 0;
}

    /* Fast parse options. This hopes to only see timestamps.
     * If it is wrong it falls back on tcp_parse_options().
     */
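/* With no options at all the header is 20 bytes, so th->doff == 5; with only
 * the 12-byte aligned timestamp block appended it is 32 bytes, so
 * th->doff == 8.  Those are the two doff values tested below; anything else
 * falls back to the slow path.
 */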
    
    static int tcp_fast_parse_options(const struct sk_buff *skb,
    				  const struct tcphdr *th,
				  struct tcp_sock *tp, const u8 **hvpp)
    {
    
    	/* In the spirit of fast parsing, compare doff directly to constant
    	 * values.  Because equality is used, short doff can be ignored here.
    	 */
	if (th->doff == (sizeof(*th) / 4)) {
    		tp->rx_opt.saw_tstamp = 0;
    		return 0;
	} else if (tp->rx_opt.tstamp_ok &&
		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
		if (tcp_parse_aligned_timestamp(tp, th))
			return 1;
    	}
    
	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
    	return 1;
    }
    
    
    #ifdef CONFIG_TCP_MD5SIG
    /*
     * Parse MD5 Signature option
     */
    
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);

    	/* If the TCP option is too short, we can short cut */
    	if (length < TCPOLEN_MD5SIG)
    		return NULL;
    
    	while (length > 0) {
    		int opcode = *ptr++;
    		int opsize;
    
    		switch(opcode) {
    		case TCPOPT_EOL:
    			return NULL;
    		case TCPOPT_NOP:
    			length--;
    			continue;
    		default:
    			opsize = *ptr++;
    			if (opsize < 2 || opsize > length)
    				return NULL;
			if (opcode == TCPOPT_MD5SIG)
				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
		}
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif

    static inline void tcp_store_ts_recent(struct tcp_sock *tp)
    {
	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
	tp->rx_opt.ts_recent_stamp = get_seconds();
    }
    
    static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
    {
    	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
    		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
    		 * extra check below makes sure this can only happen
    		 * for pure ACK frames.  -DaveM
    		 *
		 * Not only that, it also occurs for expired timestamps.
		 */

		if (tcp_paws_check(&tp->rx_opt, 0))
    			tcp_store_ts_recent(tp);
    	}
    }
    
    /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
     *
     * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
 * it can pass through the stack. So, the following predicate verifies that
     * this segment is not used for anything but congestion avoidance or
     * fast retransmit. Moreover, we even are able to eliminate most of such
     * second order effects, if we apply some small "replay" window (~RTO)
     * to timestamp space.
     *
     * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled quickly,
     * but it guarantees that such events will be very rare and do not affect
     * connection seriously. This doesn't look nice, but alas, PAWS is really
     * buggy extension.
     *
     * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
     * states that events when retransmit arrives after original data are rare.
     * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
     * the biggest problem on large power networks even with minor reordering.
     * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
     * up to bandwidth of 18Gigabit/sec. 8) ]
     */
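/* Rough arithmetic behind the 18 Gbit/sec figure above: with a timestamp
 * clock as slow as 1 Hz, PAWS can still reject a wrapped segment as long as
 * less than 2^31 bytes of sequence space are consumed within one timestamp
 * tick.  2^31 bytes/sec is about 2.1 GB/sec, i.e. roughly 17-18 Gbit/sec.
 */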
    
    
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
    
    	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
    	u32 seq = TCP_SKB_CB(skb)->seq;
    	u32 ack = TCP_SKB_CB(skb)->ack_seq;
    
    	return (/* 1. Pure ACK with correct sequence number. */
    		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
    
    		/* 2. ... and duplicate ACK. */
    		ack == tp->snd_una &&
    
    		/* 3. ... and does not update window. */
    		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
    
		/* 4. ... and sits in replay window. */
		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}

    static inline int tcp_paws_discard(const struct sock *sk,
				   const struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
	       !tcp_disordered_ack(sk, skb);
    }
    
    /* Check segment sequence number for validity.
     *
     * Segment controls are considered valid, if the segment
     * fits to the window after truncation to the window. Acceptability
     * of data (and SYN, FIN, of course) is checked separately.
     * See tcp_data_queue(), for example.
     *
     * Also, controls (RST is main one) are accepted using RCV.WUP instead
     * of RCV.NXT. Peer still did not advance his SND.UNA when we
     * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
     * (borrowed from freebsd)
     */
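/* Example: with rcv_wup == 1000, rcv_nxt == 1500 and a 500 byte receive
 * window, tcp_sequence() accepts any segment whose end_seq is at least 1000
 * and whose seq is at most 2000; everything else is outside the window.
 */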
    
    
static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
    {
    	return	!before(end_seq, tp->rcv_wup) &&
    		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
    }
    
    /* When we get a reset we do this. */
    static void tcp_reset(struct sock *sk)
    {
    	/* We want the right error as BSD sees it (and indeed as we do). */
    	switch (sk->sk_state) {
    
    	case TCP_SYN_SENT:
    		sk->sk_err = ECONNREFUSED;
    		break;
    	case TCP_CLOSE_WAIT:
    		sk->sk_err = EPIPE;
    		break;
    	case TCP_CLOSE:
    		return;
    	default:
		sk->sk_err = ECONNRESET;
	}
    	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();

    	if (!sock_flag(sk, SOCK_DEAD))
    		sk->sk_error_report(sk);
    
    	tcp_done(sk);
    }
    
    /*
     * 	Process the FIN bit. This now behaves as it is supposed to work
     *	and the FIN takes effect when it is validly part of sequence
     *	space. Not before when we get holes.
     *
     *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
     *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
     *	TIME-WAIT)
     *
     *	If we are in FINWAIT-1, a received FIN indicates simultaneous
     *	close and we go into CLOSING (and later onto TIME-WAIT)
     *
     *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
     */
    
static void tcp_fin(struct sock *sk)
    {
	struct tcp_sock *tp = tcp_sk(sk);

	inet_csk_schedule_ack(sk);
    
    	sk->sk_shutdown |= RCV_SHUTDOWN;
    	sock_set_flag(sk, SOCK_DONE);
    
    	switch (sk->sk_state) {
    
    	case TCP_SYN_RECV:
    	case TCP_ESTABLISHED:
    		/* Move to CLOSE_WAIT */
    		tcp_set_state(sk, TCP_CLOSE_WAIT);
    		inet_csk(sk)->icsk_ack.pingpong = 1;
		break;

    	case TCP_CLOSE_WAIT:
    	case TCP_CLOSING:
    		/* Received a retransmission of the FIN, do
    		 * nothing.
    		 */
    		break;
    	case TCP_LAST_ACK:
    		/* RFC793: Remain in the LAST-ACK state. */
		break;

    	case TCP_FIN_WAIT1:
    		/* This case occurs when a simultaneous close
    		 * happens, we must ack the received FIN and
    		 * enter the CLOSING state.
    		 */
    		tcp_send_ack(sk);
    		tcp_set_state(sk, TCP_CLOSING);
    		break;
    	case TCP_FIN_WAIT2:
    		/* Received a FIN -- send ACK and enter TIME_WAIT. */
    		tcp_send_ack(sk);
    		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
    		break;
    	default:
    		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
    		 * cases we should never reach this piece of code.
    		 */
    
		pr_err("%s: Impossible, sk->sk_state=%d\n",
		       __func__, sk->sk_state);
		break;
	}

    	/* It _is_ possible, that we have something out-of-order _after_ FIN.
    	 * Probably, we should reset in this case. For now drop them.
    	 */
	__skb_queue_purge(&tp->out_of_order_queue);
	if (tcp_is_sack(tp))
		tcp_sack_reset(&tp->rx_opt);

    	if (!sock_flag(sk, SOCK_DEAD)) {
    		sk->sk_state_change(sk);
    
    		/* Do not send POLL_HUP for half duplex close. */
    		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}

    static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				  u32 end_seq)
    {
    	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
    		if (before(seq, sp->start_seq))
    			sp->start_seq = seq;
    		if (after(end_seq, sp->end_seq))
    			sp->end_seq = end_seq;
    		return 1;
    	}
    	return 0;
    }
    
    
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
    
    
	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
		int mib_idx;

		if (before(seq, tp->rcv_nxt))
			mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
		else
			mib_idx = LINUX_MIB_TCPDSACKOFOSENT;

		NET_INC_STATS_BH(sock_net(sk), mib_idx);

    		tp->rx_opt.dsack = 1;
    		tp->duplicate_sack[0].start_seq = seq;
    		tp->duplicate_sack[0].end_seq = end_seq;
    	}
    }
    
    
static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
    
    
	if (!tp->rx_opt.dsack)
		tcp_dsack_set(sk, seq, end_seq);
	else
		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
    }
    
    
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_enter_quickack_mode(sk);

		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
    			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    
    			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				end_seq = tp->rcv_nxt;
			tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
    		}
    	}
    
    	tcp_send_ack(sk);
    }
    
    /* These routines update the SACK block as out-of-order packets arrive or
     * in-order packets close up the sequence space.
     */
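/* For example, if the first SACK block has just been extended to cover
 * [100, 300) and a later block reports [250, 400), tcp_sack_maybe_coalesce()
 * below merges them into a single [100, 400) block and decrements num_sacks.
 */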
    static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
    {
    	int this_sack;
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	struct tcp_sack_block *swalk = sp + 1;
    
    	/* See if the recent change to the first SACK eats into
    	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
    		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
    			int i;
    
    			/* Zap SWALK, by moving every further SACK up by one slot.
    			 * Decrease num_sacks.
    			 */
    			tp->rx_opt.num_sacks--;
    
    			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				sp[i] = sp[i + 1];
    			continue;
    		}
    		this_sack++, swalk++;
    	}
    }
    
    static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct tcp_sack_block *sp = &tp->selective_acks[0];
    	int cur_sacks = tp->rx_opt.num_sacks;
    	int this_sack;
    
    	if (!cur_sacks)
    		goto new_sack;
    
    
	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
    		if (tcp_sack_extend(sp, seq, end_seq)) {
			/* Rotate this_sack to the first one. */
			for (; this_sack > 0; this_sack--, sp--)
				swap(*sp, *(sp - 1));
    			if (cur_sacks > 1)
    				tcp_sack_maybe_coalesce(tp);
    			return;
    		}
    	}
    
    	/* Could not find an adjacent existing SACK, build a new one,
    	 * put it at the front, and shift everyone else down.  We
    	 * always know there is at least one SACK present already here.
    	 *
    	 * If the sack array is full, forget about the last one.
	 */
	if (this_sack >= TCP_NUM_SACKS) {
    		this_sack--;
    		tp->rx_opt.num_sacks--;
    		sp--;
	}
	for (; this_sack > 0; this_sack--, sp--)
		*sp = *(sp - 1);
    
    new_sack:
    	/* Build the new head SACK, and we're done. */
    	sp->start_seq = seq;
    	sp->end_seq = end_seq;
    	tp->rx_opt.num_sacks++;
    }
    
    /* RCV.NXT advances, some SACKs should be eaten. */
    
    static void tcp_sack_remove(struct tcp_sock *tp)
    {
    	struct tcp_sack_block *sp = &tp->selective_acks[0];
    	int num_sacks = tp->rx_opt.num_sacks;
    	int this_sack;
    
	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
	if (skb_queue_empty(&tp->out_of_order_queue)) {
    		tp->rx_opt.num_sacks = 0;
    		return;
    	}
    
    
	for (this_sack = 0; this_sack < num_sacks;) {
    		/* Check if the start of the sack is covered by RCV.NXT. */
    		if (!before(tp->rcv_nxt, sp->start_seq)) {
    			int i;
    
			/* RCV.NXT must cover all the block! */
			WARN_ON(before(tp->rcv_nxt, sp->end_seq));
    
    			/* Zap this SACK, by moving forward any other SACKS. */
    			for (i=this_sack+1; i < num_sacks; i++)
    				tp->selective_acks[i-1] = tp->selective_acks[i];
    			num_sacks--;
    			continue;
    		}
    		this_sack++;
    		sp++;
    	}
    
	tp->rx_opt.num_sacks = num_sacks;
    }
    
    /* This one checks to see if we can put data from the
     * out_of_order queue into the receive_queue.
     */
    static void tcp_ofo_queue(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	__u32 dsack_high = tp->rcv_nxt;
    	struct sk_buff *skb;
    
    	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
    		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
    			break;
    
    		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
    			__u32 dsack = dsack_high;
    			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				dsack_high = TCP_SKB_CB(skb)->end_seq;
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
    		}
    
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
			SOCK_DEBUG(sk, "ofo packet was already received\n");
			__skb_unlink(skb, &tp->out_of_order_queue);
    			__kfree_skb(skb);
    			continue;
    		}
    		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
    			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		__skb_unlink(skb, &tp->out_of_order_queue);
    		__skb_queue_tail(&sk->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if (tcp_hdr(skb)->fin)
			tcp_fin(sk);
	}
}

static int tcp_prune_ofo_queue(struct sock *sk);
    static int tcp_prune_queue(struct sock *sk);
    
    
static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
    {
    	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
    	    !sk_rmem_schedule(sk, size)) {
    
    		if (tcp_prune_queue(sk) < 0)
    			return -1;
    
    		if (!sk_rmem_schedule(sk, size)) {
    
    			if (!tcp_prune_ofo_queue(sk))
    				return -1;
    
    
    			if (!sk_rmem_schedule(sk, size))
    				return -1;
    		}
    	}
    	return 0;
    }
    
    
    /**
     * tcp_try_coalesce - try to merge skb to prior one
     * @sk: socket
     * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
 * Packets in ofo or receive queues can stay a long time.
 * Better try to coalesce them right now to avoid future collapses.
 * Returns true if caller should free @from instead of queueing it
 */
static bool tcp_try_coalesce(struct sock *sk,
			     struct sk_buff *to,
			     struct sk_buff *from,
			     bool *fragstolen)
{
	int i, delta, len = from->len;
    
	if (tcp_hdr(from)->fin || skb_cloned(to))
		return false;

	if (len <= skb_tailroom(to)) {
		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
		goto merge;
	}

    	if (skb_has_frag_list(to) || skb_has_frag_list(from))
    		return false;
    
    
    	if (skb_headlen(from) != 0) {
    		struct page *page;
    		unsigned int offset;
    
    		if (skb_shinfo(to)->nr_frags +
    		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
    			return false;
    
    
		if (skb_head_is_locked(from))
			return false;
    
    		delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
    
    		page = virt_to_head_page(from->head);
    		offset = from->data - (unsigned char *)page_address(page);
    
    		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
    				   page, offset, skb_headlen(from));
    		*fragstolen = true;
    	} else {
    		if (skb_shinfo(to)->nr_frags +
    		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
    			return false;
    
    
		delta = from->truesize -
			SKB_TRUESIZE(skb_end_pointer(from) - from->head);
	}

	WARN_ON_ONCE(delta < len);

    	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
    	       skb_shinfo(from)->frags,
    	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
    	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
    
    	if (!skb_cloned(from))
    		skb_shinfo(from)->nr_frags = 0;
    
    	/* if the skb is cloned this does nothing since we set nr_frags to 0 */
    	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
    		skb_frag_ref(from, i);
    
    	to->truesize += delta;
    	atomic_add(delta, &sk->sk_rmem_alloc);
    	sk_mem_charge(sk, delta);
    	to->len += len;
    	to->data_len += len;
    
    merge:
    	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
    	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
    	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
	return true;
}

    static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
    {
    	if (head_stolen)
    		kmem_cache_free(skbuff_head_cache, skb);
    	else
    		__kfree_skb(skb);
    }
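
/* Usage note: when tcp_try_coalesce() returns true it has already linked
 * @from's payload into @to.  If it also set *fragstolen, @from's head was
 * attached to @to as a page fragment, so only the struct sk_buff itself may
 * be recycled; kfree_skb_partial() above implements that choice for callers
 * such as tcp_data_queue() and tcp_data_queue_ofo().
 */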
    
    
    static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb1;
    	u32 seq, end_seq;
    
    	TCP_ECN_check_ce(tp, skb);
    
    	if (tcp_try_rmem_schedule(sk, skb->truesize)) {
    		/* TODO: should increment a counter */
    		__kfree_skb(skb);
    		return;
    	}
    
    	/* Disable header prediction. */
    	tp->pred_flags = 0;
    	inet_csk_schedule_ack(sk);
    
    	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
    		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    
    	skb1 = skb_peek_tail(&tp->out_of_order_queue);
    	if (!skb1) {
    		/* Initial out of order segment, build 1 SACK. */
    		if (tcp_is_sack(tp)) {
    			tp->rx_opt.num_sacks = 1;
    			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
    			tp->selective_acks[0].end_seq =
    						TCP_SKB_CB(skb)->end_seq;
    		}
    		__skb_queue_head(&tp->out_of_order_queue, skb);
    		goto end;
    	}
    
    	seq = TCP_SKB_CB(skb)->seq;
    	end_seq = TCP_SKB_CB(skb)->end_seq;
    
	if (seq == TCP_SKB_CB(skb1)->end_seq) {
		bool fragstolen;

		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
		} else {
			kfree_skb_partial(skb, fragstolen);
			skb = NULL;
		}

    		if (!tp->rx_opt.num_sacks ||
    		    tp->selective_acks[0].end_seq != seq)
    			goto add_sack;
    
    		/* Common case: data arrive in order after hole. */
    		tp->selective_acks[0].end_seq = end_seq;
    		goto end;
    	}
    
    	/* Find place to insert this segment. */
    	while (1) {
    		if (!after(TCP_SKB_CB(skb1)->seq, seq))
    			break;
    		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
    			skb1 = NULL;
    			break;
    		}
    		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
    	}
    
    	/* Do skb overlap to previous one? */
    	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
    		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    			/* All the bits are present. Drop. */
    			__kfree_skb(skb);
    			skb = NULL;
    			tcp_dsack_set(sk, seq, end_seq);
    			goto add_sack;
    		}
    		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
    			/* Partial overlap. */
    			tcp_dsack_set(sk, seq,
    				      TCP_SKB_CB(skb1)->end_seq);
    		} else {
    			if (skb_queue_is_first(&tp->out_of_order_queue,
    					       skb1))
    				skb1 = NULL;
    			else
    				skb1 = skb_queue_prev(
    					&tp->out_of_order_queue,
    					skb1);
    		}
    	}
    	if (!skb1)
    		__skb_queue_head(&tp->out_of_order_queue, skb);
    	else
    		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
    
    	/* And clean segments covered by new one as whole. */
    	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
    		skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
    
    		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
    			break;
    		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
    					 end_seq);
    			break;
    		}
    		__skb_unlink(skb1, &tp->out_of_order_queue);
    		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
    				 TCP_SKB_CB(skb1)->end_seq);
    		__kfree_skb(skb1);
    	}
    
    add_sack:
    	if (tcp_is_sack(tp))
    		tcp_sack_new_ofo_skb(sk, seq, end_seq);
    end:
    	if (skb)
    		skb_set_owner_r(skb, sk);
    }
    
    
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
		  bool *fragstolen)
    {
    	int eaten;
    	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
    
    	__skb_pull(skb, hdrlen);
    	eaten = (tail &&
    		 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
    	tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    	if (!eaten) {
    		__skb_queue_tail(&sk->sk_receive_queue, skb);
    		skb_set_owner_r(skb, sk);
    	}
    	return eaten;
    }
    
    int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
    {
    	struct sk_buff *skb;
    	struct tcphdr *th;
    	bool fragstolen;
    
    
    	if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
    		goto err;
    
    
    	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
    	if (!skb)
    		goto err;
    
    	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
    	skb_reset_transport_header(skb);
    	memset(th, 0, sizeof(*th));
    
    	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
    		goto err_free;
    
    	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
    	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
    
    	if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
    		WARN_ON_ONCE(fragstolen); /* should not happen */
    		__kfree_skb(skb);
    	}
    	return size;
    
    err_free:
    	kfree_skb(skb);
    err:
    	return -ENOMEM;
    }
    
    
    static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
    {
    
	const struct tcphdr *th = tcp_hdr(skb);
    	struct tcp_sock *tp = tcp_sk(sk);
    	int eaten = -1;
    
	bool fragstolen = false;
    
    	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
    		goto drop;
    
    
	skb_dst_drop(skb);
	__skb_pull(skb, th->doff * 4);
    
    	TCP_ECN_accept_cwr(tp, skb);
    
    
	tp->rx_opt.dsack = 0;
    
    	/*  Queue data for delivery to the user.
    	 *  Packets in sequence go to the receive queue.
    	 *  Out of sequence packets to the out_of_order_queue.
    	 */
    	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
    		if (tcp_receive_window(tp) == 0)
    			goto out_of_window;
    
    		/* Ok. In sequence. In window. */
    		if (tp->ucopy.task == current &&
    		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
    		    sock_owned_by_user(sk) && !tp->urg_data) {
			int chunk = min_t(unsigned int, skb->len,
					  tp->ucopy.len);
    
    			__set_current_state(TASK_RUNNING);
    
    			local_bh_enable();
    			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
    				tp->ucopy.len -= chunk;
    				tp->copied_seq += chunk;
    
				eaten = (chunk == skb->len);
    				tcp_rcv_space_adjust(sk);
    			}
    			local_bh_disable();
    		}
    
    		if (eaten <= 0) {
    queue_and_out:
			if (eaten < 0 &&
			    tcp_try_rmem_schedule(sk, skb->truesize))
				goto drop;

			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
    		}
    		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    
		if (skb->len)
			tcp_event_data_recv(sk, skb);
		if (th->fin)
			tcp_fin(sk);

    		if (!skb_queue_empty(&tp->out_of_order_queue)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			tcp_ofo_queue(sk);
    
    			/* RFC2581. 4.2. SHOULD send immediate ACK, when
    			 * gap in queue is filled.
			 */
			if (skb_queue_empty(&tp->out_of_order_queue))
				inet_csk(sk)->icsk_ack.pingpong = 0;
    		}
    
    		if (tp->rx_opt.num_sacks)
			tcp_sack_remove(tp);

		if (eaten > 0)
			kfree_skb_partial(skb, fragstolen);
		else if (!sock_flag(sk, SOCK_DEAD))
    			sk->sk_data_ready(sk, 0);
    		return;
    	}
    
    	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
		/* A retransmit, 2nd most common case.  Force an immediate ack. */
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    
    out_of_window:
    
		tcp_enter_quickack_mode(sk);
		inet_csk_schedule_ack(sk);
    drop:
    		__kfree_skb(skb);
    		return;
    	}
    
    	/* Out of window. F.e. zero window probe. */
    	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
    		goto out_of_window;
    
    
	tcp_enter_quickack_mode(sk);
    
    	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    		/* Partial packet, seq < rcv_next < end_seq */
    		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
    			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
    			   TCP_SKB_CB(skb)->end_seq);
    
    
		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

    		/* If window is closed, drop tail of packet. But after
    		 * remembering D-SACK for its head made in previous line.
    		 */
    		if (!tcp_receive_window(tp))
    			goto out_of_window;
    		goto queue_and_out;
    	}
    
    
	tcp_data_queue_ofo(sk, skb);
}

    static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
    					struct sk_buff_head *list)
    {
    
    	struct sk_buff *next = NULL;
    
    	if (!skb_queue_is_last(list, skb))
    		next = skb_queue_next(list, skb);
    
    
    	__skb_unlink(skb, list);
    	__kfree_skb(skb);
    	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
    
    	return next;
    }
    
    
    /* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
     * If tail is NULL, this means until the end of the list.
 *
     * Segments with FIN/SYN are not collapsed (only because this
     * simplifies code)
     */
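/* Collapsing copies the queued payload into freshly allocated, densely packed
 * skbs, so that truesize (and therefore receive buffer accounting) more
 * closely matches the amount of data actually held.  It is only attempted
 * when the receive queues are over budget (see tcp_prune_queue()).
 */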
static void
    tcp_collapse(struct sock *sk, struct sk_buff_head *list,
    	     struct sk_buff *head, struct sk_buff *tail,
	     u32 start, u32 end)
{
	struct sk_buff *skb, *n;
	bool end_of_skbs;
    
    
	/* First, check that queue is collapsible and find
	 * the point where collapsing can be useful. */
    	skb = head;
    restart:
    	end_of_skbs = true;
    	skb_queue_walk_from_safe(list, skb, n) {
    		if (skb == tail)
			break;
    		/* No new bits? It is possible on ofo queue. */
    		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
    
			skb = tcp_collapse_one(sk, skb, list);
			if (!skb)
				break;
			goto restart;
    		}
    
    		/* The first skb to collapse is:
    		 * - not SYN/FIN and
    		 * - bloated or contains data before "start" or
    		 *   overlaps to the next one.
    		 */
    
		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
		    (tcp_win_from_space(skb->truesize) > skb->len ||
		     before(TCP_SKB_CB(skb)->seq, start))) {
			end_of_skbs = false;
			break;
    		}
    
    		if (!skb_queue_is_last(list, skb)) {
    			struct sk_buff *next = skb_queue_next(list, skb);
    			if (next != tail &&
    			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
    				end_of_skbs = false;
    				break;
    			}
    		}
    
    		/* Decided to skip this, advance start seq. */
    		start = TCP_SKB_CB(skb)->end_seq;
    	}
    
	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
    		return;
    
    	while (before(start, end)) {
    		struct sk_buff *nskb;
    
		unsigned int header = skb_headroom(skb);
    		int copy = SKB_MAX_ORDER(header, 0);
    
    		/* Too big header? This can happen with IPv6. */
    		if (copy < 0)
    			return;