tcp_input.c
    static bool tcp_prune_ofo_queue(struct sock *sk);
    
    static int tcp_prune_queue(struct sock *sk);
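/* tcp_try_rmem_schedule() - memory admission for a segment about to be queued.
 * If the receive buffer or the socket memory accounting cannot take @size more
 * bytes, escalate from collapsing the queues (tcp_prune_queue) to dropping the
 * whole out-of-order queue, and only accept the skb once the charge succeeds.
 * Returns 0 if the segment may be queued, -1 if it must be dropped.
 */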
    
    
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
				 unsigned int size)
{
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	    !sk_rmem_schedule(sk, skb, size)) {

		if (tcp_prune_queue(sk) < 0)
			return -1;

		if (!sk_rmem_schedule(sk, skb, size)) {
			if (!tcp_prune_ofo_queue(sk))
				return -1;

			if (!sk_rmem_schedule(sk, skb, size))
				return -1;
		}
	}
	return 0;
}

/**
 * tcp_try_coalesce - try to merge skb to prior one
 * @sk: socket
 * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
 * Packets in ofo or receive queues can stay a long time.
 * Better try to coalesce them right now to avoid future collapses.
 *
 * Returns true if caller should free @from instead of queueing it
 */
static bool tcp_try_coalesce(struct sock *sk,
			     struct sk_buff *to,
			     struct sk_buff *from,
			     bool *fragstolen)
{
    	int delta;
    
	if (tcp_hdr(from)->fin)
		return false;
    
    
	/* It's possible this segment overlaps with prior segment in queue */
    	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
    		return false;
    
    
	if (!skb_try_coalesce(to, from, fragstolen, &delta))
		return false;

	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
	return true;
}
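/* Out-of-order path. The segment is charged against the receive buffer, then
 * inserted into tp->out_of_order_queue, which is kept sorted by sequence
 * number. Adjacent segments are coalesced when that is cheap, fully duplicated
 * ones are dropped with a D-SACK, and the SACK blocks advertised to the peer
 * are updated to describe the data sitting above the hole.
 *
 * Illustrative example (not from the source): with rcv_nxt == 1000, a segment
 * covering [2000,3000) is queued here and SACKed; it is only merged into the
 * receive queue by tcp_ofo_queue() once the [1000,2000) hole is filled.
 */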
    
    static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb1;
    	u32 seq, end_seq;
    
    	TCP_ECN_check_ce(tp, skb);
    
    
	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
    
    		__kfree_skb(skb);
    		return;
    	}
    
    	/* Disable header prediction. */
    	tp->pred_flags = 0;
    	inet_csk_schedule_ack(sk);
    
    
    	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
    
    	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
    		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    
    	skb1 = skb_peek_tail(&tp->out_of_order_queue);
    	if (!skb1) {
    		/* Initial out of order segment, build 1 SACK. */
    		if (tcp_is_sack(tp)) {
    			tp->rx_opt.num_sacks = 1;
    			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
    			tp->selective_acks[0].end_seq =
    						TCP_SKB_CB(skb)->end_seq;
    		}
    		__skb_queue_head(&tp->out_of_order_queue, skb);
    		goto end;
    	}
    
    	seq = TCP_SKB_CB(skb)->seq;
    	end_seq = TCP_SKB_CB(skb)->end_seq;
    
	if (seq == TCP_SKB_CB(skb1)->end_seq) {
		bool fragstolen;

		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
		} else {
			kfree_skb_partial(skb, fragstolen);
			skb = NULL;
		}

    		if (!tp->rx_opt.num_sacks ||
    		    tp->selective_acks[0].end_seq != seq)
    			goto add_sack;
    
    		/* Common case: data arrive in order after hole. */
    		tp->selective_acks[0].end_seq = end_seq;
    		goto end;
    	}
    
    	/* Find place to insert this segment. */
    	while (1) {
    		if (!after(TCP_SKB_CB(skb1)->seq, seq))
    			break;
    		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
    			skb1 = NULL;
    			break;
    		}
    		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
    	}
    
	/* Does skb overlap the previous one? */
    	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
    		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    			/* All the bits are present. Drop. */
    
    			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
    
    			__kfree_skb(skb);
    			skb = NULL;
    			tcp_dsack_set(sk, seq, end_seq);
    			goto add_sack;
    		}
    		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
    			/* Partial overlap. */
    			tcp_dsack_set(sk, seq,
    				      TCP_SKB_CB(skb1)->end_seq);
    		} else {
    			if (skb_queue_is_first(&tp->out_of_order_queue,
    					       skb1))
    				skb1 = NULL;
    			else
    				skb1 = skb_queue_prev(
    					&tp->out_of_order_queue,
    					skb1);
    		}
    	}
    	if (!skb1)
    		__skb_queue_head(&tp->out_of_order_queue, skb);
    	else
    		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
    
    	/* And clean segments covered by new one as whole. */
    	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
    		skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
    
    		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
    			break;
    		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
    					 end_seq);
    			break;
    		}
    		__skb_unlink(skb1, &tp->out_of_order_queue);
    		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
    				 TCP_SKB_CB(skb1)->end_seq);
    
    		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
    
    		__kfree_skb(skb1);
    	}
    
    add_sack:
    	if (tcp_is_sack(tp))
    		tcp_sack_new_ofo_skb(sk, seq, end_seq);
    end:
    	if (skb)
    		skb_set_owner_r(skb, sk);
    }
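/* tcp_queue_rcv(): append an in-sequence skb to sk_receive_queue, first trying
 * to coalesce it with the current tail. Returns nonzero ("eaten") when the
 * payload was merged into the tail skb, in which case the caller must free
 * the original skb itself (see the kfree_skb_partial() calls in the callers).
 */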
    
    
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
				      bool *fragstolen)
    {
    	int eaten;
    	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
    
    	__skb_pull(skb, hdrlen);
    	eaten = (tail &&
    		 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
    	tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    	if (!eaten) {
    		__skb_queue_tail(&sk->sk_receive_queue, skb);
    		skb_set_owner_r(skb, sk);
    	}
    	return eaten;
    }
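/* tcp_send_rcvq(): build a pseudo segment around user-supplied data and feed
 * it through tcp_queue_rcv() as if it had just arrived from the network,
 * advancing rcv_nxt accordingly. This is the helper used by the TCP repair
 * machinery (TCP_REPAIR) to inject data directly into the receive queue.
 */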
    
    int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
    {
    
    	struct sk_buff *skb = NULL;
    
    	struct tcphdr *th;
    	bool fragstolen;
    
    
    	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
    	if (!skb)
    		goto err;
    
    
    	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
    		goto err_free;
    
    
    	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
    	skb_reset_transport_header(skb);
    	memset(th, 0, sizeof(*th));
    
    	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
    		goto err_free;
    
    	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
    	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
    
    	if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
    		WARN_ON_ONCE(fragstolen); /* should not happen */
    		__kfree_skb(skb);
    	}
    	return size;
    
    err_free:
    	kfree_skb(skb);
    err:
    	return -ENOMEM;
    }
    
    
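/* tcp_data_queue(): the slow-path queueing routine for received data.
 * In-sequence segments go to sk_receive_queue (or straight into the waiting
 * reader's iovec when recvmsg() is blocked on this socket), pure retransmits
 * are answered with a D-SACK and an immediate ACK, segments entirely beyond
 * the window are dropped, a segment straddling rcv_nxt has its already-seen
 * head D-SACKed before being queued, and anything else is handed to
 * tcp_data_queue_ofo().
 */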
    static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
    {
    
    	const struct tcphdr *th = tcp_hdr(skb);
    
    	struct tcp_sock *tp = tcp_sk(sk);
    	int eaten = -1;
    
    	bool fragstolen = false;
    
    
    	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
    		goto drop;
    
    
    	skb_dst_drop(skb);
    
    	__skb_pull(skb, th->doff * 4);
    
    
    	TCP_ECN_accept_cwr(tp, skb);
    
    
    	tp->rx_opt.dsack = 0;
    
    
    	/*  Queue data for delivery to the user.
    	 *  Packets in sequence go to the receive queue.
    	 *  Out of sequence packets to the out_of_order_queue.
    	 */
    	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
    		if (tcp_receive_window(tp) == 0)
    			goto out_of_window;
    
    		/* Ok. In sequence. In window. */
    		if (tp->ucopy.task == current &&
    		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
    		    sock_owned_by_user(sk) && !tp->urg_data) {
    			int chunk = min_t(unsigned int, skb->len,
    
    					  tp->ucopy.len);
    
    
    			__set_current_state(TASK_RUNNING);
    
    			local_bh_enable();
    			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
    				tp->ucopy.len -= chunk;
    				tp->copied_seq += chunk;
    
    				eaten = (chunk == skb->len);
    
    				tcp_rcv_space_adjust(sk);
    			}
    			local_bh_disable();
    		}
    
    		if (eaten <= 0) {
    queue_and_out:
			if (eaten < 0 &&
			    tcp_try_rmem_schedule(sk, skb, skb->truesize))
				goto drop;

			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
    
    		}
    		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    
		if (skb->len)
			tcp_event_data_recv(sk, skb);
    
		if (th->fin)
			tcp_fin(sk);

		if (!skb_queue_empty(&tp->out_of_order_queue)) {
			tcp_ofo_queue(sk);
    
    			/* RFC2581. 4.2. SHOULD send immediate ACK, when
    			 * gap in queue is filled.
    			 */
    
    			if (skb_queue_empty(&tp->out_of_order_queue))
    
    				inet_csk(sk)->icsk_ack.pingpong = 0;
    
    		}
    
    		if (tp->rx_opt.num_sacks)
    			tcp_sack_remove(tp);
    
    
		tcp_fast_path_check(sk);

    		if (eaten > 0)
    			kfree_skb_partial(skb, fragstolen);
    
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_data_ready(sk, 0);
    		return;
    	}
    
    	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
    		/* A retransmit, 2nd most common case.  Force an immediate ack. */
    
    		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
    
    		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    
    
    out_of_window:
    
    		tcp_enter_quickack_mode(sk);
    		inet_csk_schedule_ack(sk);
    
    drop:
    		__kfree_skb(skb);
    		return;
    	}
    
    	/* Out of window. F.e. zero window probe. */
    	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
    		goto out_of_window;
    
    
    	tcp_enter_quickack_mode(sk);
    
    
    	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    		/* Partial packet, seq < rcv_next < end_seq */
    		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
    			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
    			   TCP_SKB_CB(skb)->end_seq);
    
    
    		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
    
    		/* If window is closed, drop tail of packet. But after
    		 * remembering D-SACK for its head made in previous line.
    		 */
    		if (!tcp_receive_window(tp))
    			goto out_of_window;
    		goto queue_and_out;
    	}
    
    
	tcp_data_queue_ofo(sk, skb);
}
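/* tcp_collapse_one(): unlink @skb from @list, free it, account it in the
 * TCPRcvCollapsed counter and return the following skb (NULL if @skb was
 * the last element of the list).
 */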
    
    static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
    					struct sk_buff_head *list)
    {
    
    	struct sk_buff *next = NULL;
    
    	if (!skb_queue_is_last(list, skb))
    		next = skb_queue_next(list, skb);
    
    
    	__skb_unlink(skb, list);
    	__kfree_skb(skb);
    	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
    
    	return next;
    }
    
    
    /* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
 * If tail is NULL, this means until the end of the list.
 *
     * Segments with FIN/SYN are not collapsed (only because this
     * simplifies code)
     */
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list,
	     struct sk_buff *head, struct sk_buff *tail,
	     u32 start, u32 end)
{
    
    	struct sk_buff *skb, *n;
    	bool end_of_skbs;
    
	/* First, check that queue is collapsible and find
	 * the point where collapsing can be useful.
	 */
    
    	skb = head;
    restart:
    	end_of_skbs = true;
    	skb_queue_walk_from_safe(list, skb, n) {
    		if (skb == tail)
    			break;
    
    		/* No new bits? It is possible on ofo queue. */
		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
			skb = tcp_collapse_one(sk, skb, list);
			if (!skb)
				break;
			goto restart;
		}
    
    		/* The first skb to collapse is:
    		 * - not SYN/FIN and
    		 * - bloated or contains data before "start" or
    		 *   overlaps to the next one.
    		 */
    
		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
		    (tcp_win_from_space(skb->truesize) > skb->len ||
		     before(TCP_SKB_CB(skb)->seq, start))) {
			end_of_skbs = false;
			break;
		}
    
    		if (!skb_queue_is_last(list, skb)) {
    			struct sk_buff *next = skb_queue_next(list, skb);
    			if (next != tail &&
    			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
    				end_of_skbs = false;
    				break;
    			}
    		}
    
    
    		/* Decided to skip this, advance start seq. */
    		start = TCP_SKB_CB(skb)->end_seq;
    	}
    
	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
		return;
    
    	while (before(start, end)) {
    		struct sk_buff *nskb;
    
		unsigned int header = skb_headroom(skb);
		int copy = SKB_MAX_ORDER(header, 0);
    
    		/* Too big header? This can happen with IPv6. */
    		if (copy < 0)
    			return;
    
    		if (end - start < copy)
    			copy = end - start;
    		nskb = alloc_skb(copy + header, GFP_ATOMIC);
    
    		if (!nskb)
    			return;
    
    		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
    
    		skb_set_network_header(nskb, (skb_network_header(skb) -
    					      skb->head));
    		skb_set_transport_header(nskb, (skb_transport_header(skb) -
    						skb->head));
    
    		skb_reserve(nskb, header);
    		memcpy(nskb->head, skb->head, header);
    		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
    		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
    
    		__skb_queue_before(list, skb, nskb);
    
    		skb_set_owner_r(nskb, sk);
    
    
    		/* Copy data, releasing collapsed skbs. */
    		while (copy > 0) {
    			int offset = start - TCP_SKB_CB(skb)->seq;
    			int size = TCP_SKB_CB(skb)->end_seq - start;
    
    
    			if (size > 0) {
    				size = min(copy, size);
    				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
    					BUG();
    				TCP_SKB_CB(nskb)->end_seq += size;
    				copy -= size;
    				start += size;
    			}
			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				skb = tcp_collapse_one(sk, skb, list);
				if (!skb ||
				    skb == tail ||
				    tcp_hdr(skb)->syn ||
				    tcp_hdr(skb)->fin)
					return;
			}
    		}
    	}
    }
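/* Note on tcp_collapse(): the win comes from skb->truesize, not skb->len.
 * Many small segments each carry the full overhead of an skb plus its
 * (often rounded-up) data buffer, so a queue can exhaust sk_rcvbuf while
 * holding little actual payload. Copying the bytes into a few compact,
 * freshly allocated skbs shrinks the truesize charged to the socket without
 * losing any data. (Rough illustration, not from the source: hundreds of
 * 1-byte segments, each costing several hundred bytes of truesize, can be
 * replaced by a single linear skb.)
 */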
    
    /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
     * and tcp_collapse() them until all the queue is collapsed.
     */
    static void tcp_collapse_ofo_queue(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
    	struct sk_buff *head;
    	u32 start, end;
    
    	if (skb == NULL)
    		return;
    
    	start = TCP_SKB_CB(skb)->seq;
    	end = TCP_SKB_CB(skb)->end_seq;
    	head = skb;
    
    	for (;;) {
    
    		struct sk_buff *next = NULL;
    
    		if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
    			next = skb_queue_next(&tp->out_of_order_queue, skb);
    		skb = next;
    
    
		/* Segment is terminated when we see gap or when
		 * we are at the end of all the queue. */
		if (!skb ||
		    after(TCP_SKB_CB(skb)->seq, end) ||
		    before(TCP_SKB_CB(skb)->end_seq, start)) {
    
    			tcp_collapse(sk, &tp->out_of_order_queue,
    				     head, skb, start, end);
    
			head = skb;
			if (!skb)
				break;
    			/* Start new segment */
    			start = TCP_SKB_CB(skb)->seq;
    			end = TCP_SKB_CB(skb)->end_seq;
    		} else {
    			if (before(TCP_SKB_CB(skb)->seq, start))
    				start = TCP_SKB_CB(skb)->seq;
    			if (after(TCP_SKB_CB(skb)->end_seq, end))
    				end = TCP_SKB_CB(skb)->end_seq;
    		}
    	}
    }
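/* Illustrative example for tcp_collapse_ofo_queue() (not from the source):
 * an out-of-order queue holding [100,200) [200,300) [400,500) is split at the
 * gap into two contiguous runs, and tcp_collapse() is invoked once for
 * [100,300) and once for [400,500).
 */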
    
    
/*
 * Purge the out-of-order queue.
 * Return true if queue was pruned.
 */
static bool tcp_prune_ofo_queue(struct sock *sk)
    
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	bool res = false;
    
    
    	if (!skb_queue_empty(&tp->out_of_order_queue)) {
    
    		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
    
    		__skb_queue_purge(&tp->out_of_order_queue);
    
    		/* Reset SACK state.  A conforming SACK implementation will
    		 * do the same at a timeout based retransmit.  When a connection
    		 * is in a sad state like this, we care only about integrity
    		 * of the connection not performance.
    		 */
    		if (tp->rx_opt.sack_ok)
    			tcp_sack_reset(&tp->rx_opt);
    		sk_mem_reclaim(sk);
    
		res = true;
	}
	return res;
}

    /* Reduce allocated memory if we can, trying to get
     * the socket within its memory limits again.
     *
     * Return less than zero if we should start dropping frames
     * until the socket owning process reads some of the data
     * to stabilize the situation.
     */
    static int tcp_prune_queue(struct sock *sk)
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
    
    
    	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
    
    
	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		tcp_clamp_window(sk);
	else if (sk_under_memory_pressure(sk))
		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
    
    	tcp_collapse_ofo_queue(sk);
    
    	if (!skb_queue_empty(&sk->sk_receive_queue))
    		tcp_collapse(sk, &sk->sk_receive_queue,
    			     skb_peek(&sk->sk_receive_queue),
    			     NULL,
    			     tp->copied_seq, tp->rcv_nxt);
    
	sk_mem_reclaim(sk);

    	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    		return 0;
    
    	/* Collapsing did not help, destructive actions follow.
    	 * This must not ever occur. */
    
    
    	tcp_prune_ofo_queue(sk);
    
	sk_mem_reclaim(sk);
    
    	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    		return 0;
    
    	/* If we are really being abused, tell the caller to silently
    	 * drop receive data on the floor.  It will get retransmitted
    	 * and hopefully then we'll have sufficient space.
    	 */
    
    	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
    
    
    	/* Massive buffer overcommit. */
    	tp->pred_flags = 0;
    	return -1;
    }
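/* Note: tcp_prune_queue() escalates gradually: clamp the advertised window
 * (or shrink rcv_ssthresh under memory pressure), collapse the out-of-order
 * queue, collapse the receive queue itself, and as a last resort purge the
 * out-of-order queue completely. Only if the socket is still over its limit
 * does it return -1, telling the caller to drop incoming data on the floor.
 */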
    
    /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
     * As additional protections, we do not touch cwnd in retransmission phases,
     * and if application hit its sndbuf limit recently.
     */
    void tcp_cwnd_application_limited(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
    		/* Limited by application or receiver window. */
    
    		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
    		u32 win_used = max(tp->snd_cwnd_used, init_win);
    
    		if (win_used < tp->snd_cwnd) {
    
    			tp->snd_ssthresh = tcp_current_ssthresh(sk);
    
    			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
    		}
    		tp->snd_cwnd_used = 0;
    	}
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    }
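/* Worked example for tcp_cwnd_application_limited() (illustrative numbers):
 * with snd_cwnd == 40 and win_used == 10, the new congestion window becomes
 * (40 + 10) >> 1 == 25, i.e. cwnd decays towards what was actually used,
 * while ssthresh keeps a record of the old operating point.
 */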
    
    
static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
    
    	/* If the user specified a specific send buffer setting, do
    	 * not modify it.
    	 */
    	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
    
    		return false;
    
    
    	/* If we are under global TCP memory pressure, do not expand.  */
    
    	if (sk_under_memory_pressure(sk))
    
    		return false;
    
    
    	/* If we are under soft global TCP memory pressure, do not expand.  */
    
    	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
    
    		return false;
    
    
    	/* If we filled the congestion window, do not expand.  */
    	if (tp->packets_out >= tp->snd_cwnd)
    
    		return false;
    
	return true;
}
    
/* When an incoming ACK allowed us to free some skb from the write_queue,
 * we remember this event in the SOCK_QUEUE_SHRUNK flag and wake the socket
 * up on exit from the tcp input handler.
     *
     * PROBLEM: sndbuf expansion does not work well with largesend.
     */
    static void tcp_new_space(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	if (tcp_should_expand_sndbuf(sk)) {
    
    		int sndmem = SKB_TRUESIZE(max_t(u32,
    						tp->rx_opt.mss_clamp,
    						tp->mss_cache) +
    					  MAX_TCP_HEADER);
    
    		int demanded = max_t(unsigned int, tp->snd_cwnd,
    
    				     tp->reordering + 1);
    		sndmem *= 2 * demanded;
    
    		if (sndmem > sk->sk_sndbuf)
    			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
    		tp->snd_cwnd_stamp = tcp_time_stamp;
    	}
    
    	sk->sk_write_space(sk);
    }
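/* Worked example for tcp_new_space() (illustrative numbers): with
 * max(mss_clamp, mss_cache) == 1460, snd_cwnd == 10 and reordering == 3,
 * demanded is max(10, 3 + 1) == 10 and the target is
 * sndmem = 2 * 10 * SKB_TRUESIZE(1460 + MAX_TCP_HEADER) bytes, applied only
 * if it exceeds the current sk_sndbuf and capped by sysctl_tcp_wmem[2].
 */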
    
    
static void tcp_check_space(struct sock *sk)
    {
    	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
    		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
    		if (sk->sk_socket &&
    		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
    			tcp_new_space(sk);
    	}
    }
    
    
static inline void tcp_data_snd_check(struct sock *sk)
{
	tcp_push_pending_frames(sk);
    	tcp_check_space(sk);
    }
    
    /*
     * Check if sending an ack is needed.
     */
    static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
	    /* More than one full frame received... */
	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
    	     /* ... and right edge of window advances far enough.
    	      * (tcp_recvmsg() will send ACK otherwise). Or...
    	      */
    
    	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
    
    	    /* We ACK each frame or... */
    
    	    tcp_in_quickack_mode(sk) ||
    
    	    /* We have out of order data. */
    
    	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
    
    		/* Then ack it now */
    		tcp_send_ack(sk);
    	} else {
    		/* Else, send delayed ack. */
    		tcp_send_delayed_ack(sk);
    	}
    }
    
    
static inline void tcp_ack_snd_check(struct sock *sk)
{
	if (!inet_csk_ack_scheduled(sk)) {
    		/* We sent a data segment already. */
    		return;
    	}
    	__tcp_ack_snd_check(sk, 1);
    }
    
    /*
 *	This routine is only called when we have urgent data
 *	signaled. It's the 'slow' part of tcp_urg. It could be
     *	moved inline now as tcp_urg is only called from one
     *	place. We handle URGent data wrong. We have to - as
     *	BSD still doesn't use the correction from RFC961.
     *	For 1003.1g we should support a new option TCP_STDURG to permit
     *	either form (or just set the sysctl tcp_stdurg).
     */
    
static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 ptr = ntohs(th->urg_ptr);
    
    	if (ptr && !sysctl_tcp_stdurg)
    		ptr--;
    	ptr += ntohl(th->seq);
    
    	/* Ignore urgent data that we've already seen and read. */
    	if (after(tp->copied_seq, ptr))
    		return;
    
    	/* Do not replay urg ptr.
    	 *
    	 * NOTE: interesting situation not covered by specs.
    	 * Misbehaving sender may send urg ptr, pointing to segment,
    	 * which we already have in ofo queue. We are not able to fetch
    	 * such data and will stay in TCP_URG_NOTYET until will be eaten
    	 * by recvmsg(). Seems, we are not obliged to handle such wicked
    	 * situations. But it is worth to think about possibility of some
    	 * DoSes using some hypothetical application level deadlock.
    	 */
    	if (before(ptr, tp->rcv_nxt))
    		return;
    
    	/* Do we already have a newer (or duplicate) urgent pointer? */
    	if (tp->urg_data && !after(ptr, tp->urg_seq))
    		return;
    
    	/* Tell the world about our new urgent pointer. */
    	sk_send_sigurg(sk);
    
    	/* We may be adding urgent data when the last byte read was
    	 * urgent. To do this requires some care. We cannot just ignore
    	 * tp->copied_seq since we would read the last urgent byte again
	 * as data, nor can we alter copied_seq until this data arrives
	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
    	 *
    	 * NOTE. Double Dutch. Rendering to plain English: author of comment
    	 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
    	 * and expect that both A and B disappear from stream. This is _wrong_.
	 * Though this happens in BSD with high probability, it is only occasional.
	 * Any application relying on this is buggy. Note also that the fix "works"
	 * only in this artificial test. Insert some normal data between A and B and
	 * we will diverge from BSD again. Verdict: it is better to remove this trap
	 * for buggy users.
    	 */
    	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
    
    	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
    
    		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
    		tp->copied_seq++;
    		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
    
			__skb_unlink(skb, &sk->sk_receive_queue);
    			__kfree_skb(skb);
    		}
    	}
    
    
    	tp->urg_data = TCP_URG_NOTYET;
    	tp->urg_seq = ptr;
    
    
    	/* Disable header prediction. */
    	tp->pred_flags = 0;
    }
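/* Worked example for tcp_check_urg() (illustrative numbers): for a segment
 * with seq == 1000 and urg_ptr == 5, and sysctl_tcp_stdurg == 0, ptr becomes
 * 1000 + (5 - 1) == 1004: the BSD interpretation treats the urgent pointer as
 * pointing just past the urgent byte, so it is decremented to address the
 * urgent byte itself.
 */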
    
    /* This is the 'fast' part of urgent handling. */
    
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* Check if we get a new urgent pointer - normally not. */
    	if (th->urg)
    
    		tcp_check_urg(sk, th);
    
    
    	/* Do we wait for any urgent data? - normally not... */
    	if (tp->urg_data == TCP_URG_NOTYET) {
    		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
    			  th->syn;
    
    
    		/* Is the urgent pointer pointing into this packet? */
    
    		if (ptr < skb->len) {
    			u8 tmp;
    			if (skb_copy_bits(skb, ptr, &tmp, 1))
    				BUG();
    			tp->urg_data = TCP_URG_VALID | tmp;
    			if (!sock_flag(sk, SOCK_DEAD))
    				sk->sk_data_ready(sk, 0);
    		}
    	}
    }
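/* tcp_copy_to_iovec(): copy the payload of @skb (past @hlen header bytes)
 * straight into the user iovec prepared in tp->ucopy, verifying the checksum
 * on the fly when it has not been validated yet, and advance copied_seq.
 * Bottom halves are re-enabled around the copy since touching user memory
 * may fault.
 */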
    
    static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int chunk = skb->len - hlen;
    	int err;
    
    	local_bh_enable();
    
	if (skb_csum_unnecessary(skb))
    		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
    	else
    		err = skb_copy_and_csum_datagram_iovec(skb, hlen,
    						       tp->ucopy.iov);
    
    	if (!err) {
    		tp->ucopy.len -= chunk;
    		tp->copied_seq += chunk;
    		tcp_rcv_space_adjust(sk);
    	}
    
    	local_bh_disable();
    	return err;
    }
    
    
static __sum16 __tcp_checksum_complete_user(struct sock *sk,
					    struct sk_buff *skb)
{
	__sum16 result;

    	if (sock_owned_by_user(sk)) {
    		local_bh_enable();
    		result = __tcp_checksum_complete(skb);
    		local_bh_disable();
    	} else {
    		result = __tcp_checksum_complete(skb);
    	}
    	return result;
    }
    
    
static inline bool tcp_checksum_complete_user(struct sock *sk,
					      struct sk_buff *skb)
{
	return !skb_csum_unnecessary(skb) &&
	       __tcp_checksum_complete_user(sk, skb);
}

    #ifdef CONFIG_NET_DMA
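/* tcp_dma_try_early_copy(): with CONFIG_NET_DMA, try to hand the copy into
 * the user's pinned iovec off to a DMA engine instead of doing it on the CPU.
 * Only attempted when the checksum is already known to be good; otherwise,
 * or once enough data has accumulated, the waiting reader is simply woken up.
 */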
    
static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
				   int hlen)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int chunk = skb->len - hlen;
    	int dma_cookie;
    
    	bool copied_early = false;
    
    
	if (tp->ucopy.wakeup)
		return false;
    
    
	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
		tp->ucopy.dma_chan = net_dma_find_channel();
    
    	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
    
    
    		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
    
    							 skb, hlen,
    							 tp->ucopy.iov, chunk,
    							 tp->ucopy.pinned_list);
    
    
    		if (dma_cookie < 0)
    			goto out;
    
    		tp->ucopy.dma_cookie = dma_cookie;
    
    		copied_early = true;
    
    
    		tp->ucopy.len -= chunk;
    		tp->copied_seq += chunk;
    		tcp_rcv_space_adjust(sk);
    
    		if ((tp->ucopy.len == 0) ||
    
    		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
    
    		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
    			tp->ucopy.wakeup = 1;
    			sk->sk_data_ready(sk, 0);
    		}
    	} else if (chunk > 0) {
    		tp->ucopy.wakeup = 1;
    		sk->sk_data_ready(sk, 0);
    	}
    out:
    	return copied_early;
    }
    #endif /* CONFIG_NET_DMA */
    
    
    /* Does PAWS and seqno based validation of an incoming segment, flags will
     * play significant role here.
     */
    
    static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
    				  const struct tcphdr *th, int syn_inerr)
    
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* RFC1323: H1. Apply PAWS check first. */
    
    	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
    
    	    tcp_paws_discard(sk, skb)) {
    		if (!th->rst) {
    			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
    			tcp_send_dupack(sk, skb);
    			goto discard;
    		}
    		/* Reset is accepted even if it did not pass PAWS. */
    	}
    
    	/* Step 1: check sequence number */
    	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
    		/* RFC793, page 37: "In all states except SYN-SENT, all reset
    		 * (RST) segments are validated by checking their SEQ-fields."
    		 * And page 69: "If an incoming segment is not acceptable,
    		 * an acknowledgment should be sent in reply (unless the RST
    		 * bit is set, if so drop the segment and return)".
    		 */
    
		if (!th->rst) {
			if (th->syn)
				goto syn_challenge;
			tcp_send_dupack(sk, skb);
		}
		goto discard;
	}
    
    	/* Step 2: check RST bit */
    	if (th->rst) {
    
    		/* RFC 5961 3.2 :
    		 * If sequence number exactly matches RCV.NXT, then
    		 *     RESET the connection
    		 * else
    		 *     Send a challenge ACK
    		 */
    		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
    			tcp_reset(sk);
    		else
    			tcp_send_challenge_ack(sk);
    
    		goto discard;
    	}
    
    	/* step 3: check security and precedence [ignored] */
    
    
    	/* step 4: Check for a SYN
	 * RFC 5961 4.2 : Send a challenge ack
    	 */
	if (th->syn) {
syn_challenge:
		if (syn_inerr)
			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
		tcp_send_challenge_ack(sk);
		goto discard;
	}

	return true;

discard:
	__kfree_skb(skb);
	return false;
}
    
    /*
    
     *	TCP receive function for the ESTABLISHED state.