 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
     *    With Minshall's modification: all sent small packets are ACKed.
     */
    
static inline int tcp_nagle_check(const struct tcp_sock *tp,
				  const struct sk_buff *skb,
				  unsigned mss_now, int nonagle)
    {
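	/* Nonzero means "hold this packet back": it is smaller than one MSS
	 * and either TCP_CORK is set, or plain Nagle is in effect (nonagle
	 * is 0) with data still in flight and, per Minshall's check, a
	 * previously sent small packet not yet ACKed.
	 */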
    	return (skb->len < mss_now &&
    		((nonagle&TCP_NAGLE_CORK) ||
    		 (!nonagle &&
    		  tp->packets_out &&
    		  tcp_minshall_check(tp))));
    }
    
    /* Return non-zero if the Nagle test allows this packet to be
     * sent now.
     */
    static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
    				 unsigned int cur_mss, int nonagle)
    {
	/* The Nagle rule does not apply to frames that sit in the middle of
	 * the write_queue (they have no chance to get new data).
    	 *
    	 * This is implemented in the callers, where they modify the 'nonagle'
    	 * argument based upon the location of SKB in the send queue.
    	 */
    	if (nonagle & TCP_NAGLE_PUSH)
    		return 1;
    
    	/* Don't use the nagle rule for urgent data (or for the final FIN).  */
    	if (tp->urg_mode ||
    	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
    		return 1;
    
    	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
    		return 1;
    
    	return 0;
    }
    
    /* Does at least the first segment of SKB fit into the send window? */
    static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
    {
    	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    
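	/* Only the first MSS worth of the skb has to fit in the offered
	 * window; anything beyond cur_mss can be split off before sending.
	 */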
    	if (skb->len > cur_mss)
    		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
    
    	return !after(end_seq, tp->snd_una + tp->snd_wnd);
    }
    
    
/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
 * should be put on the wire right now.  If so, it returns the number of
     * packets allowed by the congestion window.
     */
    static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
    				 unsigned int cur_mss, int nonagle)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cwnd_quota;

	tcp_init_tso_segs(sk, skb, cur_mss);

    	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
    		return 0;
    
    	cwnd_quota = tcp_cwnd_test(tp, skb);
    	if (cwnd_quota &&
    	    !tcp_snd_wnd_test(tp, skb, cur_mss))
    		cwnd_quota = 0;
    
    	return cwnd_quota;
    }
    
    int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
{
	struct sk_buff *skb = tcp_send_head(sk);

    	return (skb &&
    		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
    			     (tcp_skb_is_last(sk, skb) ?
    			      TCP_NAGLE_PUSH :
    			      tp->nonagle)));
    }
    
    /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
     * which is put after SKB on the list.  It is very much like
     * tcp_fragment() except that it may make several kinds of assumptions
     * in order to speed up the splitting operation.  In particular, we
     * know that all the data is in scatter-gather pages, and that the
     * packet has never been sent out before (and thus is not cloned).
     */
    
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
    {
    	struct sk_buff *buff;
    	int nlen = skb->len - len;
    	u16 flags;
    
	/* All of a TSO frame must be composed of paged data.  */
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, skb, len, mss_now);

    	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
    	if (unlikely(buff == NULL))
		return -ENOMEM;

	sk_charge_skb(sk, buff);
	buff->truesize += nlen;
    	skb->truesize -= nlen;
    
    	/* Correct the sequence numbers. */
    	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
    	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
    
    	/* PSH and FIN should only be set in the second packet. */
    	flags = TCP_SKB_CB(skb)->flags;
    	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
    	TCP_SKB_CB(buff)->flags = flags;
    
    	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;

    	skb_split(skb, buff, len);
    
	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* Link BUFF into the send queue. */
	skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

    	return 0;
    }
    
    /* Try to defer sending, if possible, in order to minimize the amount
     * of TSO splitting we do.  View it as a kind of TSO Nagle test.
     *
     * This algorithm is from John Heffner.
     */
    static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 send_win, cong_win, limit, in_flight;

	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
		goto send_now;

	if (icsk->icsk_ca_state != TCP_CA_Open)
		goto send_now;
    
    	/* Defer for less than two clock ticks. */
    	if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1)
    		goto send_now;
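	/* tp->tso_deferred holds (jiffies << 1) | 1 while a deferral is
	 * pending (set at the end of this function); the shift-by-one
	 * arithmetic above compares the stored timestamp with the current
	 * jiffies so a deferral is not extended past ~two clock ticks.
	 */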
    
    	in_flight = tcp_packets_in_flight(tp);
    
    	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
    	       (tp->snd_cwnd <= in_flight));
    
    	send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
    
    	/* From in_flight test above, we know that cwnd > in_flight.  */
    	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
    
	limit = min(send_win, cong_win);

	/* If a full-sized TSO skb can be sent, do it. */
	if (limit >= 65536)
		goto send_now;
    
    	if (sysctl_tcp_tso_win_divisor) {
    		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
    
    		/* If at least some fraction of a window is available,
    		 * just use it.
    		 */
    		chunk /= sysctl_tcp_tso_win_divisor;
		if (limit >= chunk)
			goto send_now;
    
    	} else {
    		/* Different approach, try not to defer past a single
    		 * ACK.  Receiver should ACK every other full sized
    		 * frame, so if we have space for more than 3 frames
    		 * then send now.
    		 */
		if (limit > tcp_max_burst(tp) * tp->mss_cache)
			goto send_now;
    
    	}
    
	/* Ok, it looks like it is advisable to defer.  */
	tp->tso_deferred = 1 | (jiffies<<1);

	return 1;

send_now:
	tp->tso_deferred = 0;
	return 0;
}

    /* Create a new MTU probe if we are ready.
     * Returns 0 if we should wait to probe (no cwnd available),
     *         1 if a probe was sent,
     *         -1 otherwise */
    static int tcp_mtu_probe(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    	struct sk_buff *skb, *nskb, *next;
    	int len;
    	int probe_size;
    	unsigned int pif;
    	int copy;
    	int mss_now;
    
    	/* Not currently probing/verifying,
    	 * not in recovery,
    	 * have enough cwnd, and
    	 * not SACKing (the variable headers throw things off) */
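	/* (The snd_cwnd < 11 bound below is presumably chosen so that a
	 *  roughly two-MSS probe still leaves plenty of congestion window
	 *  for regular traffic.) */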
    	if (!icsk->icsk_mtup.enabled ||
    	    icsk->icsk_mtup.probe_size ||
    	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
    	    tp->snd_cwnd < 11 ||
    	    tp->rx_opt.eff_sacks)
    		return -1;
    
    	/* Very simple search strategy: just double the MSS. */
    	mss_now = tcp_current_mss(sk, 0);
    	probe_size = 2*tp->mss_cache;
    	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
    		/* TODO: set timer for probe_converge_event */
    		return -1;
    	}
    
    	/* Have enough data in the send queue to probe? */
	len = 0;
	if ((skb = tcp_send_head(sk)) == NULL)
		return -1;
	while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
		skb = tcp_write_queue_next(sk, skb);
    	if (len < probe_size)
    		return -1;
    
    	/* Receive window check. */
    	if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
    		if (tp->snd_wnd < probe_size)
    			return -1;
    		else
    			return 0;
    	}
    
    	/* Do we need to wait to drain cwnd? */
    	pif = tcp_packets_in_flight(tp);
    	if (pif + 2 > tp->snd_cwnd) {
    		/* With no packets in flight, don't stall. */
    		if (pif == 0)
    			return -1;
    		else
    			return 0;
    	}
    
    	/* We're allowed to probe.  Build it now. */
    	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
    		return -1;
	sk_charge_skb(sk, nskb);

	skb = tcp_send_head(sk);
	tcp_insert_write_queue_before(nskb, skb, sk);
	tcp_advance_send_head(sk, skb);

    	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
    	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
    	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
    	TCP_SKB_CB(nskb)->sacked = 0;
    	nskb->csum = 0;
    
	nskb->ip_summed = skb->ip_summed;

    	len = 0;
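	/* Fill the probe by absorbing data from the head of the write queue,
	 * consuming whole skbs where possible and trimming the last one. */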
	while (len < probe_size) {
		next = tcp_write_queue_next(sk, skb);

    		copy = min_t(int, skb->len, probe_size - len);
    		if (nskb->ip_summed)
    			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
    		else
			nskb->csum = skb_copy_and_csum_bits(skb, 0,
					 skb_put(nskb, copy), copy, nskb->csum);

    		if (skb->len <= copy) {
    			/* We've eaten all the data from this skb.
    			 * Throw it away. */
    			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
    
			tcp_unlink_write_queue(skb, sk);
    			sk_stream_free_skb(sk, skb);
    		} else {
			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
    			if (!skb_shinfo(skb)->nr_frags) {
				skb_pull(skb, copy);
				if (skb->ip_summed != CHECKSUM_PARTIAL)
    					skb->csum = csum_partial(skb->data, skb->len, 0);
    			} else {
    				__pskb_trim_head(skb, copy);
    				tcp_set_skb_tso_segs(sk, skb, mss_now);
    			}
    			TCP_SKB_CB(skb)->seq += copy;
    		}
    
    		len += copy;
    		skb = next;
    	}
    	tcp_init_tso_segs(sk, nskb, nskb->len);
    
    	/* We're ready to send.  If this fails, the probe will
    	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
    	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
    	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
    		/* Decrement cwnd here because we are sending
    		* effectively two packets. */
    		tp->snd_cwnd--;
    		update_send_head(sk, tp, nskb);
    
    		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
    
    		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;

		return 1;
	}

	return -1;
}

    /* This routine writes packets to the network.  It advances the
     * send_head.  This happens as incoming acks open up the remote
     * window for us.
     *
     * Returns 1, if no segments are in flight and we have queued segments, but
     * cannot send anything now because of SWS or another problem.
     */
    
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;

    	/* If we are closed, the bytes will have to remain here.
    	 * In time closedown will finish, we empty the write queue and all
    	 * will be happy.
    	 */
    
    	if (unlikely(sk->sk_state == TCP_CLOSE))
		return 0;

	sent_pkts = 0;

    	/* Do MTU probing. */
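	/* A zero result means a probe is waiting for the congestion window
	 * to drain, so send nothing else yet; a positive result means the
	 * probe skb itself went out and counts toward sent_pkts. */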
    	if ((result = tcp_mtu_probe(sk)) == 0) {
    		return 0;
    	} else if (result > 0) {
    		sent_pkts = 1;
	}

	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
		BUG_ON(!tso_segs);

    		cwnd_quota = tcp_cwnd_test(tp, skb);
    		if (!cwnd_quota)
    			break;
    
    		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
    			break;
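
		/* A single-segment skb goes through the ordinary Nagle test;
		 * a multi-segment (TSO) skb instead asks whether deferring a
		 * little longer would avoid splitting it. */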
    		if (tso_segs == 1) {
    			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
    						     (tcp_skb_is_last(sk, skb) ?
    						      nonagle : TCP_NAGLE_PUSH))))
    				break;
    		} else {
    			if (tcp_tso_should_defer(sk, tp, skb))
    				break;
    		}
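
		/* Decide how much of this skb may go out now: a TSO skb is
		 * capped by what the send and congestion windows allow, and
		 * when the skb itself is shorter than that cap, any sub-MSS
		 * tail is held back so only full-sized segments are sent. */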
		limit = mss_now;
		if (tso_segs > 1) {
			limit = tcp_window_allows(tp, skb,
						  mss_now, cwnd_quota);

    			if (skb->len < limit) {
    				unsigned int trim = skb->len % mss_now;
    
    				if (trim)
    					limit = skb->len - trim;
			}
		}

    		if (skb->len > limit &&
    		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
    			break;
    
    
    		TCP_SKB_CB(skb)->when = tcp_time_stamp;
    
		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
			break;

    		/* Advance the send_head.  This one is sent out.
    		 * This call will increment packets_out.
    		 */
		update_send_head(sk, tp, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts++;
	}

	if (likely(sent_pkts)) {
		tcp_cwnd_validate(sk, tp);
		return 0;
	}

	return !tp->packets_out && tcp_send_head(sk);
}

    /* Push out any pending frames which were held back due to
     * TCP_CORK or attempt at coalescing tiny packets.
     * The socket must be locked by the caller.
     */
void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
			       unsigned int cur_mss, int nonagle)
{
	struct sk_buff *skb = tcp_send_head(sk);

	if (skb) {
		if (tcp_write_xmit(sk, cur_mss, nonagle))
			tcp_check_probe_timer(sk, tp);
	}
}

    /* Send _single_ skb sitting at the send head. This function requires
     * true push pending frames to setup probe timer etc.
     */
    void tcp_push_one(struct sock *sk, unsigned int mss_now)
    {
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = tcp_send_head(sk);
	unsigned int tso_segs, cwnd_quota;

	BUG_ON(!skb || skb->len < mss_now);

	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
    
	if (likely(cwnd_quota)) {
		unsigned int limit;

		BUG_ON(!tso_segs);

		limit = mss_now;
		if (tso_segs > 1) {
    			limit = tcp_window_allows(tp, skb,
    						  mss_now, cwnd_quota);
    
    
    			if (skb->len < limit) {
    				unsigned int trim = skb->len % mss_now;
    
    				if (trim)
    					limit = skb->len - trim;
    			}
		}

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
			return;

		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;

    		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
    
    			update_send_head(sk, tp, skb);
    			tcp_cwnd_validate(sk, tp);
    			return;
    		}
    	}
    }
    
    /* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
     * 1. The window can never be shrunk once it is offered (RFC 793)
     * 2. We limit memory per socket
     *
     * RFC 1122:
     * "the suggested [SWS] avoidance algorithm for the receiver is to keep
     *  RECV.NEXT + RCV.WIN fixed until:
     *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
     *
     * i.e. don't raise the right edge of the window until you can raise
     * it at least MSS bytes.
     *
     * Unfortunately, the recommended algorithm breaks header prediction,
     * since header prediction assumes th->window stays fixed.
     *
     * Strictly speaking, keeping th->window fixed violates the receiver
     * side SWS prevention criteria. The problem is that under this rule
     * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
     *	If the free space is less than the 1/4 of the maximum
     *	space available and the free space is less than 1/2 mss,
     *	then set the window to 0.
     *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
     *	Otherwise, just prevent the window from shrinking
     *	and from being larger than the largest representable value.
     *
     * This prevents incremental opening of the window in the regime
     * where TCP is limited by the speed of the reader side taking
     * data out of the TCP receive queue. It does nothing about
     * those cases where the window is constrained on the sender side
     * because the pipeline is full.
     *
     * BSD also seems to "accidentally" limit itself to windows that are a
     * multiple of MSS, at least until the free space gets quite small.
     * This would appear to be a side effect of the mbuf implementation.
     * Combining these two algorithms results in the observed behavior
     * of having a fixed window size at almost all times.
     *
     * Below we obtain similar behavior by forcing the offered window to
     * a multiple of the mss when it is feasible to do so.
     *
     * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
     * Regular options like TIMESTAMP are taken into account.
     */
    u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
    	/* MSS for the peer's data.  Previous versions used mss_clamp
    	 * here.  I don't know if the value based on our guesses
    	 * of peer's MSS is better for the performance.  It's more correct
    	 * but may be worse for the performance because of rcv_mss
    	 * fluctuations.  --SAW  1998/11/1
    	 */
    
    	int mss = icsk->icsk_ack.rcv_mss;
    	int free_space = tcp_space(sk);
    	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
    	int window;
    
	if (mss > full_space)
		mss = full_space;

	if (free_space < full_space/2) {
		icsk->icsk_ack.quick = 0;

    		if (tcp_memory_pressure)
    			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
    
    		if (free_space < mss)
    			return 0;
    	}
    
    	if (free_space > tp->rcv_ssthresh)
    		free_space = tp->rcv_ssthresh;
    
    	/* Don't do rounding if we are using window scaling, since the
    	 * scaled window will not line up with the MSS boundary anyway.
    	 */
    	window = tp->rcv_wnd;
    	if (tp->rx_opt.rcv_wscale) {
    		window = free_space;
    
    		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
    		 * 1<<rcv_wscale > mss.
    		 */
    		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
    			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
    				  << tp->rx_opt.rcv_wscale);
    	} else {
    		/* Get the largest window that is a nice multiple of mss.
    		 * Window clamp already applied above.
    		 * If our current window offering is within 1 mss of the
    		 * free space we just keep it. This prevents the divide
    		 * and multiply from happening most of the time.
    		 * We also don't do any window rounding when the free space
    		 * is too small.
    		 */
    		if (window <= free_space - mss || window > free_space)
    			window = (free_space/mss)*mss;
    
    		else if (mss == full_space &&
    		         free_space > window + full_space/2)
    			window = free_space;
    	}
    
    	return window;
    }
    
    /* Attempt to collapse two adjacent SKB's during retransmission. */
    static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);

    	/* The first test we must make is that neither of these two
    	 * SKB's are still referenced by someone else.
    	 */
    	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
    		int skb_size = skb->len, next_skb_size = next_skb->len;
    		u16 flags = TCP_SKB_CB(skb)->flags;
    
    		/* Also punt if next skb has been SACK'd. */
    		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
    			return;
    
    		/* Next skb is out of window. */
    		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
    			return;
    
    		/* Punt if not enough space exists in the first SKB for
    		 * the data in the second, or the total combined payload
    		 * would exceed the MSS.
    		 */
    		if ((next_skb_size > skb_tailroom(skb)) ||
    		    ((skb_size + next_skb_size) > mss_now))
    			return;
    
    		BUG_ON(tcp_skb_pcount(skb) != 1 ||
		       tcp_skb_pcount(next_skb) != 1);

		/* changing transmit queue under us so clear hints */
		clear_all_retrans_hints(tp);

		/* Ok.	We will be able to collapse the packet. */
		tcp_unlink_write_queue(next_skb, sk);

		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);

    		if (next_skb->ip_summed == CHECKSUM_PARTIAL)
    			skb->ip_summed = CHECKSUM_PARTIAL;
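		/* If either skb used hardware checksumming the merged skb
		 * does too (the checksum is redone at transmit time);
		 * otherwise fold next_skb's software checksum into skb's. */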

		if (skb->ip_summed != CHECKSUM_PARTIAL)
    			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
    
    		/* Update sequence range on original skb. */
    		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
    
    		/* Merge over control information. */
    		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
    		TCP_SKB_CB(skb)->flags = flags;
    
    		/* All done, get rid of second SKB and account for it so
    		 * packet counting does not break.
    		 */
    		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
    		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
    			tp->retrans_out -= tcp_skb_pcount(next_skb);
    		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
    			tp->lost_out -= tcp_skb_pcount(next_skb);
    			tp->left_out -= tcp_skb_pcount(next_skb);
    		}
    		/* Reno case is special. Sigh... */
    		if (!tp->rx_opt.sack_ok && tp->sacked_out) {
    			tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
    			tp->left_out -= tcp_skb_pcount(next_skb);
    		}
    
    		/* Not quite right: it can be > snd.fack, but
    		 * it is better to underestimate fackets.
    		 */
    		tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
    		tcp_packets_out_dec(tp, next_skb);
    		sk_stream_free_skb(sk, next_skb);
    	}
    }
    
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
    void tcp_simple_retransmit(struct sock *sk)
    {
    
    	const struct inet_connection_sock *icsk = inet_csk(sk);
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb;
    	unsigned int mss = tcp_current_mss(sk, 0);
    	int lost = 0;
    
    
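	/* With a reduced MSS, any queued skb that is now larger than one MSS
	 * and has not been SACKed is treated as lost so it gets re-sent in
	 * smaller pieces. */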
    	tcp_for_write_queue(skb, sk) {
    		if (skb == tcp_send_head(sk))
    			break;
    
		if (skb->len > mss &&
    		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
    			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
    				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
    				tp->retrans_out -= tcp_skb_pcount(skb);
    			}
    			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
    				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    				tp->lost_out += tcp_skb_pcount(skb);
    				lost = 1;
    			}
    		}
	}

	clear_all_retrans_hints(tp);

    	if (!lost)
    		return;
    
	tcp_sync_left_out(tp);

	/* Don't muck with the congestion window here.
    	 * Reason is that we do not increase amount of _data_
    	 * in network, but units changed and effective
    	 * cwnd/ssthresh really reduced now.
    	 */
    
	if (icsk->icsk_ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(sk);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(sk, TCP_CA_Loss);
    	}
    	tcp_xmit_retransmit_queue(sk);
    }
    
    /* This retransmits one SKB.  Policy decisions and retransmit queue
     * state updates are done by the caller.  Returns non-zero if an
     * error occurred which prevented the send.
     */
    int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
    {
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int cur_mss = tcp_current_mss(sk, 0);
	int err;

	/* Inconclusive MTU probe */
    	if (icsk->icsk_mtup.probe_size) {
    		icsk->icsk_mtup.probe_size = 0;
	}

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
    	 */
    	if (atomic_read(&sk->sk_wmem_alloc) >
    	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
    		return -EAGAIN;
    
    	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
    		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
    			BUG();
    		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
    			return -ENOMEM;
    	}
    
    	/* If receiver has shrunk his window, and skb is out of
    	 * new window, do not retransmit it. The exception is the
    	 * case, when window is shrunk to zero. In this case
    	 * our retransmit serves as a zero window probe.
    	 */
    	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
    	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
    		return -EAGAIN;
    
    	if (skb->len > cur_mss) {
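		/* The MSS may have shrunk since this skb was queued; split it
		 * so the retransmitted segment fits. */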
		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
    			return -ENOMEM; /* We'll try again later. */
    	}
    
    	/* Collapse two adjacent packets if worthwhile and we can. */
    	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	   (skb->len < (cur_mss >> 1)) &&
	   (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
	   (!tcp_skb_is_last(sk, skb)) &&
	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
	   (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
    	   (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
    		return -EHOSTUNREACH; /* Routing failure or similar. */
    
    	/* Some Solaris stacks overoptimize and ignore the FIN on a
    	 * retransmit when old data is attached.  So strip it off
    	 * since it is cheap to do so and saves bytes on the network.
    	 */
    	if(skb->len > 0 &&
    	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
    	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
    		if (!pskb_trim(skb, 0)) {
    			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
    
    			skb_shinfo(skb)->gso_segs = 1;
    			skb_shinfo(skb)->gso_size = 0;
    			skb_shinfo(skb)->gso_type = 0;
    			skb->ip_summed = CHECKSUM_NONE;
    			skb->csum = 0;
    		}
    	}
    
    	/* Make a copy, if the first transmission SKB clone we made
    	 * is still in somebody's hands, else make a clone.
    	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);

    	if (err == 0) {
    		/* Update global TCP statistics. */
    		TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
    
    		tp->total_retrans++;
    
    #if FASTRETRANS_DEBUG > 0
    		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
    			if (net_ratelimit())
    				printk(KERN_DEBUG "retrans_out leaked.\n");
    		}
    #endif
    		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
    		tp->retrans_out += tcp_skb_pcount(skb);
    
    		/* Save stamp of the first retransmit. */
    		if (!tp->retrans_stamp)
    			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
    
    		tp->undo_retrans++;
    
    		/* snd_nxt is stored to detect loss of retransmitted segment,
    		 * see tcp_input.c tcp_sacktag_write_queue().
    		 */
    		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
    	}
    	return err;
    }
    
    /* This gets called after a retransmit timeout, and the initially
     * retransmitted data is acknowledged.  It tries to continue
     * resending the rest of the retransmit queue, until either
     * we've sent it all or the congestion window limit is reached.
     * If doing SACK, the first ACK which comes back for a timeout
     * based retransmit packet might feed us FACK information again.
     * If so, we use it to avoid unnecessarily retransmissions.
     */
    void tcp_xmit_retransmit_queue(struct sock *sk)
    {
    
    	const struct inet_connection_sock *icsk = inet_csk(sk);
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb;
    
    	int packet_cnt;
    
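	/* Resume from the cached retransmit hint, if any, so the write queue
	 * is not rescanned from the head on every call. */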
    	if (tp->retransmit_skb_hint) {
    		skb = tp->retransmit_skb_hint;
    		packet_cnt = tp->retransmit_cnt_hint;
	} else {
		skb = tcp_write_queue_head(sk);
		packet_cnt = 0;
	}

	/* First pass: retransmit lost packets. */
	if (tp->lost_out) {
		tcp_for_write_queue_from(skb, sk) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

    			if (skb == tcp_send_head(sk))
    				break;
    
    			/* we could do better than to assign each time */
    			tp->retransmit_skb_hint = skb;
			tp->retransmit_cnt_hint = packet_cnt;

    			/* Assume this retransmit will generate
    			 * only one packet for congestion window
    			 * calculation purposes.  This works because
    			 * tcp_retransmit_skb() will chop up the
    			 * packet to be MSS sized and all the
    			 * packet counting works out.
    			 */
    			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked & TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb)) {
						tp->retransmit_skb_hint = NULL;
						return;
					}
					if (icsk->icsk_ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
					else
						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);

					if (skb == tcp_write_queue_head(sk))
						inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
									  inet_csk(sk)->icsk_rto,
									  TCP_RTO_MAX);
				}

				packet_cnt += tcp_skb_pcount(skb);
				if (packet_cnt >= tp->lost_out)
					break;
    			}
    		}
    	}
    
    	/* OK, demanded retransmission is finished. */
    
	/* Forward retransmissions are possible only during Recovery. */
	if (icsk->icsk_ca_state != TCP_CA_Recovery)
    		return;
    
    	/* No forward retransmissions in Reno are possible. */
    	if (!tp->rx_opt.sack_ok)
    		return;
    
    	/* Yeah, we have to make difficult choice between forward transmission
    	 * and retransmission... Both ways have their merits...
    	 *
    	 * For now we do not retransmit anything, while we have some new
    	 * segments to send.
    	 */
    
    	if (tcp_may_send_now(sk, tp))
		return;

	if (tp->forward_skb_hint) {
		skb = tp->forward_skb_hint;
		packet_cnt = tp->forward_cnt_hint;
	} else {
		skb = tcp_write_queue_head(sk);
		packet_cnt = 0;
	}

    	tcp_for_write_queue_from(skb, sk) {
    		if (skb == tcp_send_head(sk))
    			break;
    
    		tp->forward_cnt_hint = packet_cnt;
		tp->forward_skb_hint = skb;

    		/* Similar to the retransmit loop above we
    		 * can pretend that the retransmitted SKB
    		 * we send out here will be composed of one
    		 * real MSS sized packet because tcp_retransmit_skb()
    		 * will fragment it if necessary.
    		 */
    		if (++packet_cnt > tp->fackets_out)
    			break;
    
    		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
    			break;
    
    		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
    			continue;
    
    		/* Ok, retransmit it. */
    
    		if (tcp_retransmit_skb(sk, skb)) {
			tp->forward_skb_hint = NULL;
			break;
		}

		if (skb == tcp_write_queue_head(sk))
    			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
    						  inet_csk(sk)->icsk_rto,
						  TCP_RTO_MAX);

    		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
    	}
    }
    
    
    /* Send a fin.  The caller locks the socket for us.  This cannot be
     * allowed to fail queueing a FIN frame under any circumstances.
     */
    void tcp_send_fin(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = tcp_write_queue_tail(sk);
	int mss_now;

    	/* Optimization, tack on the FIN if we have a queue of
    	 * unsent frames.  But be careful about outgoing SACKS
    	 * and IP options.
    	 */
	mss_now = tcp_current_mss(sk, 1);

	if (tcp_send_head(sk) != NULL) {
    		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
    		TCP_SKB_CB(skb)->end_seq++;
    		tp->write_seq++;
    	} else {
    		/* Socket is locked, keep trying until memory is available. */
    		for (;;) {