    /*
     * INET		An implementation of the TCP/IP protocol suite for the LINUX
     *		operating system.  INET is implemented using the  BSD Socket
     *		interface as the means of communication with the user level.
     *
     *		Implementation of the Transmission Control Protocol(TCP).
     *
     * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
     *
     * Authors:	Ross Biro
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *		Mark Evans, <evansmp@uhura.aston.ac.uk>
     *		Corey Minyard <wf-rch!minyard@relay.EU.net>
     *		Florian La Roche, <flla@stud.uni-sb.de>
     *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
     *		Linus Torvalds, <torvalds@cs.helsinki.fi>
     *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     *		Matthew Dillon, <dillon@apollo.west.oic.com>
     *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     *		Jorge Cwik, <jorge@laser.satlink.net>
     */
    
    /*
     * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
     *				:	Fragmentation on mtu decrease
     *				:	Segment collapse on retransmit
     *				:	AF independence
     *
     *		Linus Torvalds	:	send_delayed_ack
     *		David S. Miller	:	Charge memory using the right skb
     *					during syn/ack processing.
     *		David S. Miller :	Output engine completely rewritten.
     *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
     *		Cacophonix Gaul :	draft-minshall-nagle-01
     *		J Hadi Salim	:	ECN support
     *
     */
    
    #include <net/tcp.h>
    
    #include <linux/compiler.h>
    #include <linux/module.h>
    #include <linux/smp_lock.h>
    
    /* People can turn this off for buggy TCPs found in printers etc. */
    int sysctl_tcp_retrans_collapse = 1;
    
    /* This limits the percentage of the congestion window which we
     * will allow a single TSO frame to consume.  Building TSO frames
     * which are too large can cause TCP streams to be bursty.
     */
    
    int sysctl_tcp_tso_win_divisor = 3;
    
    static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
    				    struct sk_buff *skb)
    {
    	sk->sk_send_head = skb->next;
    	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
    		sk->sk_send_head = NULL;
    	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
    	tcp_packets_out_inc(sk, tp, skb);
    }
    
    /* SND.NXT, if window was not shrunk.
     * If the window has been shrunk, what should we do? It is not clear at all.
     * Using SND.UNA we will fail to open the window, SND.NXT is out of window. :-(
     * Anything in between SND.UNA...SND.UNA+SND.WND can also already be
     * invalid. OK, let's go with this for now:
     */
    static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
    {
    	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
    		return tp->snd_nxt;
    	else
    		return tp->snd_una+tp->snd_wnd;
    }
    
    /* Calculate mss to advertise in SYN segment.
     * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
     *
     * 1. It is independent of path mtu.
     * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
     * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
     *    attached devices, because some buggy hosts are confused by
     *    large MSS.
     * 4. We do not do 3; we advertise an MSS calculated from the first
     *    hop device MTU, but allow it to be raised to ip_rt_min_advmss.
     *    This may be overridden via information stored in the routing table.
     * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
     *    probably even Jumbo".
     */
    static __u16 tcp_advertise_mss(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct dst_entry *dst = __sk_dst_get(sk);
    	int mss = tp->advmss;
    
    	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
    		mss = dst_metric(dst, RTAX_ADVMSS);
    		tp->advmss = mss;
    	}
    
    	return (__u16)mss;
    }
    
    /* RFC2861. Reset CWND after an idle period longer than RTO to the
     * "restart window". This is the first part of the cwnd validation mechanism. */
    static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
    {
    	s32 delta = tcp_time_stamp - tp->lsndtime;
    	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
    	u32 cwnd = tp->snd_cwnd;
    
    
    	tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
    
    	tp->snd_ssthresh = tcp_current_ssthresh(tp);
    	restart_cwnd = min(restart_cwnd, cwnd);
    
    	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
    		cwnd >>= 1;
    	tp->snd_cwnd = max(cwnd, restart_cwnd);
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    	tp->snd_cwnd_used = 0;
    }
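
    /* Worked example with hypothetical numbers: if snd_cwnd is 16,
     * tcp_init_cwnd() returns 2 and the connection has been idle for
     * 3.5 * RTO, the loop above halves cwnd once for each full RTO of
     * idle time (16 -> 8 -> 4 -> 2) and stops at restart_cwnd, so the
     * connection restarts with snd_cwnd = 2.
     */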
    
    static inline void tcp_event_data_sent(struct tcp_sock *tp,
    				       struct sk_buff *skb, struct sock *sk)
    {
    	u32 now = tcp_time_stamp;
    
    	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
    		tcp_cwnd_restart(tp, __sk_dst_get(sk));
    
    	tp->lsndtime = now;
    
    	/* If this is a reply sent within ato of the last
    	 * received packet, enter pingpong mode.
    	 */
    	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
    		tp->ack.pingpong = 1;
    }
    
    
    static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	tcp_dec_quickack_mode(tp, pkts);
    	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
    }
    
    /* Determine a window scaling and initial window to offer.
     * Based on the assumption that the given amount of space
     * will be offered. Store the results in the tp structure.
     * NOTE: for smooth operation initial space offering should
     * be a multiple of mss if possible. We assume here that mss >= 1.
     * This MUST be enforced by all callers.
     */
    void tcp_select_initial_window(int __space, __u32 mss,
    			       __u32 *rcv_wnd, __u32 *window_clamp,
    			       int wscale_ok, __u8 *rcv_wscale)
    {
    	unsigned int space = (__space < 0 ? 0 : __space);
    
    	/* If no clamp set the clamp to the max possible scaled window */
    	if (*window_clamp == 0)
    		(*window_clamp) = (65535 << 14);
    	space = min(*window_clamp, space);
    
    	/* Quantize space offering to a multiple of mss if possible. */
    	if (space > mss)
    		space = (space / mss) * mss;
    
    	/* NOTE: offering an initial window larger than 32767
    	 * will break some buggy TCP stacks. We try to be nice.
    	 * If we are not window scaling, then this truncates
    	 * our initial window offering to 32k. There should also
    	 * be a sysctl option to stop being nice.
    	 */
    	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
    	(*rcv_wscale) = 0;
    	if (wscale_ok) {
    		/* Set window scaling on max possible window
    		 * See RFC1323 for an explanation of the limit to 14 
    		 */
    		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
    		while (space > 65535 && (*rcv_wscale) < 14) {
    			space >>= 1;
    			(*rcv_wscale)++;
    		}
    	}
    
    	/* Set the initial window to a value large enough for senders
    	 * following RFC2414; senders not following this RFC
    	 * will be satisfied with 2.
    	 */
    	if (mss > (1<<*rcv_wscale)) {
    		int init_cwnd = 4;
    		if (mss > 1460*3)
    			init_cwnd = 2;
    		else if (mss > 1460)
    			init_cwnd = 3;
    		if (*rcv_wnd > init_cwnd*mss)
    			*rcv_wnd = init_cwnd*mss;
    	}
    
    	/* Set the clamp no higher than max representable value */
    	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
    }
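
    /* Worked example with hypothetical numbers: with __space = 262144,
     * mss = 1460 and no pre-set clamp, space is quantized to 261340 and
     * rcv_wnd is first capped at MAX_TCP_WINDOW (32767).  If the rmem
     * limits come to roughly 170 KB, the scaling loop settles on
     * rcv_wscale = 2.  Since mss > (1 << 2), the initial window is then
     * trimmed to 4 * mss = 5840 bytes, and window_clamp ends up at
     * 65535 << 2.
     */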
    
    /* Choose a new window to advertise, update state in tcp_sock for the
     * socket, and return result with RFC1323 scaling applied.  The return
     * value can be stuffed directly into th->window for an outgoing
     * frame.
     */
    static __inline__ u16 tcp_select_window(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 cur_win = tcp_receive_window(tp);
    	u32 new_win = __tcp_select_window(sk);
    
    	/* Never shrink the offered window */
    	if(new_win < cur_win) {
    		/* Danger Will Robinson!
    		 * Don't update rcv_wup/rcv_wnd here or else
    		 * we will not be able to advertise a zero
    		 * window in time.  --DaveM
    		 *
    		 * Relax Will Robinson.
    		 */
    		new_win = cur_win;
    	}
    	tp->rcv_wnd = new_win;
    	tp->rcv_wup = tp->rcv_nxt;
    
    	/* Make sure we do not exceed the maximum possible
    	 * scaled window.
    	 */
    	if (!tp->rx_opt.rcv_wscale)
    		new_win = min(new_win, MAX_TCP_WINDOW);
    	else
    		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
    
    	/* RFC1323 scaling applied */
    	new_win >>= tp->rx_opt.rcv_wscale;
    
    	/* If we advertise zero window, disable fast path. */
    	if (new_win == 0)
    		tp->pred_flags = 0;
    
    	return new_win;
    }
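
    /* Worked example with hypothetical numbers: with rcv_wscale = 2 and a
     * selected window of 25000 bytes, the value placed in th->window is
     * 25000 >> 2 = 6250; the peer shifts it back up by 2 and sees a
     * 25000 byte window.  With rcv_wscale = 0 the window is instead
     * capped at MAX_TCP_WINDOW and sent unscaled.
     */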
    
    
    /* This routine actually transmits TCP packets queued in by
     * tcp_do_sendmsg().  This is used by both the initial
     * transmission and possible later retransmissions.
     * All SKB's seen here are completely headerless.  It is our
     * job to build the TCP header, and pass the packet down to
     * IP so it can do the same plus pass the packet off to the
     * device.
     *
     * We are working here with either a clone of the original
     * SKB, or a fresh unique copy made by the retransmit engine.
     */
    static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
    {
    	if (skb != NULL) {
    		struct inet_sock *inet = inet_sk(sk);
    		struct tcp_sock *tp = tcp_sk(sk);
    		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
    		int tcp_header_size = tp->tcp_header_len;
    		struct tcphdr *th;
    		int sysctl_flags;
    		int err;
    
    		BUG_ON(!tcp_skb_pcount(skb));
    
    #define SYSCTL_FLAG_TSTAMPS	0x1
    #define SYSCTL_FLAG_WSCALE	0x2
    #define SYSCTL_FLAG_SACK	0x4
    
    
    		/* If congestion control is doing timestamping */
    		if (tp->ca_ops->rtt_sample)
    			do_gettimeofday(&skb->stamp);
    
    
    		sysctl_flags = 0;
    		if (tcb->flags & TCPCB_FLAG_SYN) {
    			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
    			if(sysctl_tcp_timestamps) {
    				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
    				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
    			}
    			if(sysctl_tcp_window_scaling) {
    				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
    				sysctl_flags |= SYSCTL_FLAG_WSCALE;
    			}
    			if(sysctl_tcp_sack) {
    				sysctl_flags |= SYSCTL_FLAG_SACK;
    				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
    					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
    			}
    		} else if (tp->rx_opt.eff_sacks) {
    			/* A SACK is 2 pad bytes, a 2 byte header, plus
    			 * 2 32-bit sequence numbers for each SACK block.
    			 */
    			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
    					    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
    		}
    		
    
    		if (tcp_packets_in_flight(tp) == 0)
    			tcp_ca_event(tp, CA_EVENT_TX_START);
    
    		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
    		skb->h.th = th;
    		skb_set_owner_w(skb, sk);
    
    		/* Build TCP header and checksum it. */
    		th->source		= inet->sport;
    		th->dest		= inet->dport;
    		th->seq			= htonl(tcb->seq);
    		th->ack_seq		= htonl(tp->rcv_nxt);
    		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
    		if (tcb->flags & TCPCB_FLAG_SYN) {
    			/* RFC1323: The window in SYN & SYN/ACK segments
    			 * is never scaled.
    			 */
    			th->window	= htons(tp->rcv_wnd);
    		} else {
    			th->window	= htons(tcp_select_window(sk));
    		}
    		th->check		= 0;
    		th->urg_ptr		= 0;
    
    		if (tp->urg_mode &&
    		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
    			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
    			th->urg			= 1;
    		}
    
    		if (tcb->flags & TCPCB_FLAG_SYN) {
    			tcp_syn_build_options((__u32 *)(th + 1),
    					      tcp_advertise_mss(sk),
    					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
    					      (sysctl_flags & SYSCTL_FLAG_SACK),
    					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
    					      tp->rx_opt.rcv_wscale,
    					      tcb->when,
    		      			      tp->rx_opt.ts_recent);
    		} else {
    			tcp_build_and_update_options((__u32 *)(th + 1),
    						     tp, tcb->when);
    
    			TCP_ECN_send(sk, tp, skb, tcp_header_size);
    		}
    		tp->af_specific->send_check(sk, th, skb->len, skb);
    
    		if (tcb->flags & TCPCB_FLAG_ACK)
    			tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

    		if (skb->len != tcp_header_size)
    			tcp_event_data_sent(tp, skb, sk);
    
    		TCP_INC_STATS(TCP_MIB_OUTSEGS);
    
    		err = tp->af_specific->queue_xmit(skb, 0);
    		if (err <= 0)
    			return err;
    
    		tcp_enter_cwr(tp);
    
    		/* NET_XMIT_CN is special. It does not guarantee
    		 * that this packet is lost. It tells us that the device
    		 * is about to start dropping packets, or already
    		 * drops some packets of the same priority, and
    		 * asks us to send less aggressively.
    		 */
    		return err == NET_XMIT_CN ? 0 : err;
    	}
    	return -ENOBUFS;
    #undef SYSCTL_FLAG_TSTAMPS
    #undef SYSCTL_FLAG_WSCALE
    #undef SYSCTL_FLAG_SACK
    }
    
    
    /* This routine just queues the buffer.
     *
     * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
     * otherwise socket can stall.
     */
    static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* Advance write_seq and place onto the write_queue. */
    	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
    	skb_header_release(skb);
    	__skb_queue_tail(&sk->sk_write_queue, skb);
    	sk_charge_skb(sk, skb);
    
    	/* Queue it, remembering where we must start sending. */
    	if (sk->sk_send_head == NULL)
    		sk->sk_send_head = skb;
    }
    
    
    static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
    {
    	if (skb->len <= mss_now ||
    	    !(sk->sk_route_caps & NETIF_F_TSO)) {
    		/* Avoid the costly divide in the normal
    		 * non-TSO case.
    		 */
    		skb_shinfo(skb)->tso_segs = 1;
    		skb_shinfo(skb)->tso_size = 0;
    	} else {
    		unsigned int factor;
    
    
    		factor = skb->len + (mss_now - 1);
    		factor /= mss_now;
    
    		skb_shinfo(skb)->tso_segs = factor;
    
    		skb_shinfo(skb)->tso_size = mss_now;
    
    	}
    }
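
    /* Worked example with hypothetical numbers: on a TSO-capable route with
     * mss_now = 1460, a 7300 byte skb gets factor = (7300 + 1459) / 1460 = 5,
     * i.e. tso_segs = 5 and tso_size = 1460, while a 7301 byte skb rounds up
     * to tso_segs = 6.  An skb no larger than mss_now keeps tso_segs = 1 and
     * skips the divide entirely.
     */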
    
    /* Function to create two new TCP segments.  Shrinks the given segment
     * to the specified size and appends a new segment with the rest of the
     * packet to the list.  This won't be called frequently, I hope. 
     * Remember, these are still headerless SKBs at this point.
     */
    
    static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *buff;
    	int nsize;
    	u16 flags;
    
    	nsize = skb_headlen(skb) - len;
    	if (nsize < 0)
    		nsize = 0;
    
    	if (skb_cloned(skb) &&
    	    skb_is_nonlinear(skb) &&
    	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
    		return -ENOMEM;
    
    	/* Get a new skb... force flag on. */
    	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
    	if (buff == NULL)
    		return -ENOMEM; /* We'll just try again later. */
    	sk_charge_skb(sk, buff);
    
    	/* Correct the sequence numbers. */
    	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
    	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
    
    	/* PSH and FIN should only be set in the second packet. */
    	flags = TCP_SKB_CB(skb)->flags;
    	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
    	TCP_SKB_CB(buff)->flags = flags;
    	TCP_SKB_CB(buff)->sacked =
    		(TCP_SKB_CB(skb)->sacked &
    		 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
    	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
    
    	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
    		/* Copy and checksum data tail into the new buffer. */
    		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
    						       nsize, 0);
    
    		skb_trim(skb, len);
    
    		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
    	} else {
    		skb->ip_summed = CHECKSUM_HW;
    		skb_split(skb, buff, len);
    	}
    
    	buff->ip_summed = skb->ip_summed;
    
    	/* Looks stupid, but our code really uses the 'when' field of
    	 * skbs which it has never sent before. --ANK
    	 */
    	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
    
    	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
    		tp->lost_out -= tcp_skb_pcount(skb);
    		tp->left_out -= tcp_skb_pcount(skb);
    	}
    
    	/* Fix up tso_factor for both original and new SKB.  */
    
    	tcp_set_skb_tso_segs(sk, skb, mss_now);
    	tcp_set_skb_tso_segs(sk, buff, mss_now);
    
    	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
    		tp->lost_out += tcp_skb_pcount(skb);
    		tp->left_out += tcp_skb_pcount(skb);
    	}
    
    	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
    		tp->lost_out += tcp_skb_pcount(buff);
    		tp->left_out += tcp_skb_pcount(buff);
    	}
    
    	/* Link BUFF into the send queue. */
    
    	__skb_append(skb, buff);
    
    	return 0;
    }
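
    /* Worked example with hypothetical numbers: splitting a 3000 byte skb
     * covering sequence space [1000, 4000) at len = 1460 leaves the original
     * skb with [1000, 2460) and gives BUFF [2460, 4000); PSH/FIN migrate to
     * BUFF, and both skbs get their tso_segs recomputed against mss_now.
     */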
    
    /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
     * eventually). The difference is that the pulled data is not copied,
     * but immediately discarded.
     */
    static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
    {
    	int i, k, eat;
    
    	eat = len;
    	k = 0;
    	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
    		if (skb_shinfo(skb)->frags[i].size <= eat) {
    			put_page(skb_shinfo(skb)->frags[i].page);
    			eat -= skb_shinfo(skb)->frags[i].size;
    		} else {
    			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
    			if (eat) {
    				skb_shinfo(skb)->frags[k].page_offset += eat;
    				skb_shinfo(skb)->frags[k].size -= eat;
    				eat = 0;
    			}
    			k++;
    		}
    	}
    	skb_shinfo(skb)->nr_frags = k;
    
    	skb->tail = skb->data;
    	skb->data_len -= len;
    	skb->len = skb->data_len;
    	return skb->tail;
    }
    
    int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    {
    	if (skb_cloned(skb) &&
    	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
    		return -ENOMEM;
    
    	if (len <= skb_headlen(skb)) {
    		__skb_pull(skb, len);
    	} else {
    		if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
    			return -ENOMEM;
    	}
    
    	TCP_SKB_CB(skb)->seq += len;
    	skb->ip_summed = CHECKSUM_HW;
    
    	skb->truesize	     -= len;
    	sk->sk_wmem_queued   -= len;
    	sk->sk_forward_alloc += len;
    	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
    
    	/* Any change of skb->len requires recalculation of tso
    	 * factor and mss.
    	 */
    	if (tcp_skb_pcount(skb) > 1)
    
    		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
    
    	return 0;
    }
    
    /* This function synchronizes snd mss to current pmtu/exthdr set.
    
       tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
       account for TCP options, but includes only the bare TCP header.
    
       tp->rx_opt.mss_clamp is mss negotiated at connection setup.
       It is the minimum of user_mss and mss received with SYN.
       It also does not include TCP options.
    
       tp->pmtu_cookie is last pmtu, seen by this function.
    
       tp->mss_cache is current effective sending mss, including
       all tcp options except for SACKs. It is evaluated,
       taking into account current pmtu, but never exceeds
       tp->rx_opt.mss_clamp.
    
       NOTE1. rfc1122 clearly states that advertised MSS
       DOES NOT include either tcp or ip options.
    
       NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
       this function.			--ANK (980731)
     */
    
    unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int mss_now;
    
    	/* Calculate base mss without TCP options:
    	   It is MMS_S - sizeof(tcphdr) of rfc1122
    	 */
    	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
    
    	/* Clamp it (mss_clamp does not include tcp options) */
    	if (mss_now > tp->rx_opt.mss_clamp)
    		mss_now = tp->rx_opt.mss_clamp;
    
    	/* Now subtract optional transport overhead */
    	mss_now -= tp->ext_header_len;
    
    	/* Then reserve room for full set of TCP options and 8 bytes of data */
    	if (mss_now < 48)
    		mss_now = 48;
    
    	/* Now subtract TCP options size, not including SACKs */
    	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
    
    	/* Bound mss with half of window */
    	if (tp->max_window && mss_now > (tp->max_window>>1))
    		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
    
    	/* And store cached results */
    	tp->pmtu_cookie = pmtu;
    
    	tp->mss_cache = mss_now;
    
    	return mss_now;
    }
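
    /* Worked example with hypothetical numbers: for IPv4 with pmtu = 1500,
     * mss_now starts at 1500 - 20 - 20 = 1460.  With no extension headers,
     * an mss_clamp of at least 1460 and timestamps enabled
     * (tcp_header_len = 20 + 12), the option subtraction leaves
     * mss_cache = 1448, which is what a full-sized data segment will
     * actually carry.
     */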
    
    /* Compute the current effective MSS, taking SACKs and IP options,
     * and even PMTU discovery events into account.
     *
     * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
     * cannot be large. However, taking into account rare use of URG, this
     * is not a big flaw.
     */
    
    unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct dst_entry *dst = __sk_dst_get(sk);
    
    	u32 mss_now;
    	u16 xmit_size_goal;
    	int doing_tso = 0;
    
    	mss_now = tp->mss_cache;
    
    	if (large_allowed &&
    	    (sk->sk_route_caps & NETIF_F_TSO) &&
    	    !tp->urg_mode)
    		doing_tso = 1;
    
    	if (dst) {
    		u32 mtu = dst_mtu(dst);
    		if (mtu != tp->pmtu_cookie)
    			mss_now = tcp_sync_mss(sk, mtu);
    	}
    
    
    	if (tp->rx_opt.eff_sacks)
    		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
    			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
    
    	xmit_size_goal = mss_now;
    
    	if (doing_tso) {
    		xmit_size_goal = 65535 -
    			tp->af_specific->net_header_len -
    			tp->ext_header_len - tp->tcp_header_len;
    
    
    		if (tp->max_window &&
    		    (xmit_size_goal > (tp->max_window >> 1)))
    			xmit_size_goal = max((tp->max_window >> 1),
    					     68U - tp->tcp_header_len);
    
    		xmit_size_goal -= (xmit_size_goal % mss_now);
    
    	}
    
    	tp->xmit_size_goal = xmit_size_goal;
    
    	return mss_now;
    }
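
    /* Worked example with hypothetical numbers: with TSO in use, IPv4 and
     * timestamps (mss_now = 1448), xmit_size_goal starts at
     * 65535 - 20 - 32 = 65483 and is then rounded down to a multiple of
     * mss_now, i.e. 45 * 1448 = 65160 bytes per super-packet, subject to
     * the max_window clamp above.
     */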
    
    
    /* Congestion window validation. (RFC2861) */
    
    static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
    {
    	__u32 packets_out = tp->packets_out;
    
    	if (packets_out >= tp->snd_cwnd) {
    		/* Network is fed fully. */
    		tp->snd_cwnd_used = 0;
    		tp->snd_cwnd_stamp = tcp_time_stamp;
    	} else {
    		/* Network starves. */
    		if (tp->packets_out > tp->snd_cwnd_used)
    			tp->snd_cwnd_used = tp->packets_out;
    
    		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
    			tcp_cwnd_application_limited(sk);
    	}
    }
    
    
    static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
    {
    	u32 window, cwnd_len;
    
    	window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
    	cwnd_len = mss_now * cwnd;
    	return min(window, cwnd_len);
    }
    
    /* Can at least one segment of SKB be sent right now, according to the
     * congestion window rules?  If so, return how many segments are allowed.
     */
    static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
    {
    	u32 in_flight, cwnd;
    
    	/* Don't be strict about the congestion window for the final FIN.  */
    	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
    		return 1;
    
    	in_flight = tcp_packets_in_flight(tp);
    	cwnd = tp->snd_cwnd;
    	if (in_flight < cwnd)
    		return (cwnd - in_flight);
    
    	return 0;
    }
    
    /* This must be invoked the first time we consider transmitting
     * SKB onto the wire.
     */
    
    static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
    {
    	int tso_segs = tcp_skb_pcount(skb);
    
    
    	if (!tso_segs ||
    	    (tso_segs > 1 &&
    	     skb_shinfo(skb)->tso_size != mss_now)) {
    		tcp_set_skb_tso_segs(sk, skb, mss_now);
    
    		tso_segs = tcp_skb_pcount(skb);
    	}
    	return tso_segs;
    }
    
    static inline int tcp_minshall_check(const struct tcp_sock *tp)
    {
    	return after(tp->snd_sml,tp->snd_una) &&
    		!after(tp->snd_sml, tp->snd_nxt);
    }
    
    /* Return 0 if the packet can be sent now without violating Nagle's rules:
     * 1. It is full sized.
     * 2. Or it contains FIN. (already checked by caller)
     * 3. Or TCP_NODELAY was set.
     * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
     *    With Minshall's modification: all sent small packets are ACKed.
     */
    
    static inline int tcp_nagle_check(const struct tcp_sock *tp,
    				  const struct sk_buff *skb, 
    				  unsigned mss_now, int nonagle)
    {
    	return (skb->len < mss_now &&
    		((nonagle&TCP_NAGLE_CORK) ||
    		 (!nonagle &&
    		  tp->packets_out &&
    		  tcp_minshall_check(tp))));
    }
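
    /* Worked example with hypothetical numbers: with Nagle enabled
     * (nonagle == 0), mss_now = 1448 and a 100 byte skb, the check above
     * returns true (blocking the send) while packets are still in flight
     * and a previously sent sub-mss segment remains unacknowledged; once
     * that small segment is ACKed, or TCP_NODELAY is set, the 100 byte
     * skb may go out immediately.
     */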
    
    /* Return non-zero if the Nagle test allows this packet to be
     * sent now.
     */
    static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
    				 unsigned int cur_mss, int nonagle)
    {
    	/* The Nagle rule does not apply to frames which sit in the middle of the
    	 * write_queue (they have no chance to get new data).
    	 *
    	 * This is implemented in the callers, where they modify the 'nonagle'
    	 * argument based upon the location of SKB in the send queue.
    	 */
    	if (nonagle & TCP_NAGLE_PUSH)
    		return 1;
    
    	/* Don't use the nagle rule for urgent data (or for the final FIN).  */
    	if (tp->urg_mode ||
    	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
    		return 1;
    
    	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
    		return 1;
    
    	return 0;
    }
    
    /* Does at least the first segment of SKB fit into the send window? */
    static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
    {
    	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    
    	if (skb->len > cur_mss)
    		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
    
    	return !after(end_seq, tp->snd_una + tp->snd_wnd);
    }
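
    /* Worked example with hypothetical numbers: if snd_una = 1000 and
     * snd_wnd = 8000, the usable window ends at sequence 9000.  A 2896 byte
     * skb starting at 7500 does not fit in full, but its first
     * cur_mss = 1448 bytes end at 8948, so the test still passes and at
     * least one segment may be sent.
     */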
    
    /* This checks if the data bearing packet SKB (usually sk->sk_send_head)
     * should be put on the wire right now.  If so, it returns the number of
     * packets allowed by the congestion window.
     */
    static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
    				 unsigned int cur_mss, int nonagle)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	unsigned int cwnd_quota;
    
    
    	tcp_init_tso_segs(sk, skb, cur_mss);
    
    
    	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
    		return 0;
    
    	cwnd_quota = tcp_cwnd_test(tp, skb);
    	if (cwnd_quota &&
    	    !tcp_snd_wnd_test(tp, skb, cur_mss))
    		cwnd_quota = 0;
    
    	return cwnd_quota;
    }
    
    static inline int tcp_skb_is_last(const struct sock *sk, 
    				  const struct sk_buff *skb)
    {
    	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
    }
    
    int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
    {
    	struct sk_buff *skb = sk->sk_send_head;
    
    	return (skb &&
    		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
    			     (tcp_skb_is_last(sk, skb) ?
    			      TCP_NAGLE_PUSH :
    			      tp->nonagle)));
    }
    
    /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
     * which is put after SKB on the list.  It is very much like
     * tcp_fragment() except that it may make several kinds of assumptions
     * in order to speed up the splitting operation.  In particular, we
     * know that all the data is in scatter-gather pages, and that the
     * packet has never been sent out before (and thus is not cloned).
     */
    
    static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
    {
    	struct sk_buff *buff;
    	int nlen = skb->len - len;
    	u16 flags;
    
    	/* All of a TSO frame must be composed of paged data.  */
    	BUG_ON(skb->len != skb->data_len);
    
    	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
    	if (unlikely(buff == NULL))
    		return -ENOMEM;
    
    	buff->truesize = nlen;
    	skb->truesize -= nlen;
    
    	/* Correct the sequence numbers. */
    	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
    	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
    
    	/* PSH and FIN should only be set in the second packet. */
    	flags = TCP_SKB_CB(skb)->flags;
    	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
    	TCP_SKB_CB(buff)->flags = flags;
    
    	/* This packet was never sent out yet, so no SACK bits. */
    	TCP_SKB_CB(buff)->sacked = 0;
    
    	buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
    	skb_split(skb, buff, len);
    
    	/* Fix up tso_factor for both original and new SKB.  */
    
    	tcp_set_skb_tso_segs(sk, skb, mss_now);
    	tcp_set_skb_tso_segs(sk, buff, mss_now);
    
    
    	/* Link BUFF into the send queue. */
    	skb_header_release(buff);
    	__skb_append(skb, buff);
    
    	return 0;
    }
    
    /* Try to defer sending, if possible, in order to minimize the amount
     * of TSO splitting we do.  View it as a kind of TSO Nagle test.
     *
     * This algorithm is from John Heffner.
     */
    static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
    {
    	u32 send_win, cong_win, limit, in_flight;
    
    	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
    		return 0;
    
    
    	if (tp->ca_state != TCP_CA_Open)
    		return 0;
    
    
    	in_flight = tcp_packets_in_flight(tp);
    
    	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
    	       (tp->snd_cwnd <= in_flight));
    
    	send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
    
    	/* From in_flight test above, we know that cwnd > in_flight.  */
    	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
    
    	limit = min(send_win, cong_win);
    
    	/* If sk_send_head can be sent fully now, just do it.  */
    	if (skb->len <= limit)
    		return 0;
    
    	if (sysctl_tcp_tso_win_divisor) {
    		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
    
    		/* If at least some fraction of a window is available,
    		 * just use it.
    		 */
    		chunk /= sysctl_tcp_tso_win_divisor;
    		if (limit >= chunk)
    			return 0;
    	} else {
    		/* Different approach, try not to defer past a single
    		 * ACK.  Receiver should ACK every other full sized
    		 * frame, so if we have space for more than 3 frames
    		 * then send now.
    		 */
    		if (limit > tcp_max_burst(tp) * tp->mss_cache)
    			return 0;
    	}
    
    	/* Ok, it looks like it is advisable to defer.  */
    	return 1;
    }
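
    /* Worked example with hypothetical numbers: with mss_cache = 1448,
     * snd_cwnd = 20 and 12 packets in flight, cong_win = 8 * 1448 = 11584.
     * With the default tso_win_divisor of 3 and a large snd_wnd,
     * chunk = (20 * 1448) / 3 = 9653, so a 40000 byte skb is sent now
     * rather than deferred because limit >= chunk; with less than a third
     * of the window available we would defer and wait for more ACKs to
     * build a larger TSO frame.
     */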
    
    
    /* This routine writes packets to the network.  It advances the
     * send_head.  This happens as incoming acks open up the remote
     * window for us.
     *
     * Returns 1, if no segments are in flight and we have queued segments, but
     * cannot send anything now because of SWS or another problem.
     */
    
    static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	struct sk_buff *skb;
    
    	unsigned int tso_segs, sent_pkts;
    	int cwnd_quota;
    
    	/* If we are closed, the bytes will have to remain here.
    	 * In time closedown will finish, we empty the write queue and all
    	 * will be happy.
    	 */
    
    	if (unlikely(sk->sk_state == TCP_CLOSE))
    		return 0;
    
    	skb = sk->sk_send_head;
    	if (unlikely(!skb))
    		return 0;
    
    
    	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
    
    	cwnd_quota = tcp_cwnd_test(tp, skb);
    
    	if (unlikely(!cwnd_quota))
    		goto out;
    
    
    	sent_pkts = 0;
    
    	while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
    		BUG_ON(!tso_segs);
    
    		if (tso_segs == 1) {
    			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
    						     (tcp_skb_is_last(sk, skb) ?
    						      nonagle : TCP_NAGLE_PUSH))))
    				break;
    		} else {
    			if (tcp_tso_should_defer(sk, tp, skb))
    				break;
    		}
    
    		if (tso_segs > 1) {
    			u32 limit = tcp_window_allows(tp, skb,
    						      mss_now, cwnd_quota);