Skip to content
Snippets Groups Projects
tcp_output.c 73.2 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * INET		An implementation of the TCP/IP protocol suite for the LINUX
     *		operating system.  INET is implemented using the  BSD Socket
     *		interface as the means of communication with the user level.
     *
     *		Implementation of the Transmission Control Protocol(TCP).
     *
     * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
     *
    
     * Authors:	Ross Biro
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *		Mark Evans, <evansmp@uhura.aston.ac.uk>
     *		Corey Minyard <wf-rch!minyard@relay.EU.net>
     *		Florian La Roche, <flla@stud.uni-sb.de>
     *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
     *		Linus Torvalds, <torvalds@cs.helsinki.fi>
     *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     *		Matthew Dillon, <dillon@apollo.west.oic.com>
     *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     *		Jorge Cwik, <jorge@laser.satlink.net>
     */
    
    /*
     * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
     *				:	Fragmentation on mtu decrease
     *				:	Segment collapse on retransmit
     *				:	AF independence
     *
     *		Linus Torvalds	:	send_delayed_ack
     *		David S. Miller	:	Charge memory using the right skb
     *					during syn/ack processing.
     *		David S. Miller :	Output engine completely rewritten.
     *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
     *		Cacophonix Gaul :	draft-minshall-nagle-01
     *		J Hadi Salim	:	ECN support
     *
     */
    
    #include <net/tcp.h>
    
    #include <linux/compiler.h>
    #include <linux/module.h>
    
    /* People can turn this off for buggy TCP's found in printers etc. */
    
    int sysctl_tcp_retrans_collapse __read_mostly = 1;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    /* People can turn this on to  work with those rare, broken TCPs that
     * interpret the window field as a signed quantity.
     */
    
    int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* This limits the percentage of the congestion window which we
     * will allow a single TSO frame to consume.  Building TSO frames
     * which are too large can cause TCP streams to be bursty.
     */
    
    int sysctl_tcp_tso_win_divisor __read_mostly = 3;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    int sysctl_tcp_mtu_probing __read_mostly = 0;
    int sysctl_tcp_base_mss __read_mostly = 512;
    
    John Heffner's avatar
    John Heffner committed
    
    
    /* By default, RFC2861 behavior.  */
    
    int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
    
    static void update_send_head(struct sock *sk, struct sk_buff *skb)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    	tcp_advance_send_head(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
    
    	tcp_packets_out_inc(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* SND.NXT, if window was not shrunk.
     * If window has been shrunk, what should we make? It is not clear at all.
     * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
     * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
     * invalid. OK, let's make this for now:
     */
    
    static inline __u32 tcp_acceptable_seq(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
    		return tp->snd_nxt;
    	else
    		return tp->snd_una+tp->snd_wnd;
    }
    
    /* Calculate mss to advertise in SYN segment.
     * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
     *
     * 1. It is independent of path mtu.
     * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
     * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
     *    attached devices, because some buggy hosts are confused by
     *    large MSS.
     * 4. We do not make 3, we advertise MSS, calculated from first
     *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
     *    This may be overridden via information stored in routing table.
     * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
     *    probably even Jumbo".
     */
    static __u16 tcp_advertise_mss(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct dst_entry *dst = __sk_dst_get(sk);
    	int mss = tp->advmss;
    
    	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
    		mss = dst_metric(dst, RTAX_ADVMSS);
    		tp->advmss = mss;
    	}
    
    	return (__u16)mss;
    }
    
    /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
     * This is the first part of cwnd validation mechanism. */
    
    static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	s32 delta = tcp_time_stamp - tp->lsndtime;
    	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
    	u32 cwnd = tp->snd_cwnd;
    
    
    	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	tp->snd_ssthresh = tcp_current_ssthresh(sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	restart_cwnd = min(restart_cwnd, cwnd);
    
    
    	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		cwnd >>= 1;
    	tp->snd_cwnd = max(cwnd, restart_cwnd);
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    	tp->snd_cwnd_used = 0;
    }
    
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static void tcp_event_data_sent(struct tcp_sock *tp,
    				struct sk_buff *skb, struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    	const u32 now = tcp_time_stamp;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (sysctl_tcp_slow_start_after_idle &&
    	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
    
    		tcp_cwnd_restart(sk, __sk_dst_get(sk));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	tp->lsndtime = now;
    
    	/* If it is a reply for ato after last received
    	 * packet, enter pingpong mode.
    	 */
    
    	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
    		icsk->icsk_ack.pingpong = 1;
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	tcp_dec_quickack_mode(sk, pkts);
    	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /* Determine a window scaling and initial window to offer.
     * Based on the assumption that the given amount of space
     * will be offered. Store the results in the tp structure.
     * NOTE: for smooth operation initial space offering should
     * be a multiple of mss if possible. We assume here that mss >= 1.
     * This MUST be enforced by all callers.
     */
    void tcp_select_initial_window(int __space, __u32 mss,
    			       __u32 *rcv_wnd, __u32 *window_clamp,
    			       int wscale_ok, __u8 *rcv_wscale)
    {
    	unsigned int space = (__space < 0 ? 0 : __space);
    
    	/* If no clamp set the clamp to the max possible scaled window */
    	if (*window_clamp == 0)
    		(*window_clamp) = (65535 << 14);
    	space = min(*window_clamp, space);
    
    	/* Quantize space offering to a multiple of mss if possible. */
    	if (space > mss)
    		space = (space / mss) * mss;
    
    	/* NOTE: offering an initial window larger than 32767
    
    	 * will break some buggy TCP stacks. If the admin tells us
    	 * it is likely we could be speaking with such a buggy stack
    	 * we will truncate our initial window offering to 32K-1
    	 * unless the remote has sent us a window scaling option,
    	 * which we interpret as a sign the remote TCP is not
    	 * misinterpreting the window field as a signed quantity.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 */
    
    	if (sysctl_tcp_workaround_signed_windows)
    		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
    	else
    		(*rcv_wnd) = space;
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	(*rcv_wscale) = 0;
    	if (wscale_ok) {
    		/* Set window scaling on max possible window
    
    		 * See RFC1323 for an explanation of the limit to 14
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		 */
    		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
    
    		space = min_t(u32, space, *window_clamp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		while (space > 65535 && (*rcv_wscale) < 14) {
    			space >>= 1;
    			(*rcv_wscale)++;
    		}
    	}
    
    	/* Set initial window to value enough for senders,
    
    	 * following RFC2414. Senders, not following this RFC,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 * will be satisfied with 2.
    	 */
    	if (mss > (1<<*rcv_wscale)) {
    
    		int init_cwnd = 4;
    		if (mss > 1460*3)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			init_cwnd = 2;
    
    		else if (mss > 1460)
    			init_cwnd = 3;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (*rcv_wnd > init_cwnd*mss)
    			*rcv_wnd = init_cwnd*mss;
    	}
    
    	/* Set the clamp no higher than max representable value */
    	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
    }
    
    /* Chose a new window to advertise, update state in tcp_sock for the
     * socket, and return result with RFC1323 scaling applied.  The return
     * value can be stuffed directly into th->window for an outgoing
     * frame.
     */
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    static u16 tcp_select_window(struct sock *sk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 cur_win = tcp_receive_window(tp);
    	u32 new_win = __tcp_select_window(sk);
    
    	/* Never shrink the offered window */
    
    	if (new_win < cur_win) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Danger Will Robinson!
    		 * Don't update rcv_wup/rcv_wnd here or else
    		 * we will not be able to advertise a zero
    		 * window in time.  --DaveM
    		 *
    		 * Relax Will Robinson.
    		 */
    		new_win = cur_win;
    	}
    	tp->rcv_wnd = new_win;
    	tp->rcv_wup = tp->rcv_nxt;
    
    	/* Make sure we do not exceed the maximum possible
    	 * scaled window.
    	 */
    
    	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		new_win = min(new_win, MAX_TCP_WINDOW);
    	else
    		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
    
    	/* RFC1323 scaling applied */
    	new_win >>= tp->rx_opt.rcv_wscale;
    
    	/* If we advertise zero window, disable fast path. */
    	if (new_win == 0)
    		tp->pred_flags = 0;
    
    	return new_win;
    }
    
    
    static inline void TCP_ECN_send_synack(struct tcp_sock *tp,
    				       struct sk_buff *skb)
    {
    	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
    	if (!(tp->ecn_flags&TCP_ECN_OK))
    		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
    }
    
    static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	tp->ecn_flags = 0;
    	if (sysctl_tcp_ecn) {
    		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
    		tp->ecn_flags = TCP_ECN_OK;
    	}
    }
    
    static __inline__ void
    TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
    {
    	if (inet_rsk(req)->ecn_ok)
    		th->ece = 1;
    }
    
    static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
    				int tcp_header_len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (tp->ecn_flags & TCP_ECN_OK) {
    		/* Not-retransmitted data segment: set ECT and inject CWR. */
    		if (skb->len != tcp_header_len &&
    		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
    			INET_ECN_xmit(sk);
    			if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
    				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
    				tcp_hdr(skb)->cwr = 1;
    				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
    			}
    		} else {
    			/* ACK or retransmitted segment: clear ECT|CE */
    			INET_ECN_dontxmit(sk);
    		}
    		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
    			tcp_hdr(skb)->ece = 1;
    	}
    }
    
    
    static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
    
    					 __u32 tstamp, __u8 **md5_hash)
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    {
    	if (tp->rx_opt.tstamp_ok) {
    
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_NOP << 16) |
    			       (TCPOPT_TIMESTAMP << 8) |
    			       TCPOLEN_TIMESTAMP);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		*ptr++ = htonl(tstamp);
    		*ptr++ = htonl(tp->rx_opt.ts_recent);
    	}
    	if (tp->rx_opt.eff_sacks) {
    		struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
    		int this_sack;
    
    		*ptr++ = htonl((TCPOPT_NOP  << 24) |
    			       (TCPOPT_NOP  << 16) |
    			       (TCPOPT_SACK <<  8) |
    			       (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
    						     TCPOLEN_SACK_PERBLOCK)));
    
    
    		for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			*ptr++ = htonl(sp[this_sack].start_seq);
    			*ptr++ = htonl(sp[this_sack].end_seq);
    		}
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		if (tp->rx_opt.dsack) {
    			tp->rx_opt.dsack = 0;
    			tp->rx_opt.eff_sacks--;
    		}
    	}
    
    #ifdef CONFIG_TCP_MD5SIG
    	if (md5_hash) {
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_NOP << 16) |
    			       (TCPOPT_MD5SIG << 8) |
    			       TCPOLEN_MD5SIG);
    		*md5_hash = (__u8 *)ptr;
    	}
    #endif
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    }
    
    /* Construct a tcp options header for a SYN or SYN_ACK packet.
     * If this is every changed make sure to change the definition of
     * MAX_SYN_SIZE to match the new maximum number of options that you
     * can generate.
    
     *
     * Note - that with the RFC2385 TCP option, we make room for the
     * 16 byte MD5 hash. This will be filled in later, so the pointer for the
     * location to be filled is passed back up.
    
    static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    				  int offer_wscale, int wscale, __u32 tstamp,
    
    				  __u32 ts_recent, __u8 **md5_hash)
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    {
    	/* We always get an MSS option.
    	 * The option bytes which will be seen in normal data
    	 * packets should timestamps be used, must be in the MSS
    	 * advertised.  But we subtract them from tp->mss_cache so
    	 * that calculations in tcp_sendmsg are simpler etc.
    	 * So account for this fact here if necessary.  If we
    	 * don't do this correctly, as a receiver we won't
    	 * recognize data packets as being full sized when we
    	 * should, and thus we won't abide by the delayed ACK
    	 * rules correctly.
    	 * SACKs don't matter, we never delay an ACK when we
    	 * have any of those going out.
    	 */
    	*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
    	if (ts) {
    
    		if (sack)
    
    			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
    				       (TCPOLEN_SACK_PERM << 16) |
    				       (TCPOPT_TIMESTAMP << 8) |
    				       TCPOLEN_TIMESTAMP);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		else
    
    			*ptr++ = htonl((TCPOPT_NOP << 24) |
    				       (TCPOPT_NOP << 16) |
    				       (TCPOPT_TIMESTAMP << 8) |
    				       TCPOLEN_TIMESTAMP);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		*ptr++ = htonl(tstamp);		/* TSVAL */
    		*ptr++ = htonl(ts_recent);	/* TSECR */
    
    	} else if (sack)
    
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_NOP << 16) |
    			       (TCPOPT_SACK_PERM << 8) |
    			       TCPOLEN_SACK_PERM);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	if (offer_wscale)
    
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_WINDOW << 16) |
    			       (TCPOLEN_WINDOW << 8) |
    			       (wscale));
    
    #ifdef CONFIG_TCP_MD5SIG
    	/*
    	 * If MD5 is enabled, then we set the option, and include the size
    	 * (always 18). The actual MD5 hash is added just before the
    	 * packet is sent.
    	 */
    	if (md5_hash) {
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_NOP << 16) |
    			       (TCPOPT_MD5SIG << 8) |
    			       TCPOLEN_MD5SIG);
    		*md5_hash = (__u8 *) ptr;
    	}
    #endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* This routine actually transmits TCP packets queued in by
     * tcp_do_sendmsg().  This is used by both the initial
     * transmission and possible later retransmissions.
     * All SKB's seen here are completely headerless.  It is our
     * job to build the TCP header, and pass the packet down to
     * IP so it can do the same plus pass the packet off to the
     * device.
     *
     * We are working here with either a clone of the original
     * SKB, or a fresh unique copy made by the retransmit engine.
     */
    
    static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	const struct inet_connection_sock *icsk = inet_csk(sk);
    	struct inet_sock *inet;
    	struct tcp_sock *tp;
    	struct tcp_skb_cb *tcb;
    	int tcp_header_size;
    
    #ifdef CONFIG_TCP_MD5SIG
    	struct tcp_md5sig_key *md5;
    	__u8 *md5_hash_location;
    #endif
    
    	struct tcphdr *th;
    	int sysctl_flags;
    	int err;
    
    	BUG_ON(!skb || !tcp_skb_pcount(skb));
    
    	/* If congestion control is doing timestamping, we must
    	 * take such a timestamp before we potentially clone/copy.
    	 */
    
    	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
    
    		__net_timestamp(skb);
    
    	if (likely(clone_it)) {
    		if (unlikely(skb_cloned(skb)))
    			skb = pskb_copy(skb, gfp_mask);
    		else
    			skb = skb_clone(skb, gfp_mask);
    		if (unlikely(!skb))
    			return -ENOBUFS;
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	inet = inet_sk(sk);
    	tp = tcp_sk(sk);
    	tcb = TCP_SKB_CB(skb);
    	tcp_header_size = tp->tcp_header_len;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #define SYSCTL_FLAG_TSTAMPS	0x1
    #define SYSCTL_FLAG_WSCALE	0x2
    #define SYSCTL_FLAG_SACK	0x4
    
    
    	sysctl_flags = 0;
    	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
    		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
    
    		if (sysctl_tcp_timestamps) {
    
    			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
    			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    		if (sysctl_tcp_window_scaling) {
    			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
    			sysctl_flags |= SYSCTL_FLAG_WSCALE;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    		if (sysctl_tcp_sack) {
    			sysctl_flags |= SYSCTL_FLAG_SACK;
    			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
    				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    	} else if (unlikely(tp->rx_opt.eff_sacks)) {
    		/* A SACK is 2 pad bytes, a 2 byte header, plus
    		 * 2 32-bit sequence numbers for each SACK block.
    		 */
    		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
    				    (tp->rx_opt.eff_sacks *
    				     TCPOLEN_SACK_PERBLOCK));
    	}
    
    	if (tcp_packets_in_flight(tp) == 0)
    		tcp_ca_event(sk, CA_EVENT_TX_START);
    
    
    #ifdef CONFIG_TCP_MD5SIG
    	/*
    	 * Are we doing MD5 on this segment? If so - make
    	 * room for it.
    	 */
    	md5 = tp->af_specific->md5_lookup(sk, sk);
    	if (md5)
    		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
    #endif
    
    
    	skb_push(skb, tcp_header_size);
    	skb_reset_transport_header(skb);
    
    
    	/* Build TCP header and checksum it. */
    
    	th->source		= inet->sport;
    	th->dest		= inet->dport;
    	th->seq			= htonl(tcb->seq);
    	th->ack_seq		= htonl(tp->rcv_nxt);
    
    	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
    
    					tcb->flags);
    
    	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
    		/* RFC1323: The window in SYN & SYN/ACK segments
    		 * is never scaled.
    		 */
    
    		th->window	= htons(min(tp->rcv_wnd, 65535U));
    
    	} else {
    		th->window	= htons(tcp_select_window(sk));
    	}
    	th->check		= 0;
    	th->urg_ptr		= 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (unlikely(tp->urg_mode &&
    		     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
    		th->urg_ptr		= htons(tp->snd_up-tcb->seq);
    		th->urg			= 1;
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
    
    		tcp_syn_build_options((__be32 *)(th + 1),
    
    				      tcp_advertise_mss(sk),
    				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
    				      (sysctl_flags & SYSCTL_FLAG_SACK),
    				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
    				      tp->rx_opt.rcv_wscale,
    				      tcb->when,
    
    				      tp->rx_opt.ts_recent,
    
    #ifdef CONFIG_TCP_MD5SIG
    				      md5 ? &md5_hash_location :
    #endif
    				      NULL);
    
    		tcp_build_and_update_options((__be32 *)(th + 1),
    
    					     tp, tcb->when,
    #ifdef CONFIG_TCP_MD5SIG
    					     md5 ? &md5_hash_location :
    #endif
    					     NULL);
    
    		TCP_ECN_send(sk, skb, tcp_header_size);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    #ifdef CONFIG_TCP_MD5SIG
    	/* Calculate the MD5 hash, as we have all we need now */
    	if (md5) {
    		tp->af_specific->calc_md5_hash(md5_hash_location,
    					       md5,
    					       sk, NULL, NULL,
    
    	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (likely(tcb->flags & TCPCB_FLAG_ACK))
    		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (skb->len != tcp_header_size)
    		tcp_event_data_sent(tp, skb, sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
    		TCP_INC_STATS(TCP_MIB_OUTSEGS);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
    
    	if (likely(err <= 0))
    
    	tcp_enter_cwr(sk, 1);
    
    	return net_xmit_eval(err);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #undef SYSCTL_FLAG_TSTAMPS
    #undef SYSCTL_FLAG_WSCALE
    #undef SYSCTL_FLAG_SACK
    }
    
    
    
    /* This routine just queue's the buffer
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *
     * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
     * otherwise socket can stall.
     */
    static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* Advance write_seq and place onto the write_queue. */
    	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
    	skb_header_release(skb);
    
    	tcp_add_write_queue_tail(sk, skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	sk_charge_skb(sk, skb);
    }
    
    
    static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
    
    	if (skb->len <= mss_now || !sk_can_gso(sk)) {
    
    		/* Avoid the costly divide in the normal
    		 * non-TSO case.
    		 */
    
    		skb_shinfo(skb)->gso_segs = 1;
    		skb_shinfo(skb)->gso_size = 0;
    		skb_shinfo(skb)->gso_type = 0;
    
    	} else {
    		unsigned int factor;
    
    
    		factor = skb->len + (mss_now - 1);
    		factor /= mss_now;
    
    		skb_shinfo(skb)->gso_segs = factor;
    		skb_shinfo(skb)->gso_size = mss_now;
    
    		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    }
    
    /* Function to create two new TCP segments.  Shrinks the given segment
     * to the specified size and appends a new segment with the rest of the
    
     * packet to the list.  This won't be called frequently, I hope.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     * Remember, these are still headerless SKBs at this point.
     */
    
    int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *buff;
    
    	int nsize, old_factor;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	u16 flags;
    
    
    	BUG_ON(len > skb->len);
    
    	clear_all_retrans_hints(tp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	nsize = skb_headlen(skb) - len;
    	if (nsize < 0)
    		nsize = 0;
    
    	if (skb_cloned(skb) &&
    	    skb_is_nonlinear(skb) &&
    	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
    		return -ENOMEM;
    
    	/* Get a new skb... force flag on. */
    	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
    	if (buff == NULL)
    		return -ENOMEM; /* We'll just try again later. */
    
    	sk_charge_skb(sk, buff);
    	nlen = skb->len - len - nsize;
    	buff->truesize += nlen;
    	skb->truesize -= nlen;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* Correct the sequence numbers. */
    	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
    	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
    
    	/* PSH and FIN should only be set in the second packet. */
    	flags = TCP_SKB_CB(skb)->flags;
    	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
    	TCP_SKB_CB(buff)->flags = flags;
    
    	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
    
    
    	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Copy and checksum data tail into the new buffer. */
    		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
    						       nsize, 0);
    
    		skb_trim(skb, len);
    
    		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
    	} else {
    
    		skb->ip_summed = CHECKSUM_PARTIAL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		skb_split(skb, buff, len);
    	}
    
    	buff->ip_summed = skb->ip_summed;
    
    	/* Looks stupid, but our code really uses when of
    	 * skbs, which it never sent before. --ANK
    	 */
    	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
    
    	buff->tstamp = skb->tstamp;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	old_factor = tcp_skb_pcount(skb);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/* Fix up tso_factor for both original and new SKB.  */
    
    	tcp_set_skb_tso_segs(sk, skb, mss_now);
    	tcp_set_skb_tso_segs(sk, buff, mss_now);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	/* If this packet has been sent out already, we must
    	 * adjust the various packet counters.
    	 */
    
    	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
    
    		int diff = old_factor - tcp_skb_pcount(skb) -
    			tcp_skb_pcount(buff);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		tp->packets_out -= diff;
    
    
    		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
    			tp->sacked_out -= diff;
    		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
    			tp->retrans_out -= diff;
    
    
    		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
    			tp->lost_out -= diff;
    			tp->left_out -= diff;
    		}
    
    			/* Adjust Reno SACK estimate. */
    			if (!tp->rx_opt.sack_ok) {
    				tp->sacked_out -= diff;
    				if ((int)tp->sacked_out < 0)
    					tp->sacked_out = 0;
    				tcp_sync_left_out(tp);
    			}
    
    
    			tp->fackets_out -= diff;
    			if ((int)tp->fackets_out < 0)
    				tp->fackets_out = 0;
    
    			/* SACK fastpath might overwrite it unless dealt with */
    			if (tp->fastpath_skb_hint != NULL &&
    			    after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq,
    				  TCP_SKB_CB(skb)->seq)) {
    				tp->fastpath_cnt_hint -= diff;
    				if ((int)tp->fastpath_cnt_hint < 0)
    					tp->fastpath_cnt_hint = 0;
    			}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	/* Link BUFF into the send queue. */
    
    	tcp_insert_write_queue_after(skb, buff, sk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	return 0;
    }
    
    /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
     * eventually). The difference is that pulled data not copied, but
     * immediately discarded.
     */
    
    static void __pskb_trim_head(struct sk_buff *skb, int len)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	int i, k, eat;
    
    	eat = len;
    	k = 0;
    	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
    		if (skb_shinfo(skb)->frags[i].size <= eat) {
    			put_page(skb_shinfo(skb)->frags[i].page);
    			eat -= skb_shinfo(skb)->frags[i].size;
    		} else {
    			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
    			if (eat) {
    				skb_shinfo(skb)->frags[k].page_offset += eat;
    				skb_shinfo(skb)->frags[k].size -= eat;
    				eat = 0;
    			}
    			k++;
    		}
    	}
    	skb_shinfo(skb)->nr_frags = k;
    
    
    	skb_reset_tail_pointer(skb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	skb->data_len -= len;
    	skb->len = skb->data_len;
    }
    
    int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    {
    	if (skb_cloned(skb) &&
    	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
    		return -ENOMEM;
    
    
    	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
    	if (unlikely(len < skb_headlen(skb)))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		__skb_pull(skb, len);
    
    	else
    		__pskb_trim_head(skb, len - skb_headlen(skb));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	TCP_SKB_CB(skb)->seq += len;
    
    	skb->ip_summed = CHECKSUM_PARTIAL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	skb->truesize	     -= len;
    	sk->sk_wmem_queued   -= len;
    	sk->sk_forward_alloc += len;
    	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
    
    	/* Any change of skb->len requires recalculation of tso
    	 * factor and mss.
    	 */
    	if (tcp_skb_pcount(skb) > 1)
    
    		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	return 0;
    }
    
    
    John Heffner's avatar
    John Heffner committed
/* Convert a path MTU into a TCP MSS. Not accounting for SACKs here:
 * SACK option space is subtracted per-transmission by callers
 * (see tcp_current_mss()).
 */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	return mss_now;
}
    
/* Inverse of tcp_mtu_to_mss() (modulo its clamping): rebuild a path
 * MTU from an MSS by re-adding the TCP header (including options),
 * extension header and network header lengths.
 */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	return mtu;
}
    
    void tcp_mtup_init(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
    	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
    
    			       icsk->icsk_af_ops->net_header_len;
    
    John Heffner's avatar
    John Heffner committed
    	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
    	icsk->icsk_mtup.probe_size = 0;
    }
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.		--ANK (980731)
 */
    
    unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    John Heffner's avatar
    John Heffner committed
    	int mss_now;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    John Heffner's avatar
    John Heffner committed
    	if (icsk->icsk_mtup.search_high > pmtu)
    		icsk->icsk_mtup.search_high = pmtu;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    John Heffner's avatar
    John Heffner committed
    	mss_now = tcp_mtu_to_mss(sk, pmtu);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* Bound mss with half of window */
    	if (tp->max_window && mss_now > (tp->max_window>>1))
    		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
    
    	/* And store cached results */
    
    	icsk->icsk_pmtu_cookie = pmtu;
    
    John Heffner's avatar
    John Heffner committed
    	if (icsk->icsk_mtup.enabled)
    		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
    
    	tp->mss_cache = mss_now;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	return mss_now;
    }
    
    /* Compute the current effective MSS, taking SACKs and IP options,
     * and even PMTU discovery events into account.
     *
     * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
     * cannot be large. However, taking into account rare use of URG, this
     * is not a big flaw.
     */
    
    unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct dst_entry *dst = __sk_dst_get(sk);
    
    	u32 mss_now;
    	u16 xmit_size_goal;
    	int doing_tso = 0;
    
    	mss_now = tp->mss_cache;
    
    
    	if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (dst) {
    		u32 mtu = dst_mtu(dst);
    
    		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			mss_now = tcp_sync_mss(sk, mtu);
    	}
    
    
    	if (tp->rx_opt.eff_sacks)
    		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
    			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    #ifdef CONFIG_TCP_MD5SIG
    	if (tp->af_specific->md5_lookup(sk, sk))
    		mss_now -= TCPOLEN_MD5SIG_ALIGNED;
    #endif
    
    
    	xmit_size_goal = mss_now;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (doing_tso) {
    
    		xmit_size_goal = (65535 -
    				  inet_csk(sk)->icsk_af_ops->net_header_len -
    
    				  inet_csk(sk)->icsk_ext_hdr_len -
    				  tp->tcp_header_len);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		if (tp->max_window &&
    		    (xmit_size_goal > (tp->max_window >> 1)))
    			xmit_size_goal = max((tp->max_window >> 1),
    					     68U - tp->tcp_header_len);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		xmit_size_goal -= (xmit_size_goal % mss_now);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	tp->xmit_size_goal = xmit_size_goal;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	return mss_now;
    }
    
    
    /* Congestion window validation. (RFC2861) */
    
    
    static void tcp_cwnd_validate(struct sock *sk)
    
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	__u32 packets_out = tp->packets_out;
    
    	if (packets_out >= tp->snd_cwnd) {
    		/* Network is feed fully. */
    		tp->snd_cwnd_used = 0;
    		tp->snd_cwnd_stamp = tcp_time_stamp;
    	} else {
    		/* Network starves. */