    /*
     * INET		An implementation of the TCP/IP protocol suite for the LINUX
     *		operating system.  INET is implemented using the  BSD Socket
     *		interface as the means of communication with the user level.
     *
     *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *		Mark Evans, <evansmp@uhura.aston.ac.uk>
     *		Corey Minyard <wf-rch!minyard@relay.EU.net>
     *		Florian La Roche, <flla@stud.uni-sb.de>
     *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
     *		Linus Torvalds, <torvalds@cs.helsinki.fi>
     *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     *		Matthew Dillon, <dillon@apollo.west.oic.com>
     *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     *		Jorge Cwik, <jorge@laser.satlink.net>
     */
    
    /*
     * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
     *				:	Fragmentation on mtu decrease
     *				:	Segment collapse on retransmit
     *				:	AF independence
     *
     *		Linus Torvalds	:	send_delayed_ack
     *		David S. Miller	:	Charge memory using the right skb
     *					during syn/ack processing.
     *		David S. Miller :	Output engine completely rewritten.
     *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
     *		Cacophonix Gaul :	draft-minshall-nagle-01
     *		J Hadi Salim	:	ECN support
     *
     */
    
    #include <net/tcp.h>
    
    #include <linux/compiler.h>
    #include <linux/module.h>
    
    /* People can turn this off for buggy TCP's found in printers etc. */
    
    int sysctl_tcp_retrans_collapse __read_mostly = 1;
    
/* People can turn this on to work with those rare, broken TCPs that
     * interpret the window field as a signed quantity.
     */
    
    int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
    
    /* This limits the percentage of the congestion window which we
     * will allow a single TSO frame to consume.  Building TSO frames
     * which are too large can cause TCP streams to be bursty.
     */
    
    int sysctl_tcp_tso_win_divisor __read_mostly = 3;
    
    int sysctl_tcp_mtu_probing __read_mostly = 0;
    int sysctl_tcp_base_mss __read_mostly = 512;
    
/* By default, RFC2861 behavior.  */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	tcp_advance_send_head(sk, skb);
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Don't override Nagle indefinitely with F-RTO */
	if (tp->frto_counter == 2)
		tp->frto_counter = 3;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
    
    /* SND.NXT, if window was not shrunk.
     * If window has been shrunk, what should we make? It is not clear at all.
     * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
     * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
     * invalid. OK, let's make this for now:
     */
    
static inline __u32 tcp_acceptable_seq(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}
    
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not implement 3; we advertise an MSS calculated from the
 *    first hop device mtu, but allow it to be raised to ip_rt_min_advmss.
 *    This may be overridden via information stored in the routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
    static __u16 tcp_advertise_mss(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct dst_entry *dst = __sk_dst_get(sk);
    	int mss = tp->advmss;
    
    	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
    		mss = dst_metric(dst, RTAX_ADVMSS);
    		tp->advmss = mss;
    	}
    
    	return (__u16)mss;
    }
    
/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
 * This is the first part of the cwnd validation mechanism.
 */
static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
	struct tcp_sock *tp = tcp_sk(sk);
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}
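
/* Worked example (illustrative, not part of the original source): with
 * snd_cwnd == 40 and restart_cwnd == 4, an idle delta of 3.25 * icsk_rto
 * lets the loop above halve cwnd three times (40 -> 20 -> 10 -> 5), so
 * sending restarts with snd_cwnd == 5, never below restart_cwnd.
 */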
    
    
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sk_buff *skb, struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_time_stamp;

	if (sysctl_tcp_slow_start_after_idle &&
	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
		tcp_cwnd_restart(sk, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		icsk->icsk_ack.pingpong = 1;
}

static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
    
    /* Determine a window scaling and initial window to offer.
     * Based on the assumption that the given amount of space
     * will be offered. Store the results in the tp structure.
     * NOTE: for smooth operation initial space offering should
     * be a multiple of mss if possible. We assume here that mss >= 1.
     * This MUST be enforced by all callers.
     */
    void tcp_select_initial_window(int __space, __u32 mss,
    			       __u32 *rcv_wnd, __u32 *window_clamp,
    			       int wscale_ok, __u8 *rcv_wscale)
    {
    	unsigned int space = (__space < 0 ? 0 : __space);
    
    	/* If no clamp set the clamp to the max possible scaled window */
    	if (*window_clamp == 0)
    		(*window_clamp) = (65535 << 14);
    	space = min(*window_clamp, space);
    
    	/* Quantize space offering to a multiple of mss if possible. */
    	if (space > mss)
    		space = (space / mss) * mss;
    
	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;
    
    
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}
    
	/* Set initial window to a value large enough for senders
	 * following RFC2414. Senders not following this RFC will be
	 * satisfied with 2.
	 */
	if (mss > (1 << *rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460 * 3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd * mss)
			*rcv_wnd = init_cwnd * mss;
	}
    
    	/* Set the clamp no higher than max representable value */
    	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
    }
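
/* Illustrative example (not part of the original source): with a 4 MB
 * receive buffer limit, the scaling loop above needs seven right shifts
 * to bring 4194304 down to 32768 <= 65535, so rcv_wscale would be 7.
 */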
    
/* Choose a new window to advertise, update state in tcp_sock for the
     * socket, and return result with RFC1323 scaling applied.  The return
     * value can be stuffed directly into th->window for an outgoing
     * frame.
     */
    
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
    	else
    		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
    
    	/* RFC1323 scaling applied */
    	new_win >>= tp->rx_opt.rcv_wscale;
    
    	/* If we advertise zero window, disable fast path. */
    	if (new_win == 0)
    		tp->pred_flags = 0;
    
    	return new_win;
    }
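
/* Illustrative example (not part of the original source): with
 * rcv_wscale == 7, a selected window of 128000 bytes is advertised as
 * th->window == 128000 >> 7 == 1000; the peer multiplies it back by
 * 128 (RFC1323).
 */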
    
    
static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
	if (!(tp->ecn_flags & TCP_ECN_OK))
		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
}
    
    static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	tp->ecn_flags = 0;
	if (sysctl_tcp_ecn) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
		tp->ecn_flags = TCP_ECN_OK;
	}
    }
    
    static __inline__ void
    TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
    {
    	if (inet_rsk(req)->ecn_ok)
    		th->ece = 1;
    }
    
    static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
    				int tcp_header_len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (tp->ecn_flags & TCP_ECN_OK) {
    		/* Not-retransmitted data segment: set ECT and inject CWR. */
    		if (skb->len != tcp_header_len &&
    		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
    				tcp_hdr(skb)->cwr = 1;
    				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
    			}
    		} else {
    			/* ACK or retransmitted segment: clear ECT|CE */
    			INET_ECN_dontxmit(sk);
    		}
    		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
    			tcp_hdr(skb)->ece = 1;
    	}
    }
    
    
    /* Constructs common control bits of non-data skb. If SYN/FIN is present,
     * auto increment end seqno.
     */
    static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
    {
    	skb->csum = 0;
    
    	TCP_SKB_CB(skb)->flags = flags;
    	TCP_SKB_CB(skb)->sacked = 0;
    
    	skb_shinfo(skb)->gso_segs = 1;
    	skb_shinfo(skb)->gso_size = 0;
    	skb_shinfo(skb)->gso_type = 0;
    
    	TCP_SKB_CB(skb)->seq = seq;
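	/* SYN and FIN each consume one unit of sequence space, so the
	 * increment below accounts for the flag's phantom byte.
	 */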
    	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
    		seq++;
    	TCP_SKB_CB(skb)->end_seq = seq;
    }
    
    
    #define OPTION_SACK_ADVERTISE	(1 << 0)
    #define OPTION_TS		(1 << 1)
    #define OPTION_MD5		(1 << 2)
    
    struct tcp_out_options {
    	u8 options;		/* bit field of OPTION_* */
    	u8 ws;			/* window scale, 0 to disable */
    	u8 num_sack_blocks;	/* number of SACK blocks to include */
    	u16 mss;		/* 0 to disable */
    	__u32 tsval, tsecr;	/* need to include OPTION_TS */
    };
    
    static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
    			      const struct tcp_out_options *opts,
    			      __u8 **md5_hash) {
	if (unlikely(OPTION_MD5 & opts->options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) |
			       TCPOLEN_MD5SIG);
		*md5_hash = (__u8 *)ptr;
		ptr += 4;
	} else {
		*md5_hash = NULL;
	}

	if (likely(OPTION_TS & opts->options)) {
    		if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
    			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
    				       (TCPOLEN_SACK_PERM << 16) |
    				       (TCPOPT_TIMESTAMP << 8) |
    				       TCPOLEN_TIMESTAMP);
    		} else {
    			*ptr++ = htonl((TCPOPT_NOP << 24) |
    				       (TCPOPT_NOP << 16) |
    				       (TCPOPT_TIMESTAMP << 8) |
    				       TCPOLEN_TIMESTAMP);
    		}
    		*ptr++ = htonl(opts->tsval);
    		*ptr++ = htonl(opts->tsecr);
    	}
    
    	if (unlikely(opts->mss)) {
    		*ptr++ = htonl((TCPOPT_MSS << 24) |
    			       (TCPOLEN_MSS << 16) |
    			       opts->mss);
    	}
    
    	if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
    		     !(OPTION_TS & opts->options))) {
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_NOP << 16) |
    			       (TCPOPT_SACK_PERM << 8) |
    			       TCPOLEN_SACK_PERM);
    	}
    
    	if (unlikely(opts->ws)) {
    		*ptr++ = htonl((TCPOPT_NOP << 24) |
    			       (TCPOPT_WINDOW << 16) |
    			       (TCPOLEN_WINDOW << 8) |
    			       opts->ws);
    	}
    
	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP  << 24) |
			       (TCPOPT_NOP  << 16) |
			       (TCPOPT_SACK <<  8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		if (tp->rx_opt.dsack) {
			tp->rx_opt.dsack = 0;
			tp->rx_opt.eff_sacks--;
		}
	}
}
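
/* Illustrative layout note (not part of the original source): when only
 * timestamps are negotiated, the OPTION_TS branch above emits NOP, NOP,
 * TIMESTAMP, len 10 in one word, followed by tsval and tsecr, i.e. a
 * single aligned 12-byte option block (TCPOLEN_TSTAMP_ALIGNED).
 */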
    
    static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
    				struct tcp_out_options *opts,
    				struct tcp_md5sig_key **md5) {
    	struct tcp_sock *tp = tcp_sk(sk);
    	unsigned size = 0;
    
    
#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (*md5) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif
    
    	/* We always get an MSS option.  The option bytes which will be seen in
    	 * normal data packets should timestamps be used, must be in the MSS
    	 * advertised.  But we subtract them from tp->mss_cache so that
    	 * calculations in tcp_sendmsg are simpler etc.  So account for this
    	 * fact here if necessary.  If we don't do this correctly, as a
    	 * receiver we won't recognize data packets as being full sized when we
    	 * should, and thus we won't abide by the delayed ACK rules correctly.
    	 * SACKs don't matter, we never delay an ACK when we have any of those
    	 * going out.  */
    	opts->mss = tcp_advertise_mss(sk);
    	size += TCPOLEN_MSS_ALIGNED;
    
    	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
    		opts->options |= OPTION_TS;
    		opts->tsval = TCP_SKB_CB(skb)->when;
    		opts->tsecr = tp->rx_opt.ts_recent;
    		size += TCPOLEN_TSTAMP_ALIGNED;
    	}
    	if (likely(sysctl_tcp_window_scaling)) {
    		opts->ws = tp->rx_opt.rcv_wscale;
    		size += TCPOLEN_WSCALE_ALIGNED;
    	}
	if (likely(sysctl_tcp_sack)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!(OPTION_TS & opts->options)))
			size += TCPOLEN_SACKPERM_ALIGNED;
	}
    
	return size;
}

static unsigned tcp_synack_options(struct sock *sk,
    				   struct request_sock *req,
    				   unsigned mss, struct sk_buff *skb,
    				   struct tcp_out_options *opts,
    				   struct tcp_md5sig_key **md5) {
    	unsigned size = 0;
    	struct inet_request_sock *ireq = inet_rsk(req);
    	char doing_ts;
    
    
#ifdef CONFIG_TCP_MD5SIG
	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
	if (*md5) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif
    
    	/* we can't fit any SACK blocks in a packet with MD5 + TS
    	   options. There was discussion about disabling SACK rather than TS in
    	   order to fit in better with old, buggy kernels, but that was deemed
    	   to be unnecessary. */
    	doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
    
    	opts->mss = mss;
    	size += TCPOLEN_MSS_ALIGNED;
    
    	if (likely(ireq->wscale_ok)) {
    		opts->ws = ireq->rcv_wscale;
    		size += TCPOLEN_WSCALE_ALIGNED;
    	}
    	if (likely(doing_ts)) {
    		opts->options |= OPTION_TS;
    		opts->tsval = TCP_SKB_CB(skb)->when;
    		opts->tsecr = req->ts_recent;
    		size += TCPOLEN_TSTAMP_ALIGNED;
    	}
    	if (likely(ireq->sack_ok)) {
    		opts->options |= OPTION_SACK_ADVERTISE;
    		if (unlikely(!doing_ts))
    			size += TCPOLEN_SACKPERM_ALIGNED;
    	}
    
    	return size;
    }
    
    static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
    					struct tcp_out_options *opts,
    					struct tcp_md5sig_key **md5) {
    	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
    	struct tcp_sock *tp = tcp_sk(sk);
    	unsigned size = 0;
    
    #ifdef CONFIG_TCP_MD5SIG
    	*md5 = tp->af_specific->md5_lookup(sk, sk);
    	if (unlikely(*md5)) {
    		opts->options |= OPTION_MD5;
    		size += TCPOLEN_MD5SIG_ALIGNED;
    	}
    #else
    	*md5 = NULL;
    #endif
    
    	if (likely(tp->rx_opt.tstamp_ok)) {
    		opts->options |= OPTION_TS;
    		opts->tsval = tcb ? tcb->when : 0;
    		opts->tsecr = tp->rx_opt.ts_recent;
    		size += TCPOLEN_TSTAMP_ALIGNED;
    	}
    
    	if (unlikely(tp->rx_opt.eff_sacks)) {
    		const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
    		opts->num_sack_blocks =
    			min_t(unsigned, tp->rx_opt.eff_sacks,
    			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
    			      TCPOLEN_SACK_PERBLOCK);
    		size += TCPOLEN_SACK_BASE_ALIGNED +
    			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
    	}
    
	return size;
}

/* This routine actually transmits TCP packets queued in by
     * tcp_do_sendmsg().  This is used by both the initial
     * transmission and possible later retransmissions.
     * All SKB's seen here are completely headerless.  It is our
     * job to build the TCP header, and pass the packet down to
     * IP so it can do the same plus pass the packet off to the
     * device.
     *
     * We are working here with either a clone of the original
     * SKB, or a fresh unique copy made by the retransmit engine.
     */
    
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned tcp_options_size, tcp_header_size;
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
	struct tcphdr *th;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
    
	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);
    
    	if (likely(clone_it)) {
    		if (unlikely(skb_cloned(skb)))
    			skb = pskb_copy(skb, gfp_mask);
    		else
    			skb = skb_clone(skb, gfp_mask);
    		if (unlikely(!skb))
    			return -ENOBUFS;
    	}
    
	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);
    
    
	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->sport;
	th->dest		= inet->dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->flags);
    
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;
    
	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
		th->urg			= 1;
	}

	tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
	if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
		TCP_ECN_send(sk, skb, tcp_header_size);
    
#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5, sk, NULL, skb);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
    
	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;

	tcp_enter_cwr(sk, 1);

	return net_xmit_eval(err);
}

/* This routine just queues the buffer.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
    static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* Advance write_seq and place onto the write_queue. */
    	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
    	skb_header_release(skb);
    
    	tcp_add_write_queue_tail(sk, skb);
    
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
}

static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	if (skb->len <= mss_now || !sk_can_gso(sk)) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		skb_shinfo(skb)->gso_segs = 1;
		skb_shinfo(skb)->gso_size = 0;
		skb_shinfo(skb)->gso_type = 0;
	} else {
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		skb_shinfo(skb)->gso_size = mss_now;
		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	}
}
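
/* Illustrative example (not part of the original source): for
 * skb->len == 4344 and mss_now == 1448, DIV_ROUND_UP yields
 * gso_segs == 3, i.e. the NIC will cut this skb into three
 * 1448-byte segments.
 */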
    
/* When a modification to fackets out becomes necessary, we need to check
 * skb is counted to fackets_out or not.
 */
static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
				   int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->sacked_out || tcp_is_reno(tp))
		return;

	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
		tp->fackets_out -= decr;
}
    
    
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
		 unsigned int mss_now)
    
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize, old_factor;
	int nlen;
	u16 flags;

	BUG_ON(len > skb->len);

	tcp_clear_retrans_hints_partial(tp);

	nsize = skb_headlen(skb) - len;
    	if (nsize < 0)
    		nsize = 0;
    
    	if (skb_cloned(skb) &&
    	    skb_is_nonlinear(skb) &&
    	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
    		return -ENOMEM;
    
    	/* Get a new skb... force flag on. */
    	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
    	if (buff == NULL)
    		return -ENOMEM; /* We'll just try again later. */
    
    	sk->sk_wmem_queued += buff->truesize;
    	sk_mem_charge(sk, buff->truesize);
    
    	nlen = skb->len - len - nsize;
    	buff->truesize += nlen;
    	skb->truesize -= nlen;
    
    	/* Correct the sequence numbers. */
    	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
    	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
    	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
    
	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
    
	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len,
						       skb_put(buff, nsize),
						       nsize, 0);
		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;
    
	/* Looks stupid, but our code really uses the 'when' field of
	 * skbs which it has never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
	buff->tstamp = skb->tstamp;
    
	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		int diff = old_factor - tcp_skb_pcount(skb) -
			tcp_skb_pcount(buff);

		tp->packets_out -= diff;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
			tp->sacked_out -= diff;
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
			tp->retrans_out -= diff;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
			tp->lost_out -= diff;

		/* Adjust Reno SACK estimate. */
		if (tcp_is_reno(tp) && diff > 0) {
			tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
			tcp_verify_left_out(tp);
		}

		tcp_adjust_fackets_out(sk, skb, diff);
	}
    
    	/* Link BUFF into the send queue. */
    
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
    }
    
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually). The difference is that pulled data is not copied, but
 * immediately discarded.
 */
static void __pskb_trim_head(struct sk_buff *skb, int len)
{
    	int i, k, eat;
    
    	eat = len;
    	k = 0;
    
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
    		if (skb_shinfo(skb)->frags[i].size <= eat) {
    			put_page(skb_shinfo(skb)->frags[i].page);
    			eat -= skb_shinfo(skb)->frags[i].size;
    		} else {
    			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
    			if (eat) {
    				skb_shinfo(skb)->frags[k].page_offset += eat;
    				skb_shinfo(skb)->frags[k].size -= eat;
    				eat = 0;
    			}
    			k++;
    		}
    	}
	skb_shinfo(skb)->nr_frags = k;

	skb_reset_tail_pointer(skb);
    	skb->data_len -= len;
    	skb->len = skb->data_len;
    }
    
    int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    {
    
	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
	if (unlikely(len < skb_headlen(skb)))
		__skb_pull(skb, len);
	else
		__pskb_trim_head(skb, len - skb_headlen(skb));
    
	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb->truesize	     -= len;
	sk->sk_wmem_queued   -= len;
	sk_mem_uncharge(sk, len);
	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
    
    	/* Any change of skb->len requires recalculation of tso
    	 * factor and mss.
    	 */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
    
    	return 0;
    }
    
    
    /* Not accounting for SACKs here. */
    int tcp_mtu_to_mss(struct sock *sk, int pmtu)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    	int mss_now;
    
    	/* Calculate base mss without TCP options:
    	   It is MMS_S - sizeof(tcphdr) of rfc1122
    	 */
    	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
    
    	/* Clamp it (mss_clamp does not include tcp options) */
    	if (mss_now > tp->rx_opt.mss_clamp)
    		mss_now = tp->rx_opt.mss_clamp;
    
    	/* Now subtract optional transport overhead */
    	mss_now -= icsk->icsk_ext_hdr_len;
    
    	/* Then reserve room for full set of TCP options and 8 bytes of data */
    	if (mss_now < 48)
    		mss_now = 48;
    
    	/* Now subtract TCP options size, not including SACKs */
    	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
    
    	return mss_now;
    }
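
/* Worked example (illustrative, not part of the original source): IPv4,
 * pmtu == 1500, no extension headers: 1500 - 20 - 20 = 1460; with
 * timestamps enabled tcp_header_len is sizeof(struct tcphdr) + 12, so
 * the returned mss is 1460 - 12 = 1448.
 */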
    
    /* Inverse of above */
    int tcp_mss_to_mtu(struct sock *sk, int mss)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    	int mtu;
    
    	mtu = mss +
    	      tp->tcp_header_len +
    	      icsk->icsk_ext_hdr_len +
    	      icsk->icsk_af_ops->net_header_len;
    
    	return mtu;
    }
    
    void tcp_mtup_init(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				       icsk->icsk_af_ops->net_header_len;
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
    	icsk->icsk_mtup.probe_size = 0;
    }
    
    
    /* Bound MSS / TSO packet size with the half of the window */
    static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
    {
    	if (tp->max_window && pktsize > (tp->max_window >> 1))
    		return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
    	else
    		return pktsize;
    }
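
/* Illustrative example (not part of the original source): if the peer's
 * largest advertised window (max_window) is 20000 bytes, a 16384-byte
 * TSO packet is bounded to 10000 bytes, so no single packet consumes
 * more than half the window.
 */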
    
    
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does
   NOT account for TCP options, but includes only the bare TCP header.

   tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.		--ANK (980731)
 */
    unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);