	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		return 0;

	return 1;
}

/* Collapse packets in the retransmit queue to create fewer
 * packets on the wire. This is only done on retransmission.
 */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
				     int space)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = to, *tmp;
	int first = 1;

	if (!sysctl_tcp_retrans_collapse)
		return;
	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
		return;

	tcp_for_write_queue_from_safe(skb, tmp, sk) {
		if (!tcp_can_collapse(sk, skb))
			break;

		space -= skb->len;

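		/* The first skb is 'to', the collapse target itself: count
		 * its length against the budget, but never merge it into
		 * itself.
		 */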
		if (first) {
			first = 0;
			continue;
		}

		if (space < 0)
			break;
		/* Punt if not enough space exists in the first SKB for
		 * the data in the second
		 */
		if (skb->len > skb_tailroom(to))
			break;

		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
			break;

		tcp_collapse_retrans(sk, to);
	}
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int cur_mss;
	int err;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size) {
		icsk->icsk_mtup.probe_size = 0;
	}

	/* Do not send more than we have queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;

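	/* If part of this skb has already been ACKed (its start is before
	 * snd_una), trim the acknowledged head before retransmitting; an
	 * skb that is entirely below snd_una should never be here.
	 */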
	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	cur_mss = tcp_current_mss(sk);
	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

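	/* If the segment no longer fits within the current MSS, fragment
	 * it; otherwise re-derive its TSO segment count against cur_mss
	 * so packet accounting stays consistent.
	 */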
	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
			return -ENOMEM; /* We'll try again later. */
	} else {
		int oldpcount = tcp_skb_pcount(skb);

		if (unlikely(oldpcount > 1)) {
			tcp_init_tso_segs(sk, skb, cur_mss);
			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
		}
	}

	tcp_retrans_try_collapse(sk, skb, cur_mss);

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if (skb->len > 0 &&
	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			/* Reuse, even though it does some unnecessary work */
			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
					     TCP_SKB_CB(skb)->flags);
			skb->ip_summed = CHECKSUM_NONE;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);

		tp->total_retrans++;

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		if (!tp->retrans_out)
			tp->lost_retrans_low = tp->snd_nxt;
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}

/* Check whether forward retransmits are possible in the current
 * window/congestion state.
 */
static int tcp_can_forward_retransmit(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Forward retransmissions are possible only during Recovery. */
	if (icsk->icsk_ca_state != TCP_CA_Recovery)
		return 0;

	/* No forward retransmissions in Reno are possible. */
	if (tcp_is_reno(tp))
		return 0;

	/* Yeah, we have to make a difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send. In the other cases, follow rule 3 for
	 * NextSeg() specified in RFC3517.
	 */

	if (tcp_may_send_now(sk))
		return 0;

	return 1;
}

Linus Torvalds's avatar
Linus Torvalds committed
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	struct sk_buff *hole = NULL;
	u32 last_lost;
	int mib_idx;
	int fwd_rexmitting = 0;

	if (!tp->packets_out)
		return;

	if (!tp->lost_out)
		tp->retransmit_high = tp->snd_una;

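	/* Resume from the cached retransmit hint when one exists;
	 * otherwise rescan from the head of the write queue.
	 */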
	if (tp->retransmit_skb_hint) {
		skb = tp->retransmit_skb_hint;
		last_lost = TCP_SKB_CB(skb)->end_seq;
		if (after(last_lost, tp->retransmit_high))
			last_lost = tp->retransmit_high;
	} else {
		skb = tcp_write_queue_head(sk);
		last_lost = tp->snd_una;
	}

	tcp_for_write_queue_from(skb, sk) {
		__u8 sacked = TCP_SKB_CB(skb)->sacked;

		if (skb == tcp_send_head(sk))
			break;
		/* we could do better than to assign each time */
		if (hole == NULL)
			tp->retransmit_skb_hint = skb;

		/* Assume this retransmit will generate
		 * only one packet for congestion window
		 * calculation purposes.  This works because
		 * tcp_retransmit_skb() will chop up the
		 * packet to be MSS sized and all the
		 * packet counting works out.
		 */
		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			return;

		if (fwd_rexmitting) {
begin_fwd:
			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
				break;
			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
			tp->retransmit_high = last_lost;
			if (!tcp_can_forward_retransmit(sk))
				break;
			/* Backtrack if necessary to an skb not marked LOST */
			if (hole != NULL) {
				skb = hole;
				hole = NULL;
			}
			fwd_rexmitting = 1;
			goto begin_fwd;

		} else if (!(sacked & TCPCB_LOST)) {
			if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
				hole = skb;
			continue;

		} else {
			last_lost = TCP_SKB_CB(skb)->end_seq;
			if (icsk->icsk_ca_state != TCP_CA_Loss)
				mib_idx = LINUX_MIB_TCPFASTRETRANS;
			else
				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
		}

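		/* Segments already SACKed or already retransmitted are
		 * skipped.
		 */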
		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		if (tcp_retransmit_skb(sk, skb))
			return;
		NET_INC_STATS_BH(sock_net(sk), mib_idx);

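		/* Re-arm the retransmission timer whenever the head of
		 * the write queue has just been retransmitted.
		 */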
		if (skb == tcp_write_queue_head(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  inet_csk(sk)->icsk_rto,
						  TCP_RTO_MAX);
	}
}

/* Send a FIN.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = tcp_write_queue_tail(sk);
	int mss_now;
	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if (tcp_send_head(sk) != NULL) {
		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb_fclone(MAX_TCP_HEADER,
					       sk->sk_allocation);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
}

/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
			     TCPHDR_ACK | TCPHDR_RST);
	/* Send it off. */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
}

/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	skb = tcp_write_queue_head(sk);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			tcp_unlink_write_queue(skb, sk);
			skb_header_release(nskb);
			__tcp_add_write_queue_head(sk, nskb);
			sk_wmem_free_skb(sk, skb);
			sk->sk_wmem_queued += nskb->truesize;
			sk_mem_charge(sk, nskb->truesize);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
		TCP_ECN_send_synack(tcp_sk(sk), skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}

/* Prepare a SYN-ACK. */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				struct request_sock *req,
				struct request_values *rvp)
{
	struct tcp_out_options opts;
	struct tcp_extend_values *xvp = tcp_xv(rvp);
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_cookie_values *cvp = tp->cookie_values;
	struct tcphdr *th;
	struct sk_buff *skb;
	struct tcp_md5sig_key *md5;
	int tcp_header_size;
	int mss;
	int s_data_desired = 0;

	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
		s_data_desired = cvp->s_data_desired;
	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb_dst_set(skb, dst_clone(dst));

	mss = dst_metric(dst, RTAX_ADVMSS);
	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
		mss = tp->rx_opt.user_mss;

	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

		/* limit the window selection if the user enforce a smaller rx buffer */
		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
			req->window_clamp = tcp_full_space(sk);

		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			ireq->wscale_ok,
			&rcv_wscale,
			dst_metric(dst, RTAX_INITRWND));
		ireq->rcv_wscale = rcv_wscale;
	}

	memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
	if (unlikely(req->cookie_ts))
		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
	else
#endif
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_header_size = tcp_synack_options(sk, req, mss,
					     skb, &opts, &md5, xvp)
			+ sizeof(*th);
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	th = tcp_hdr(skb);
	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	th->source = ireq->loc_port;
	th->dest = ireq->rmt_port;
	/* Setting of flags is superfluous here for callers (and ECE is
	 * not even correctly set)
	 */
	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
			     TCPHDR_SYN | TCPHDR_ACK);

	if (OPTION_COOKIE_EXTENSION & opts.options) {
		if (s_data_desired) {
			u8 *buf = skb_put(skb, s_data_desired);

			/* copy data directly from the listening socket. */
			memcpy(buf, cvp->s_data_payload, s_data_desired);
			TCP_SKB_CB(skb)->end_seq += s_data_desired;
		}

		if (opts.hash_size > 0) {
			__u32 workspace[SHA_WORKSPACE_WORDS];
			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];

			/* Secret recipe depends on the Timestamp, (future)
			 * Sequence and Acknowledgment Numbers, Initiator
			 * Cookie, and others handled by IP variant caller.
			 */
			*tail-- ^= opts.tsval;
			*tail-- ^= tcp_rsk(req)->rcv_isn + 1;
			*tail-- ^= TCP_SKB_CB(skb)->seq + 1;

			/* recommended */
			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */

			sha_transform((__u32 *)&xvp->cookie_bakery[0],
				      (char *)mess,
				      &workspace[0]);
			opts.hash_location =
				(__u8 *)&xvp->cookie_bakery[0];
		}
	}

	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rcv_wnd, 65535U));
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	th->doff = (tcp_header_size >> 2);
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));

#ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
	if (md5) {
		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
					       md5, NULL, req, skb);
	}
#endif

	return skb;
}
EXPORT_SYMBOL(tcp_make_synack);

/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

#ifdef CONFIG_TCP_MD5SIG
	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	tp->max_window = 0;
	tcp_mtup_init(sk);
	tcp_sync_mss(sk, dst_mtu(dst));

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
		tp->advmss = tp->rx_opt.user_mss;

	tcp_initialize_rcv_mss(sk);

	/* limit the window selection if the user enforce a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
		tp->window_clamp = tcp_full_space(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &rcv_wscale,
				  dst_metric(dst, RTAX_INITRWND));

	tp->rx_opt.rcv_wscale = rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->snd_up = tp->write_seq;
	tp->rcv_nxt = 0;
	tp->rcv_wup = 0;
	tp->copied_seq = 0;

	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	inet_csk(sk)->icsk_retransmits = 0;
	tcp_clear_retrans(tp);
}

/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;

	tcp_connect_init(sk);

	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	tp->snd_nxt = tp->write_seq;
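	/* The SYN consumes one sequence number, so write_seq is advanced
	 * past the initial sequence number.
	 */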
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	skb_header_release(buff);
	__tcp_add_write_queue_tail(sk, buff);
	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	tp->packets_out += tcp_skb_pcount(buff);
	tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	int ato = icsk->icsk_ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		const struct tcp_sock *tp = tcp_sk(sk);
		int max_ato = HZ / 2;

		if (icsk->icsk_ack.pingpong ||
		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use the new timeout only if there wasn't an older one set earlier. */
	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (icsk->icsk_ack.blocked ||
		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, icsk->icsk_ack.timeout))
			timeout = icsk->icsk_ack.timeout;
	}
	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
	icsk->icsk_ack.timeout = timeout;
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	struct sk_buff *buff;

	/* If we have been reset, we may not send again. */
	if (sk->sk_state == TCP_CLOSE)
		return;

	/* We are not putting this on the write queue, so
	 * tcp_transmit_skb() will set the ownership to this
	 * sock.
	 */
	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (buff == NULL) {
		inet_csk_schedule_ack(sk);
		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
					  TCP_DELACK_MAX, TCP_RTO_MAX);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(buff, MAX_TCP_HEADER);
	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

	/* Send it off, this clears delayed acks for us. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we do while in urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
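	/* In urgent mode the probe carries SEG.SEQ = SND.UNA so the urgent
	 * pointer is delivered; otherwise SND.UNA - 1 is used, which is out
	 * of window and therefore forces the peer to ACK.
	 */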
	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}

/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE)
		return -1;

	if ((skb = tcp_send_head(sk)) != NULL &&
	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		int err;
		unsigned int mss = tcp_current_mss(sk);
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

		/* We are probing the opening of a window
		 * but the window size is != 0; this must have
		 * been the result of SWS avoidance (sender side).
		 */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
		    skb->len > mss) {
			seg_size = min(seg_size, mss);
			TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
			if (tcp_fragment(sk, skb, seg_size, mss))
				return -1;
		} else if (!tcp_skb_pcount(skb))
			tcp_set_skb_tso_segs(sk, skb, mss);

		TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		if (!err)
			tcp_event_new_data_sent(sk, skb);
		return err;
	} else {
		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
			tcp_xmit_probe_skb(sk, 1);
		return tcp_xmit_probe_skb(sk, 0);
	}
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err;

	err = tcp_write_wakeup(sk);

	if (tp->packets_out || !tcp_send_head(sk)) {
		/* Cancel probe timer, if it is not required. */
		icsk->icsk_backoff = 0;
		return;
	}

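	/* err <= 0 means the probe actually went out (or transmission
	 * failed outright): apply exponential backoff, bounded by
	 * sysctl_tcp_retries2.
	 */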
	if (err <= 0) {
		if (icsk->icsk_backoff < sysctl_tcp_retries2)
			icsk->icsk_backoff++;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
					  TCP_RTO_MAX);
	} else {
		/* If the packet was not sent due to local congestion,
		 * do not back off and do not remember icsk_probes_out.
		 * Let local senders fight for local resources.
		 *
		 * Keep using the accumulated backoff, though.
		 */
		if (!icsk->icsk_probes_out)
			icsk->icsk_probes_out = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
					  min(icsk->icsk_rto << icsk->icsk_backoff,
					      TCP_RTO_MAX),
					  TCP_RTO_MAX);
	}
}