		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota) {
			if (push_one == 2)
				/* Force out a loss probe pkt. */
				cwnd_quota = 1;
			else
				break;
		}

		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
			break;

		if (tso_segs == 1) {
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			if (!push_one && tcp_tso_should_defer(sk, skb))
				break;
		}

		/* TSQ : sk_wmem_alloc accounts skb truesize,
		 * including skb overhead. But that's OK.
		 */
		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
			break;
		}

		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    min_t(unsigned int,
							  cwnd_quota,
							  sk->sk_gso_max_segs));

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
			break;

		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);

		if (push_one)
			break;
	}

	if (likely(sent_pkts)) {
		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += sent_pkts;

		/* Send one loss probe per tail loss episode. */
		if (push_one != 2)
			tcp_schedule_loss_probe(sk);
		return false;
	}
	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
}

bool tcp_schedule_loss_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, tlp_time_stamp, rto_time_stamp;
	u32 rtt = tp->srtt >> 3;

	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
		return false;
	/* No consecutive loss probes. */
	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
		tcp_rearm_rto(sk);
		return false;
	}
	/* Don't do any loss probe on a Fast Open connection before 3WHS
	 * finishes.
	 */
	if (sk->sk_state == TCP_SYN_RECV)
		return false;

	/* TLP is only scheduled when next timer event is RTO. */
	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
		return false;

	/* Schedule a loss probe in 2*RTT for SACK capable connections
	 * in Open state, that are either limited by cwnd or application.
	 */
	if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
		return false;

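	/* Not cwnd- or application-limited: new data could still go out, so
	 * let tcp_write_xmit() keep the pipe busy instead of arming a probe.
	 */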
	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
	     tcp_send_head(sk))
		return false;

	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
	 * for delayed ack when there's one outstanding packet.
	 */
	timeout = rtt << 1;
	if (tp->packets_out == 1)
		timeout = max_t(u32, timeout,
				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
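	/* Illustrative example: with a smoothed RTT of 40 ms and several
	 * packets in flight the probe is armed for 2 * 40 ms = 80 ms; with a
	 * single outstanding packet it waits at least 1.5 * 40 ms +
	 * TCP_DELACK_MAX so a delayed ACK can still arrive first.
	 */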

	/* If RTO is shorter, just schedule TLP in its place. */
	tlp_time_stamp = tcp_time_stamp + timeout;
	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
		s32 delta = rto_time_stamp - tcp_time_stamp;
		if (delta > 0)
			timeout = delta;
	}

	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
				  TCP_RTO_MAX);
	return true;
}

/* When probe timeout (PTO) fires, send a new segment if one exists, else
 * retransmit the last segment.
 */
void tcp_send_loss_probe(struct sock *sk)
{
	struct sk_buff *skb;
	int pcount;
	int mss = tcp_current_mss(sk);
	int err = -1;

	if (tcp_send_head(sk) != NULL) {
		err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
		goto rearm_timer;
	}

	/* Retransmit last segment. */
	skb = tcp_write_queue_tail(sk);
	if (WARN_ON(!skb))
		goto rearm_timer;

	pcount = tcp_skb_pcount(skb);
	if (WARN_ON(!pcount))
		goto rearm_timer;

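	/* The tail skb may cover several MSS-sized segments; if so, split off
	 * all but the final segment so the probe resends only the last one.
	 */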
	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
			goto rearm_timer;
		skb = tcp_write_queue_tail(sk);
	}

	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
		goto rearm_timer;

	/* Probe with zero data doesn't trigger fast recovery. */
	if (skb->len > 0)
		err = __tcp_retransmit_skb(sk, skb);

rearm_timer:
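	/* Whether or not a probe was sent, fall back to the regular RTO timer
	 * from here on.
	 */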
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto,
				  TCP_RTO_MAX);

	if (likely(!err))
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPLOSSPROBES);
	return;
}

/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and
	 * all will be happy.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
			   sk_gfp_atomic(sk, GFP_ATOMIC)))
		tcp_check_probe_timer(sk);
}

/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to set up the probe timer etc.
 */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct sk_buff *skb = tcp_send_head(sk);

	BUG_ON(!skb || skb->len < mss_now);

	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 * BSD seems to make the following compromise:
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;
	int free_space = tcp_space(sk);
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	if (mss > full_space)
		mss = full_space;

	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;

		if (sk_under_memory_pressure(sk))
			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
					       4U * tp->advmss);

		if (free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	window = tp->rcv_wnd;
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
				  << tp->rx_opt.rcv_wscale);
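		/* e.g. (illustrative): with rcv_wscale == 7 the round-up above
		 * advertises the next multiple of 128 bytes, so a small but
		 * non-zero window is not scaled down to zero.
		 */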
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = (free_space / mss) * mss;
		else if (mss == full_space &&
			 free_space > window + (full_space >> 1))
			window = free_space;
	}

	return window;
}

/* Collapses two adjacent SKB's during retransmission. */
static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
	int skb_size, next_skb_size;

	skb_size = skb->len;
	next_skb_size = next_skb->len;

	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
	tcp_highest_sack_combine(sk, next_skb, skb);

	tcp_unlink_write_queue(next_skb, sk);

	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
				  next_skb_size);

	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
		skb->ip_summed = CHECKSUM_PARTIAL;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

	/* Update sequence range on original skb. */
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

	/* Merge over control information. This moves PSH/FIN etc. over */
	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;

	/* All done, get rid of second SKB and account for it so
	 * packet counting does not break.
	 */
	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;

	/* changed transmit queue under us so clear hints */
	tcp_clear_retrans_hints_partial(tp);
	if (next_skb == tp->retransmit_skb_hint)
		tp->retransmit_skb_hint = skb;
	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));

	sk_wmem_free_skb(sk, next_skb);
}

/* Check if coalescing SKBs is legal. */
static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_skb_pcount(skb) > 1)
		return false;
	/* TODO: SACK collapsing could be used to remove this condition */
	if (skb_shinfo(skb)->nr_frags != 0)
		return false;
	if (skb_cloned(skb))
		return false;
	if (skb == tcp_send_head(sk))
		return false;
	/* Some heuristics for collapsing over SACK'd could be invented */
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		return false;

	return true;
}

/* Collapse packets in the retransmit queue to create fewer packets
 * on the wire. This is only done on retransmission.
 */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
				     int space)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = to, *tmp;
	bool first = true;

	if (!sysctl_tcp_retrans_collapse)
		return;
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		return;

	tcp_for_write_queue_from_safe(skb, tmp, sk) {
		if (!tcp_can_collapse(sk, skb))
			break;

		space -= skb->len;

		if (first) {
			first = false;
			continue;
		}

		if (space < 0)
			break;
		/* Punt if not enough space exists in the first SKB for
		 * the data in the second
		 */
		if (skb->len > skb_availroom(to))
			break;

		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
			break;

		tcp_collapse_retrans(sk, to);
	}
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int cur_mss;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size) {
		icsk->icsk_mtup.probe_size = 0;
	}

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;

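	/* Part of this skb may already have been acked (snd_una has advanced
	 * past its start); trim the acked prefix before retransmitting.
	 */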
	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	cur_mss = tcp_current_mss(sk);
	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
			return -ENOMEM; /* We'll try again later. */
	} else {
		int oldpcount = tcp_skb_pcount(skb);

		if (unlikely(oldpcount > 1)) {
			tcp_init_tso_segs(sk, skb, cur_mss);
			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
		}
	}

	tcp_retrans_try_collapse(sk, skb, cur_mss);

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if (skb->len > 0 &&
	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		if (!pskb_trim(skb, 0)) {
			/* Reuse, even though it does some unnecessary work */
			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
					     TCP_SKB_CB(skb)->tcp_flags);
			skb->ip_summed = CHECKSUM_NONE;
		}
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	/* make sure skb->data is aligned on arches that require it */
	if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
						   GFP_ATOMIC);
		return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
			      -ENOBUFS;
	} else {
		return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	}
}

int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err = __tcp_retransmit_skb(sk, skb);

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);

		tp->total_retrans++;

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
			net_dbg_ratelimited("retrans_out leaked\n");
		}
#endif
		if (!tp->retrans_out)
			tp->lost_retrans_low = tp->snd_nxt;
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans += tcp_skb_pcount(skb);

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}

/* Check if forward retransmits are possible in the current
 * window/congestion state.
 */
static bool tcp_can_forward_retransmit(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Forward retransmissions are possible only during Recovery. */
	if (icsk->icsk_ca_state != TCP_CA_Recovery)
		return false;

	/* No forward retransmissions in Reno are possible. */
	if (tcp_is_reno(tp))
		return false;

	 * Yeah, we have to make a difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send. In the other cases, follow rule 3 for
	 * NextSeg() specified in RFC3517.
	 */

	if (tcp_may_send_now(sk))
		return false;
	return true;
Linus Torvalds's avatar
Linus Torvalds committed
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	struct sk_buff *hole = NULL;
	u32 last_lost;
	int mib_idx;
	int fwd_rexmitting = 0;

	if (!tp->packets_out)
		return;

	if (!tp->lost_out)
		tp->retransmit_high = tp->snd_una;

	if (tp->retransmit_skb_hint) {
		skb = tp->retransmit_skb_hint;
		last_lost = TCP_SKB_CB(skb)->end_seq;
		if (after(last_lost, tp->retransmit_high))
			last_lost = tp->retransmit_high;
	} else {
		skb = tcp_write_queue_head(sk);
		last_lost = tp->snd_una;
	}

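	/* 'hole' remembers the first skb that is neither lost nor SACKed, so
	 * a later switch to forward retransmission can back up to it.
	 */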
	tcp_for_write_queue_from(skb, sk) {
		__u8 sacked = TCP_SKB_CB(skb)->sacked;

		if (skb == tcp_send_head(sk))
			break;
		/* we could do better than to assign each time */
		if (hole == NULL)
			tp->retransmit_skb_hint = skb;

		/* Assume this retransmit will generate
		 * only one packet for congestion window
		 * calculation purposes.  This works because
		 * tcp_retransmit_skb() will chop up the
		 * packet to be MSS sized and all the
		 * packet counting works out.
		 */
		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			return;

		if (fwd_rexmitting) {
begin_fwd:
			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
				break;
			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
			tp->retransmit_high = last_lost;
			if (!tcp_can_forward_retransmit(sk))
				break;
			/* Backtrack if necessary to non-L'ed skb */
			if (hole != NULL) {
				skb = hole;
				hole = NULL;
			}
			fwd_rexmitting = 1;
			goto begin_fwd;

		} else if (!(sacked & TCPCB_LOST)) {
			if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
				hole = skb;
			continue;

		} else {
			last_lost = TCP_SKB_CB(skb)->end_seq;
			if (icsk->icsk_ca_state != TCP_CA_Loss)
				mib_idx = LINUX_MIB_TCPFASTRETRANS;
			else
				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
		}

		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		if (tcp_retransmit_skb(sk, skb)) {
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
			return;
		}
		NET_INC_STATS_BH(sock_net(sk), mib_idx);

		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += tcp_skb_pcount(skb);

		if (skb == tcp_write_queue_head(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  inet_csk(sk)->icsk_rto,
						  TCP_RTO_MAX);
	}
}

/* Send a FIN.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = tcp_write_queue_tail(sk);
	int mss_now;
	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if (tcp_send_head(sk) != NULL) {
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb_fclone(MAX_TCP_HEADER,
					       sk->sk_allocation);
			if (skb)
				break;
			yield();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
}

/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
			     TCPHDR_ACK | TCPHDR_RST);
	/* Send it off. */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
}

/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	skb = tcp_write_queue_head(sk);
	if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
		pr_debug("%s: wrong queue state\n", __func__);
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			tcp_unlink_write_queue(skb, sk);
			skb_header_release(nskb);
			__tcp_add_write_queue_head(sk, nskb);
			sk_wmem_free_skb(sk, skb);
			sk->sk_wmem_queued += nskb->truesize;
			sk_mem_charge(sk, nskb->truesize);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
		TCP_ECN_send_synack(tcp_sk(sk), skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}

/**
 * tcp_make_synack - Prepare a SYN-ACK.
 * sk: listener socket
 * dst: dst entry attached to the SYNACK
 * req: request_sock pointer
 * rvp: request_values pointer
 *
 * Allocate one skb and build a SYNACK packet.
 * @dst is consumed : Caller should not use it again.
 */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				struct request_sock *req,
				struct request_values *rvp,
				struct tcp_fastopen_cookie *foc)
{
	struct tcp_out_options opts;
	struct tcp_extend_values *xvp = tcp_xv(rvp);
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_cookie_values *cvp = tp->cookie_values;
	struct tcphdr *th;
	struct sk_buff *skb;
	struct tcp_md5sig_key *md5;
	int tcp_header_size;
	int mss;
	int s_data_desired = 0;

	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
		s_data_desired = cvp->s_data_desired;
	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
			sk_gfp_atomic(sk, GFP_ATOMIC));
	if (unlikely(!skb)) {
		dst_release(dst);
		return NULL;
	}
	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb_dst_set(skb, dst);

	mss = dst_metric_advmss(dst);
	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
		mss = tp->rx_opt.user_mss;

	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

		/* limit the window selection if the user enforces a smaller rx buffer */
		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
			req->window_clamp = tcp_full_space(sk);

		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			ireq->wscale_ok,
			&rcv_wscale,
			dst_metric(dst, RTAX_INITRWND));
		ireq->rcv_wscale = rcv_wscale;
	}

	memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
	if (unlikely(req->cookie_ts))
		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
	else
#endif
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_header_size = tcp_synack_options(sk, req, mss,
					     skb, &opts, &md5, xvp, foc)
			+ sizeof(*th);

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	th = tcp_hdr(skb);
	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	th->source = ireq->loc_port;
	th->dest = ireq->rmt_port;
	/* Setting of flags is superfluous here for callers (and ECE is
	 * not even correctly set)
	 */
	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
			     TCPHDR_SYN | TCPHDR_ACK);

	if (OPTION_COOKIE_EXTENSION & opts.options) {
		if (s_data_desired) {
			u8 *buf = skb_put(skb, s_data_desired);

			/* copy data directly from the listening socket. */
			memcpy(buf, cvp->s_data_payload, s_data_desired);
			TCP_SKB_CB(skb)->end_seq += s_data_desired;
		}

		if (opts.hash_size > 0) {
			__u32 workspace[SHA_WORKSPACE_WORDS];
			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];

			/* Secret recipe depends on the Timestamp, (future)
			 * Sequence and Acknowledgment Numbers, Initiator
			 * Cookie, and others handled by IP variant caller.
			 */
			*tail-- ^= opts.tsval;
			*tail-- ^= tcp_rsk(req)->rcv_isn + 1;
			*tail-- ^= TCP_SKB_CB(skb)->seq + 1;

			/* recommended */
			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */

			sha_transform((__u32 *)&xvp->cookie_bakery[0],
				      (char *)mess,
				      &workspace[0]);
			opts.hash_location =
				(__u8 *)&xvp->cookie_bakery[0];
		}
	}

	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	/* XXX data is queued and acked as is. No buffer/window check */
	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rcv_wnd, 65535U));
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	th->doff = (tcp_header_size >> 2);
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));

#ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
	if (md5) {
		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
					       md5, NULL, req, skb);
	}
#endif

	return skb;
}
EXPORT_SYMBOL(tcp_make_synack);

/* Do all connect socket setups that can be done AF independent. */
void tcp_connect_init(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

#ifdef CONFIG_TCP_MD5SIG
	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	tp->max_window = 0;
	tcp_mtup_init(sk);
	tcp_sync_mss(sk, dst_mtu(dst));

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = dst_metric_advmss(dst);
	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
		tp->advmss = tp->rx_opt.user_mss;

	tcp_initialize_rcv_mss(sk);

	/* limit the window selection if the user enforces a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
		tp->window_clamp = tcp_full_space(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &rcv_wscale,
				  dst_metric(dst, RTAX_INITRWND));

	tp->rx_opt.rcv_wscale = rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->snd_up = tp->write_seq;
	tp->snd_nxt = tp->write_seq;

	if (likely(!tp->repair))
		tp->rcv_nxt = 0;
	tp->rcv_wup = tp->rcv_nxt;
	tp->copied_seq = tp->rcv_nxt;