Newer
Older
goto rearm_timer;
if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
goto rearm_timer;
skb = tcp_write_queue_tail(sk);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
/* Probe with zero data doesn't trigger fast recovery. */
if (skb->len > 0)
err = __tcp_retransmit_skb(sk, skb);
/* Record snd_nxt for loss detection. */
if (likely(!err))
tp->tlp_high_seq = tp->snd_nxt;
rearm_timer:
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
if (likely(!err))
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPLOSSPROBES);
return;
/* Push out any pending frames which were held back due to
* TCP_CORK or attempt at coalescing tiny packets.
* The socket must be locked by the caller.
*/
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and
* all will be happy.
*/
if (unlikely(sk->sk_state == TCP_CLOSE))
return;

Mel Gorman
committed
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
sk_gfp_atomic(sk, GFP_ATOMIC)))
}
/* Send _single_ skb sitting at the send head. This function requires
* true push pending frames to setup probe timer etc.
*/
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
struct sk_buff *skb = tcp_send_head(sk);
BUG_ON(!skb || skb->len < mss_now);
tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
/* This function returns the amount that we can raise the
* usable window based on the following constraints
* 1. The window can never be shrunk once it is offered (RFC 793)
* 2. We limit memory per socket
*
* RFC 1122:
* "the suggested [SWS] avoidance algorithm for the receiver is to keep
* RECV.NEXT + RCV.WIN fixed until:
* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
*
* i.e. don't raise the right edge of the window until you can raise
* it at least MSS bytes.
*
* Unfortunately, the recommended algorithm breaks header prediction,
* since header prediction assumes th->window stays fixed.
*
* Strictly speaking, keeping th->window fixed violates the receiver
* side SWS prevention criteria. The problem is that under this rule
* a stream of single byte packets will cause the right side of the
* window to always advance by a single byte.
* Of course, if the sender implements sender side SWS prevention
* then this will not be a problem.
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
* If the free space is less than the 1/4 of the maximum
* space available and the free space is less than 1/2 mss,
* then set the window to 0.
* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
* Otherwise, just prevent the window from shrinking
* and from being larger than the largest representable value.
*
* This prevents incremental opening of the window in the regime
* where TCP is limited by the speed of the reader side taking
* data out of the TCP receive queue. It does nothing about
* those cases where the window is constrained on the sender side
* because the pipeline is full.
*
* BSD also seems to "accidentally" limit itself to windows that are a
* multiple of MSS, at least until the free space gets quite small.
* This would appear to be a side effect of the mbuf implementation.
* Combining these two algorithms results in the observed behavior
* of having a fixed window size at almost all times.
*
* Below we obtain similar behavior by forcing the offered window to
* a multiple of the mss when it is feasible to do so.
*
* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
* Regular options like TIMESTAMP are taken into account.
*/
u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
int window;
if (mss > full_space)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
if (sk_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
if (free_space < mss)
return 0;
}
if (free_space > tp->rcv_ssthresh)
free_space = tp->rcv_ssthresh;
/* Don't do rounding if we are using window scaling, since the
* scaled window will not line up with the MSS boundary anyway.
*/
window = tp->rcv_wnd;
if (tp->rx_opt.rcv_wscale) {
window = free_space;
/* Advertise enough space so that it won't get scaled away.
* Import case: prevent zero window announcement if
* 1<<rcv_wscale > mss.
*/
if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
window = (((window >> tp->rx_opt.rcv_wscale) + 1)
<< tp->rx_opt.rcv_wscale);
} else {
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
* If our current window offering is within 1 mss of the
* free space we just keep it. This prevents the divide
* and multiply from happening most of the time.
* We also don't do any window rounding when the free space
* is too small.
*/
if (window <= free_space - mss || window > free_space)
window = (free_space / mss) * mss;
else if (mss == full_space &&
free_space > window + (full_space >> 1))
window = free_space;
/* Collapses two adjacent SKB's during retransmission. */
static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
int skb_size, next_skb_size;
skb_size = skb->len;
next_skb_size = next_skb->len;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
tcp_highest_sack_combine(sk, next_skb, skb);
tcp_unlink_write_queue(next_skb, sk);
skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
next_skb_size);
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
if (skb->ip_summed != CHECKSUM_PARTIAL)
skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
/* Update sequence range on original skb. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
/* Merge over control information. This moves PSH/FIN etc. over */
TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
*/
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
if (next_skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = skb;
tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
sk_wmem_free_skb(sk, next_skb);
/* Check if coalescing SKBs is legal. */
static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
/* TODO: SACK collapsing could be used to remove this condition */
if (skb_shinfo(skb)->nr_frags != 0)
if (skb == tcp_send_head(sk))
/* Some heurestics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
/* Collapse packets in the retransmit queue to make to create
* less packets on the wire. This is only done on retransmission.
*/
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
int space)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = to, *tmp;
if (!sysctl_tcp_retrans_collapse)
return;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
tcp_for_write_queue_from_safe(skb, tmp, sk) {
if (!tcp_can_collapse(sk, skb))
break;
space -= skb->len;
if (first) {
continue;
}
if (space < 0)
break;
/* Punt if not enough space exists in the first SKB for
* the data in the second
*/
if (skb->len > skb_availroom(to))
break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
tcp_collapse_retrans(sk, to);
}
}
/* This retransmits one SKB. Policy decisions and retransmit queue
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
unsigned int cur_mss;
/* Inconslusive MTU probe */
if (icsk->icsk_mtup.probe_size) {
icsk->icsk_mtup.probe_size = 0;
}
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
return -EAGAIN;
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return -ENOMEM;
}
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
* our retransmit serves as a zero window probe.
*/
if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
TCP_SKB_CB(skb)->seq != tp->snd_una)
if (tcp_fragment(sk, skb, cur_mss, cur_mss))
int oldpcount = tcp_skb_pcount(skb);
if (unlikely(oldpcount > 1)) {
tcp_init_tso_segs(sk, skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
tcp_retrans_try_collapse(sk, skb, cur_mss);
/* Some Solaris stacks overoptimize and ignore the FIN on a
* retransmit when old data is attached. So strip it off
* since it is cheap to do so and saves bytes on the network.
*/
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
/* Reuse, even though it does some unnecessary work */
tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
skb->ip_summed = CHECKSUM_NONE;
}
}
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
*/
if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
GFP_ATOMIC);
return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
int err = __tcp_retransmit_skb(sk, skb);
if (err == 0) {
/* Update global TCP statistics. */
TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
tp->total_retrans++;
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
net_dbg_ratelimited("retrans_out leaked\n");
if (!tp->retrans_out)
tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
/* Save stamp of the first retransmit. */
if (!tp->retrans_stamp)
tp->retrans_stamp = TCP_SKB_CB(skb)->when;
tp->undo_retrans += tcp_skb_pcount(skb);
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
*/
TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
}
return err;
}
/* Check if we forward retransmits are possible in the current
* window/congestion state.
*/
static bool tcp_can_forward_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
/* Forward retransmissions are possible only during Recovery. */
if (icsk->icsk_ca_state != TCP_CA_Recovery)
/* No forward retransmissions in Reno are possible. */
if (tcp_is_reno(tp))
/* Yeah, we have to make difficult choice between forward transmission
* and retransmission... Both ways have their merits...
*
* For now we do not retransmit anything, while we have some new
* segments to send. In the other cases, follow rule 3 for
* NextSeg() specified in RFC3517.
*/
if (tcp_may_send_now(sk))
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
* we've sent it all or the congestion window limit is reached.
* If doing SACK, the first ACK which comes back for a timeout
* based retransmit packet might feed us FACK information again.
* If so, we use it to avoid unnecessarily retransmissions.
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (!tp->packets_out)
return;
if (!tp->lost_out)
tp->retransmit_high = tp->snd_una;
if (tp->retransmit_skb_hint) {
last_lost = TCP_SKB_CB(skb)->end_seq;
if (after(last_lost, tp->retransmit_high))
last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
last_lost = tp->snd_una;
}
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
if (skb == tcp_send_head(sk))
break;
/* we could do better than to assign each time */
if (hole == NULL)
tp->retransmit_skb_hint = skb;
/* Assume this retransmit will generate
* only one packet for congestion window
* calculation purposes. This works because
* tcp_retransmit_skb() will chop up the
* packet to be MSS sized and all the
* packet counting works out.
*/
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return;
if (fwd_rexmitting) {
begin_fwd:
if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
break;
mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
tp->retransmit_high = last_lost;
if (!tcp_can_forward_retransmit(sk))
break;
/* Backtrack if necessary to non-L'ed skb */
if (hole != NULL) {
skb = hole;
hole = NULL;
}
fwd_rexmitting = 1;
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
last_lost = TCP_SKB_CB(skb)->end_seq;
if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS;
else
mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
}
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
if (tcp_retransmit_skb(sk, skb)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
if (skb == tcp_write_queue_head(sk))

Arnaldo Carvalho de Melo
committed
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
}
}
/* Send a fin. The caller locks the socket for us. This cannot be
* allowed to fail queueing a FIN frame under any circumstances.
*/
void tcp_send_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_write_queue_tail(sk);
/* Optimization, tack on the FIN if we have a queue of
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
if (tcp_send_head(sk) != NULL) {
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
skb = alloc_skb_fclone(MAX_TCP_HEADER,
sk->sk_allocation);
if (skb)
break;
yield();
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
* an explicit close() or as a byproduct of exit()'ing) and there
* was unread data in the receive queue. This behavior is recommended
* by RFC 2525, section 2.17. -DaveM
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
struct sk_buff *skb;
/* NOTE: No TCP options attached and we never retransmit this. */
skb = alloc_skb(MAX_TCP_HEADER, priority);
if (!skb) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
/* Send it off. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
/* Send a crossed SYN-ACK during socket establishment.
* WARNING: This routine must only be called when we have already sent
* a SYN packet that crossed the incoming SYN that caused this routine
* to get called. If this assumption fails then the initial rcv_wnd
* and rcv_wscale values will not be correct.
*/
int tcp_send_synack(struct sock *sk)
{
skb = tcp_write_queue_head(sk);
if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_debug("%s: wrong queue state\n", __func__);
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
if (nskb == NULL)
return -ENOMEM;
tcp_unlink_write_queue(skb, sk);
__tcp_add_write_queue_head(sk, nskb);
sk_wmem_free_skb(sk, skb);
sk->sk_wmem_queued += nskb->truesize;
sk_mem_charge(sk, nskb->truesize);
TCP_ECN_send_synack(tcp_sk(sk), skb);
}
TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
/**
* tcp_make_synack - Prepare a SYN-ACK.
* sk: listener socket
* dst: dst entry attached to the SYNACK
* req: request_sock pointer
*
* Allocate one skb and build a SYNACK packet.
* @dst is consumed : Caller should not use it again.
*/
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc)
struct tcp_out_options opts;

Arnaldo Carvalho de Melo
committed
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
int tcp_header_size;
skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC));
if (unlikely(!skb)) {
dst_release(dst);
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
mss = tp->rx_opt.user_mss;
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
__u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
(req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
req->window_clamp = tcp_full_space(sk);
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rcv_wnd,
&req->window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
}
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
else
#endif
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
th = tcp_hdr(skb);
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
TCP_ECN_make_synack(req, th);

Arnaldo Carvalho de Melo
committed
th->dest = ireq->rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
* not even correctly set)
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
/* XXX data is queued and acked as is. No buffer/window check */
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), tp, &opts);
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
if (md5) {
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
/* Do all connect socket setups that can be done AF independent. */
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
tp->tcp_header_len = sizeof(struct tcphdr) +
(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
#ifdef CONFIG_TCP_MD5SIG
if (tp->af_specific->md5_lookup(sk, sk) != NULL)
tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->rx_opt.user_mss)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
tp->max_window = 0;
tcp_sync_mss(sk, dst_mtu(dst));
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
tp->advmss = tp->rx_opt.user_mss;
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
tp->window_clamp = tcp_full_space(sk);
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
if (likely(!tp->repair))
tp->rcv_nxt = 0;
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
inet_csk(sk)->icsk_retransmits = 0;
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
tcb->end_seq += skb->len;
skb_header_release(skb);
__tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
tp->write_seq = tcb->end_seq;
tp->packets_out += tcp_skb_pcount(skb);
}
/* Build and send a SYN with data and (cached) Fast Open cookie. However,
* queue a data-only packet after the regular SYN, such that regular SYNs
* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
* only the SYN sequence, the data are retransmitted in the first ACK.
* If cookie is not cached or other error occurs, falls back to send a
* regular SYN with Fast Open cookie request option.
*/
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
struct sk_buff *syn_data = NULL, *data;
unsigned long last_syn_loss = 0;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
&syn_loss, &last_syn_loss);
/* Recurring FO SYN losses: revert to regular handshake temporarily */
if (syn_loss > 1 &&
time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
fo->cookie.len = -1;
goto fallback;
}
if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
fo->cookie.len = -1;
else if (fo->cookie.len <= 0)
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
* user-MSS. Reserve maximum option space for middleboxes that add
* private TCP options. The cost is reduced data space in SYN :(
*/
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
MAX_TCP_OPTION_SPACE;
syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
sk->sk_allocation);
if (syn_data == NULL)
goto fallback;
for (i = 0; i < iovlen && syn_data->len < space; ++i) {
struct iovec *iov = &fo->data->msg_iov[i];
unsigned char __user *from = iov->iov_base;
int len = iov->iov_len;
if (syn_data->len + len > space)
len = space - syn_data->len;
else if (i + 1 == iovlen)
/* No more data pending in inet_wait_for_connect() */
fo->data = NULL;
if (skb_add_data(syn_data, from, len))
goto fallback;
}
/* Queue a data-only packet after the regular SYN for retransmission */
data = pskb_copy(syn_data, sk->sk_allocation);
if (data == NULL)
goto fallback;
TCP_SKB_CB(data)->seq++;
TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
tcp_connect_queue_skb(sk, data);
fo->copied = data->len;
if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
tp->syn_data = (fo->copied > 0);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
goto done;
}
syn_data = NULL;
fallback:
/* Send a regular SYN with Fast Open cookie request option */
if (fo->cookie.len > 0)
fo->cookie.len = 0;
err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
if (err)
tp->syn_fastopen = 0;
kfree_skb(syn_data);
done:
fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
return err;
}
/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
if (unlikely(tp->repair)) {
tcp_finish_connect(sk, NULL);
return 0;
}
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff);
TCP_ECN_send_syn(sk, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
if (err == -ECONNREFUSED)
return err;
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */

Arnaldo Carvalho de Melo
committed
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
*/
void tcp_send_delayed_ack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
if (icsk->icsk_ack.pingpong ||
(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);