Newer
Older
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
* this ACK acks real data, we will erroneously exit
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
if (!fully_acked)
break;
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
tp->lost_skb_hint = NULL;
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
(flag & FLAG_ACKED))
tcp_rearm_rto(sk);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
}
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
prior_sacked - tp->sacked_out;
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
/* Is the ACK triggering packet unambiguous? */
if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
/* High resolution needed and available? */
if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
!ktime_equal(last_ackt,
net_invalid_timestamp()))
rtt_us = ktime_us_delta(ktime_get_real(),
last_ackt);
else if (ca_seq_rtt >= 0)
rtt_us = jiffies_to_usecs(ca_seq_rtt);
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
if (!tp->packets_out && tcp_is_sack(tp)) {
pr_debug("Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
pr_debug("Leak s=%u %d\n",
tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
pr_debug("Leak r=%u %d\n",
tp->retrans_out, icsk->icsk_ca_state);
}
static void tcp_ack_probe(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,

Arnaldo Carvalho de Melo
committed
min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
TCP_RTO_MAX);
static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
/* Decide whether to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
if (tcp_in_cwnd_reduction(sk))
return false;
/* If reordering is high then always grow cwnd whenever data is
* delivered regardless of its ordering. Otherwise stay conservative
* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
return flag & FLAG_FORWARD_PROGRESS;
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
u32 ack_seq)
struct tcp_sock *tp = tcp_sk(sk);
u32 nwin = ntohs(tcp_hdr(skb)->window);
if (likely(!tcp_hdr(skb)->syn))
nwin <<= tp->rx_opt.snd_wscale;
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tcp_fast_path_check(sk);
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
tp->snd_una = ack;
return flag;
}
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk)
{
	/* Function-local statics: one counter and one timestamp shared by
	 * every caller.  They are updated without locking; a lost update
	 * only makes the limit slightly imprecise.
	 */
	static u32 challenge_timestamp;
	static unsigned int challenge_count;
	const u32 cur_sec = jiffies / HZ;

	/* A change in jiffies / HZ starts a fresh counting period. */
	if (cur_sec != challenge_timestamp) {
		challenge_timestamp = cur_sec;
		challenge_count = 0;
	}

	/* Count this attempt; stay silent once the sysctl limit is exceeded. */
	challenge_count++;
	if (challenge_count > sysctl_tcp_challenge_ack_limit)
		return;

	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
	tcp_send_ack(sk);
}
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
/* Latch the last received timestamp value (rcv_tsval) as ts_recent and
 * record the current get_seconds() reading alongside it.
 */
static void tcp_store_ts_recent(struct tcp_sock *tp)
{
	tp->rx_opt.ts_recent_stamp = get_seconds();
	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
}
/* Update ts_recent from the segment starting at @seq, provided a timestamp
 * option was seen, the segment does not start after rcv_wup, and the
 * zero-window PAWS check accepts it.
 */
static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
	/* Nothing to store when the segment carried no timestamp. */
	if (!tp->rx_opt.saw_tstamp)
		return;

	/* Only segments starting at or before rcv_wup may refresh ts_recent. */
	if (after(seq, tp->rcv_wup))
		return;

	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
	 * extra check below makes sure this can only happen
	 * for pure ACK frames. -DaveM
	 *
	 * Not only, also it occurs for expired timestamps.
	 */
	if (tcp_paws_check(&tp->rx_opt, 0))
		tcp_store_ts_recent(tp);
}
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
/* This routine deals with acks during a TLP episode.
* Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
*/
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Flag bits that disqualify an ACK from counting as a TLP dupack. */
	const int not_dupack_mask = FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP |
				    FLAG_DATA_SACKED;
	bool is_tlp_dupack;

	is_tlp_dupack = ack == tp->tlp_high_seq && !(flag & not_dupack_mask);

	/* The TLP episode continues unless we got the TLP dupack or the
	 * ack moved past tlp_high_seq.
	 */
	if (!is_tlp_dupack && !after(ack, tp->tlp_high_seq))
		return;

	/* Either way the episode is over. */
	tp->tlp_high_seq = 0;

	/* No cwnd reduction for the TLP dupack itself, nor when a DSACK
	 * arrives for the TLP retransmission.
	 */
	if (is_tlp_dupack || (flag & FLAG_DSACKING_ACK))
		return;

	/* Enter CWR, finish the reduction at once and drop back to Open. */
	tcp_init_cwnd_reduction(sk, true);
	tcp_set_ca_state(sk, TCP_CA_CWR);
	tcp_end_cwnd_reduction(sk);
	tcp_set_ca_state(sk, TCP_CA_Open);
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY);
}
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
u32 prior_fackets;
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
/* If the ack is older than previous acks then we can probably ignore it. */
if (before(ack, prior_snd_una)) {
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
tcp_send_challenge_ack(sk);
return -1;
}
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
goto invalid_ack;
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
*/
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
tcp_ca_event(sk, CA_EVENT_FAST_ACK);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_time_stamp;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
dst_confirm(dst);
}
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
tcp_update_pacing_rate(sk);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (tcp_send_head(sk))
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
invalid_ack:
SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return -1;
/* If data was SACKed, tag it and see if we should send more data.
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
void tcp_parse_options(const struct sk_buff *skb,
struct tcp_options_received *opt_rx, int estab,
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
if (opsize > length)
return; /* don't parse partial options */
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS && th->syn && !estab) {
u16 in_mss = get_unaligned_be16(ptr);
if (in_mss) {
if (opt_rx->user_mss &&
opt_rx->user_mss < in_mss)
in_mss = opt_rx->user_mss;
opt_rx->mss_clamp = in_mss;
}
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn &&
!estab && sysctl_tcp_window_scaling) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
__func__,
snd_wscale);
opt_rx->snd_wscale = snd_wscale;
}
break;
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) &&
((estab && opt_rx->tstamp_ok) ||
(!estab && sysctl_tcp_timestamps))) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
!estab && sysctl_tcp_sack) {
opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
break;
case TCPOPT_SACK:
if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
opt_rx->sack_ok) {
TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
}
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
/*
* The MD5 Hash has already been
* checked (see tcp_v{4,6}_do_rcv()).
*/
break;
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number. It's valid only in
* SYN or SYN-ACK with an even size.
*/
if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
foc == NULL || !th->syn || (opsize & 1))
break;
foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
foc->len <= TCP_FASTOPEN_COOKIE_MAX)
memcpy(foc->val, ptr + 2, foc->len);
else if (foc->len != 0)
foc->len = -1;
break;
}
ptr += opsize-2;
length -= opsize;
static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
const __be32 *ptr = (const __be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
if (*ptr)
tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
else
tp->rx_opt.rcv_tsecr = 0;
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
static bool tcp_fast_parse_options(const struct sk_buff *skb,
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
#ifdef CONFIG_TCP_MD5SIG
/*
 * Parse MD5 Signature option
 *
 * Walks the option bytes that follow the fixed TCP header @th and returns
 * a pointer to the payload of the first TCPOPT_MD5SIG option whose length
 * byte equals TCPOLEN_MD5SIG.  Returns NULL when no such option exists,
 * when an EOL option is met first, or when an option length is malformed
 * (smaller than 2 or larger than the bytes that remain).
 */
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);

	/* A valid MD5 option needs TCPOLEN_MD5SIG bytes, so stop as soon as
	 * fewer remain.  This also guarantees that the length byte read in
	 * the default case below lies inside the option area; looping while
	 * "length > 0" would fetch it one byte past the end when a single
	 * byte is left.
	 */
	while (length >= TCPOLEN_MD5SIG) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return NULL;
		case TCPOPT_NOP:
			/* Single-byte option: no length field follows. */
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return NULL;
			if (opcode == TCPOPT_MD5SIG)
				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
		}
		/* Skip the rest of this option (kind and length already consumed). */
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
* it can pass through stack. So, the following predicate verifies that
* this segment is not used for anything but congestion avoidance or
* fast retransmit. Moreover, we even are able to eliminate most of such
* second order effects, if we apply some small "replay" window (~RTO)
* to timestamp space.
*
* All these measures still do not guarantee that we reject wrapped ACKs
* on networks with high bandwidth, when sequence space is recycled quickly,
* but it guarantees that such events will be very rare and do not affect
* connection seriously. This doesn't look nice, but alas, PAWS is really
* buggy extension.
*
* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
* states that events when retransmit arrives after original data are rare.
* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
* the biggest problem on large power networks even with minor reordering.
* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
return (/* 1. Pure ACK with correct sequence number. */
(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
/* 2. ... and duplicate ACK. */
ack == tp->snd_una &&
/* 3. ... and does not update window. */
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
static inline bool tcp_paws_discard(const struct sock *sk,
const struct tcp_sock *tp = tcp_sk(sk);
return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
!tcp_disordered_ack(sk, skb);
}
/* Check segment sequence number for validity.
*
* Segment controls are considered valid, if the segment
* fits to the window after truncation to the window. Acceptability
* of data (and SYN, FIN, of course) is checked separately.
* See tcp_data_queue(), for example.
*
* Also, controls (RST is main one) are accepted using RCV.WUP instead
* of RCV.NXT. Peer still did not advance his SND.UNA when we
* delayed ACK, so that his SND.UNA <= our RCV.WUP.
* (borrowed from freebsd)
*/
static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
	/* A segment that ends before RCV.WUP is not acceptable. */
	if (before(end_seq, tp->rcv_wup))
		return false;

	/* Acceptable unless it starts after RCV.NXT plus the receive window. */
	return !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}
/* When we get a reset we do this. */
void tcp_reset(struct sock *sk)
{
	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
		sk->sk_err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->sk_err = EPIPE;
		break;
	case TCP_CLOSE:
		/* No error is set and nothing further is done in this state. */
		return;
	default:
		sk->sk_err = ECONNRESET;
		break;
	}
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();

	/* Report the error through the socket callback unless SOCK_DEAD is set. */
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_error_report(sk);

	tcp_done(sk);
}
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
* space. Not before when we get holes.
*
* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
* (and thence onto LAST-ACK and finally, CLOSE, we never enter
* TIME-WAIT)
*
* If we are in FINWAIT-1, a received FIN indicates simultaneous
* close and we go into CLOSING (and later onto TIME-WAIT)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
static void tcp_fin(struct sock *sk)
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
dst = __sk_dst_get(sk);
if (!dst || !dst_metric(dst, RTAX_QUICKACK))
inet_csk(sk)->icsk_ack.pingpong = 1;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/* Received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*/
tcp_send_ack(sk);
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
__skb_queue_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
int mib_idx;
mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
}
}
static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
struct tcp_sock *tp = tcp_sk(sk);
tcp_dsack_set(sk, seq, end_seq);
else
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
/* Send an ACK in response to @skb.  When the segment carries sequence
 * space and starts before rcv_nxt, first bump the DELAYEDACKLOST counter,
 * enter quickack mode and, if SACK and the dsack sysctl are enabled,
 * record a D-SACK for the part of the segment that lies below rcv_nxt.
 */
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const u32 seg_start = TCP_SKB_CB(skb)->seq;
	const u32 seg_end = TCP_SKB_CB(skb)->end_seq;

	if (seg_end != seg_start && before(seg_start, tp->rcv_nxt)) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_enter_quickack_mode(sk);

		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
			/* Clamp the reported range so it never extends past rcv_nxt. */
			u32 dsack_end = after(seg_end, tp->rcv_nxt) ?
					tp->rcv_nxt : seg_end;

			tcp_dsack_set(sk, seg_start, dsack_end);
		}
	}

	tcp_send_ack(sk);
}
/* These routines update the SACK block as out-of-order packets arrive or
* in-order packets close up the sequence space.
*/
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
	struct tcp_sack_block *head = &tp->selective_acks[0];
	int idx = 1;

	/* Try to fold every later block into the head block.  A block that
	 * tcp_sack_extend() absorbs is removed from the array, so the same
	 * index is examined again on the next pass.
	 */
	while (idx < tp->rx_opt.num_sacks) {
		struct tcp_sack_block *cand = &head[idx];
		int k;

		if (!tcp_sack_extend(head, cand->start_seq, cand->end_seq)) {
			idx++;
			continue;
		}

		/* Absorbed: close the gap by moving later blocks up one slot. */
		tp->rx_opt.num_sacks--;
		for (k = idx; k < tp->rx_opt.num_sacks; k++)
			head[k] = head[k + 1];
	}
}
/* Record the out-of-order range [@seq, @end_seq) in the SACK block array.
 *
 * If an existing block can be extended to cover the range, that block is
 * moved to the head of the array and then coalesced with its neighbours.
 * Otherwise a new block is inserted at the head, pushing the others down
 * one slot and dropping the last one when the array is already full.
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int cur_sacks = tp->rx_opt.num_sacks;
	int this_sack;

	if (!cur_sacks)
		goto new_sack;

	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
		if (tcp_sack_extend(sp, seq, end_seq)) {
			/* Rotate this_sack to the first one. */
			for (; this_sack > 0; this_sack--, sp--) {
				struct tcp_sack_block tmp = *sp;

				*sp = *(sp - 1);
				*(sp - 1) = tmp;
			}
			/* The grown head block may now touch other blocks. */
			if (cur_sacks > 1)
				tcp_sack_maybe_coalesce(tp);
			return;
		}
	}

	/* Could not find an adjacent existing SACK, build a new one,
	 * put it at the front, and shift everyone else down.  We
	 * always know there is at least one SACK present already here.
	 *
	 * If the sack array is full, forget about the last one.
	 */
	if (this_sack >= TCP_NUM_SACKS) {
		this_sack--;
		tp->rx_opt.num_sacks--;
		sp--;
	}
	/* Shift blocks [0, this_sack) down by one slot; sp ends at slot 0. */
	for (; this_sack > 0; this_sack--, sp--)
		*sp = *(sp - 1);

new_sack:
	/* Build the new head SACK, and we're done. */
	sp->start_seq = seq;
	sp->end_seq = end_seq;
	tp->rx_opt.num_sacks++;
}
/* RCV.NXT advances, some SACKs should be eaten. */
static void tcp_sack_remove(struct tcp_sock *tp)
{
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	int num_sacks = tp->rx_opt.num_sacks;
	int this_sack;

	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
	if (skb_queue_empty(&tp->out_of_order_queue)) {
		tp->rx_opt.num_sacks = 0;
		return;
	}

	for (this_sack = 0; this_sack < num_sacks;) {
		/* Check if the start of the sack is covered by RCV.NXT. */
		if (!before(tp->rcv_nxt, sp->start_seq)) {
			int i;

			/* RCV.NXT must cover all the block! */
			WARN_ON(before(tp->rcv_nxt, sp->end_seq));

			/* Zap this SACK, by moving forward any other SACKS. */
			for (i = this_sack + 1; i < num_sacks; i++)
				tp->selective_acks[i - 1] = tp->selective_acks[i];
			num_sacks--;
			/* Re-examine the same slot: it now holds the next block. */
			continue;
		}
		this_sack++;
		sp++;
	}
	tp->rx_opt.num_sacks = num_sacks;
}
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
static void tcp_ofo_queue(struct sock *sk)