Newer
Older
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
* losses and/or application stalls), do not perform any further cwnd
* reductions, but instead slow start up to ssthresh.
*/
static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
int fast_rexmit, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
} else {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
}
sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
if (tcp_is_reno(tp))
mib_idx = LINUX_MIB_TCPRENORECOVERY;
else
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->high_seq = tp->snd_nxt;
tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
TCP_ECN_queue_cwr(tp);
}
tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
tp->prr_out = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
* Besides that it does CWND reduction, when packet loss is detected
* and changes state of machine.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
int newly_acked_sacked, bool is_dupack,
int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
if (WARN_ON(!tp->packets_out && tp->sacked_out))
if (WARN_ON(!tp->sacked_out && tp->fackets_out))
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
if (tcp_check_sack_reneging(sk, flag))
/* C. Check consistency of the current state. */
tcp_verify_left_out(tp);
/* D. Check state exit conditions. State can be terminated
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk))
return;
break;
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
tcp_complete_cwr(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
if (tcp_is_reno(tp))
if (tcp_try_undo_recovery(sk))
tcp_complete_cwr(sk);
switch (icsk->icsk_ca_state) {
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else
do_lost = tcp_try_undo_partial(sk, pkts_acked);
icsk->icsk_retransmits = 0;
if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
if (!tcp_try_undo_loss(sk)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_add_reno_sack(sk);
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk)) {
tcp_try_to_open(sk, flag);
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);
return;
}
tcp_enter_recovery(sk, (flag & FLAG_ECE));
fast_rexmit = 1;
if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
tcp_update_scoreboard(sk, fast_rexmit);
tp->prr_delivered += newly_acked_sacked;
tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
{
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
}
EXPORT_SYMBOL(tcp_valid_rtt_meas);
/* Read draft-ietf-tcplw-high-performance before mucking
static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
*
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
struct tcp_sock *tp = tcp_sk(sk);
tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
* rtt estimates. Also, we must not reset the
* backoff for rto until we get a non-retransmitted
* packet. This allows us to deal with a situation
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
tcp_valid_rtt_meas(sk, seq_rtt);
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tcp_ack_saw_tstamp(sk, flag);
tcp_ack_no_tstamp(sk, seq_rtt, flag);
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
static void tcp_rearm_rto(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
u32 packets_acked;
BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
BUG_ON(tcp_skb_pcount(skb) == 0);
BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
return packets_acked;
/* Remove acknowledged frames from the retransmission queue. If our packet
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
int fully_acked = 1;
u32 reord = tp->packets_out;
u32 prior_sacked = tp->sacked_out;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
/* Determine how many packets and what bytes were acked, tso and else */
if (tcp_skb_pcount(skb) == 1 ||
!after(tp->snd_una, scb->seq))
break;
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
fully_acked = 0;
} else {
acked_pcount = tcp_skb_pcount(skb);
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = -1;
seq_rtt = -1;
if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
flag |= FLAG_NONHEAD_RETRANS_ACKED;
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
* this ACK acks real data, we will erroneously exit
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
if (!fully_acked)
break;
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
tp->scoreboard_skb_hint = NULL;
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
tp->lost_skb_hint = NULL;
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
}
tcp_ack_update_rtt(sk, flag, seq_rtt);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
prior_sacked - tp->sacked_out;
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
/* Is the ACK triggering packet unambiguous? */
if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
/* High resolution needed and available? */
if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
!ktime_equal(last_ackt,
net_invalid_timestamp()))
rtt_us = ktime_us_delta(ktime_get_real(),
last_ackt);
else if (ca_seq_rtt >= 0)
rtt_us = jiffies_to_usecs(ca_seq_rtt);
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
if (!tp->packets_out && tcp_is_sack(tp)) {
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
printk(KERN_DEBUG "Leak r=%u %d\n",
tp->retrans_out, icsk->icsk_ca_state);
}
static void tcp_ack_probe(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,

Arnaldo Carvalho de Melo
committed
min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
TCP_RTO_MAX);
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
static inline int tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
u32 ack_seq)
struct tcp_sock *tp = tcp_sk(sk);
u32 nwin = ntohs(tcp_hdr(skb)->window);
if (likely(!tcp_hdr(skb)->syn))
nwin <<= tp->rx_opt.snd_wscale;
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tcp_fast_path_check(sk);
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
tp->snd_una = ack;
return flag;
}
/* A very conservative spurious RTO response algorithm: reduce cwnd and
* continue in congestion avoidance.
*/
static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_cnt = 0;
TCP_ECN_queue_cwr(tp);
tcp_moderate_cwnd(tp);
}
/* A conservative spurious RTO response algorithm: reduce cwnd using
* rate halving and continue in congestion avoidance.
*/
static void tcp_ratehalving_spur_to_response(struct sock *sk)
{
tcp_enter_cwr(sk, 0);
}
static void tcp_undo_spur_to_response(struct sock *sk, int flag)
tcp_ratehalving_spur_to_response(sk);
else
tcp_undo_cwr(sk, true);
/* F-RTO spurious RTO detection algorithm (RFC4138)
*
* F-RTO affects during two new ACKs following RTO (well, almost, see inline
* comments). State (ACK number) is kept in frto_counter. When ACK advances
* window (but not to or beyond highest sequence sent before RTO):
* On First ACK, send two new segments out.
* On Second ACK, RTO was likely spurious. Do spurious response (response
* algorithm is not part of the F-RTO detection algorithm
* given in RFC4138 but can be selected separately).
* Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
* and TCP falls back to conventional RTO recovery. F-RTO allows overriding
* of Nagle, this is done using frto_counter states 2 and 3, when a new data
* segment of any size sent during F-RTO, state 2 is upgraded to 3.
*
* Rationale: if the RTO was spurious, new ACKs should arrive from the
* original window even after we transmit two new data segments.
*
* SACK version:
* on first step, wait until first cumulative ACK arrives, then move to
* the second step. In second step, the next ACK decides.
*
* F-RTO is implemented (mainly) in four functions:
* - tcp_use_frto() is used to determine if TCP is can use F-RTO
* - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
* called when tcp_use_frto() showed green light
* - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
* - tcp_enter_frto_loss() is called if there is not enough evidence
* to prove that the RTO is indeed spurious. It transfers the control
* from F-RTO to the conventional RTO recovery
*/
static int tcp_process_frto(struct sock *sk, int flag)
tcp_verify_left_out(tp);
/* Duplicate the behavior from Loss state (fastretrans_alert) */
inet_csk(sk)->icsk_retransmits = 0;
if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
tp->undo_marker = 0;
if (!before(tp->snd_una, tp->frto_highmark)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
if (!tcp_is_sackfrto(tp)) {
/* RFC4138 shortcoming in step 2; should also have case c):
* ACK isn't duplicate nor advances window, e.g., opposite dir
* data, winupdate
*/
if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
if (!(flag & FLAG_DATA_ACKED)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
flag);
return 1;
}
} else {
if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
return 1;
}
if ((tp->frto_counter >= 2) &&
(!(flag & FLAG_FORWARD_PROGRESS) ||
((flag & FLAG_DATA_SACKED) &&
!(flag & FLAG_ONLY_ORIG_SACKED)))) {
/* RFC4138 shortcoming (see comment above) */
if (!(flag & FLAG_FORWARD_PROGRESS) &&
(flag & FLAG_NOT_DUP))
return 1;
tcp_enter_frto_loss(sk, 3, flag);
return 1;
}
/* tcp_may_send_now needs to see updated state */
tp->frto_counter = 2;
if (!tcp_may_send_now(sk))
tcp_enter_frto_loss(sk, 2, flag);
} else {
switch (sysctl_tcp_frto_response) {
case 2:
tcp_undo_spur_to_response(sk, flag);
break;
case 1:
tcp_conservative_spur_to_response(tp);
break;
default:
tcp_ratehalving_spur_to_response(sk);
break;
tp->frto_counter = 0;
tp->undo_marker = 0;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
}
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_sacked = tp->sacked_out;
int pkts_acked = 0;
int frto_cwnd = 0;
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una))
goto old_ack;
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
goto invalid_ack;
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
if (sysctl_tcp_abc) {
if (icsk->icsk_ca_state < TCP_CA_CWR)
tp->bytes_acked += ack - prior_snd_una;
else if (icsk->icsk_ca_state == TCP_CA_Loss)
/* we assume just one segment left network */
tp->bytes_acked += min(ack - prior_snd_una,
tp->mss_cache);
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
tcp_ca_event(sk, CA_EVENT_FAST_ACK);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_time_stamp;
prior_packets = tp->packets_out;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
pkts_acked = prior_packets - tp->packets_out;
newly_acked_sacked = (prior_packets - prior_sacked) -
(tp->packets_out - tp->sacked_out);
if (tp->frto_counter)
frto_cwnd = tcp_process_frto(sk, flag);
/* Guarantee sacktag reordering detection against wrap-arounds */
if (before(tp->frto_highmark, tp->snd_una))
tp->frto_highmark = 0;
if (tcp_ack_is_dubious(sk, flag)) {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
is_dupack, flag);
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
tcp_cong_avoid(sk, ack, prior_in_flight);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
is_dupack, flag);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (tcp_send_head(sk))
invalid_ack:
SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return -1;
/* If data was SACKed, tag it and see if we should send more data.
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
newly_acked_sacked = tp->sacked_out - prior_sacked;
tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
is_dupack, flag);
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
const u8 **hvpp, int estab)
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
if (opsize > length)
return; /* don't parse partial options */
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS && th->syn && !estab) {
u16 in_mss = get_unaligned_be16(ptr);
if (in_mss) {
if (opt_rx->user_mss &&
opt_rx->user_mss < in_mss)
in_mss = opt_rx->user_mss;
opt_rx->mss_clamp = in_mss;
}
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn &&
!estab && sysctl_tcp_window_scaling) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
if (net_ratelimit())
pr_info("%s: Illegal window scaling value %d >14 received\n",
__func__,
snd_wscale);
opt_rx->snd_wscale = snd_wscale;
}
break;
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) &&
((estab && opt_rx->tstamp_ok) ||
(!estab && sysctl_tcp_timestamps))) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
!estab && sysctl_tcp_sack) {
opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
break;
case TCPOPT_SACK:
if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
opt_rx->sack_ok) {
TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
}
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
/*
* The MD5 Hash has already been
* checked (see tcp_v{4,6}_do_rcv()).
*/
break;
case TCPOPT_COOKIE:
/* This option is variable length.
*/
switch (opsize) {
case TCPOLEN_COOKIE_BASE:
/* not yet implemented */
break;
case TCPOLEN_COOKIE_PAIR:
/* not yet implemented */
break;
case TCPOLEN_COOKIE_MIN+0:
case TCPOLEN_COOKIE_MIN+2:
case TCPOLEN_COOKIE_MIN+4:
case TCPOLEN_COOKIE_MIN+6:
case TCPOLEN_COOKIE_MAX:
/* 16-bit multiple */
opt_rx->cookie_plus = opsize;
*hvpp = ptr;
default:
/* ignore option */
break;
ptr += opsize-2;
length -= opsize;
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
const __be32 *ptr = (const __be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
tp->rx_opt.rcv_tsecr = ntohl(*ptr);
return 1;
}
return 0;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
static int tcp_fast_parse_options(const struct sk_buff *skb,
const struct tcphdr *th,
struct tcp_sock *tp, const u8 **hvpp)
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
#ifdef CONFIG_TCP_MD5SIG
/*
* Parse MD5 Signature option
*/
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
int length = (th->doff << 2) - sizeof(*th);