Newer
Older
static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
int newly_acked_sacked, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
int fast_rexmit = 0, mib_idx;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
if (WARN_ON(!tp->sacked_out && tp->fackets_out))
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
if (tcp_check_sack_reneging(sk, flag))
return;
/* C. Process data loss notification, provided it is valid. */
if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
/* D. Check consistency of the current state. */
tcp_verify_left_out(tp);
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk))
return;
break;
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
tcp_complete_cwr(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_try_undo_dsack(sk);
if (!tp->undo_marker ||
/* For SACK case do not Open to allow to undo
* catching for all duplicate ACKs. */
tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
tcp_set_ca_state(sk, TCP_CA_Open);
if (tcp_is_reno(tp))
if (tcp_try_undo_recovery(sk))
tcp_complete_cwr(sk);
break;
}
}
/* F. Process state. */
switch (icsk->icsk_ca_state) {
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else
do_lost = tcp_try_undo_partial(sk, pkts_acked);
icsk->icsk_retransmits = 0;
if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
if (!tcp_try_undo_loss(sk)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_add_reno_sack(sk);
if (icsk->icsk_ca_state == TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk)) {
tcp_try_to_open(sk, flag);
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);
return;
}
if (tcp_is_reno(tp))
mib_idx = LINUX_MIB_TCPRENORECOVERY;
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->high_seq = tp->snd_nxt;
tp->prior_ssthresh = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
tp->prr_out = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
fast_rexmit = 1;
if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
tcp_update_scoreboard(sk, fast_rexmit);
tp->prr_delivered += newly_acked_sacked;
tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
{
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
}
EXPORT_SYMBOL(tcp_valid_rtt_meas);
/* Read draft-ietf-tcplw-high-performance before mucking
static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
*
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
struct tcp_sock *tp = tcp_sk(sk);
tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
* rtt estimates. Also, we must not reset the
* backoff for rto until we get a non-retransmitted
* packet. This allows us to deal with a situation
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
tcp_valid_rtt_meas(sk, seq_rtt);
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tcp_ack_saw_tstamp(sk, flag);
tcp_ack_no_tstamp(sk, seq_rtt, flag);
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
static void tcp_rearm_rto(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
u32 packets_acked;
BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
BUG_ON(tcp_skb_pcount(skb) == 0);
BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
return packets_acked;
/* Remove acknowledged frames from the retransmission queue. If our packet
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
int fully_acked = 1;
u32 reord = tp->packets_out;
u32 prior_sacked = tp->sacked_out;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
/* Determine how many packets and what bytes were acked, tso and else */
if (tcp_skb_pcount(skb) == 1 ||
!after(tp->snd_una, scb->seq))
break;
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
fully_acked = 0;
} else {
acked_pcount = tcp_skb_pcount(skb);
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = -1;
seq_rtt = -1;
if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
flag |= FLAG_NONHEAD_RETRANS_ACKED;
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
* this ACK acks real data, we will erroneously exit
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
if (!fully_acked)
break;
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
tp->scoreboard_skb_hint = NULL;
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
tp->lost_skb_hint = NULL;
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
}
tcp_ack_update_rtt(sk, flag, seq_rtt);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
prior_sacked - tp->sacked_out;
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
/* Is the ACK triggering packet unambiguous? */
if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
/* High resolution needed and available? */
if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
!ktime_equal(last_ackt,
net_invalid_timestamp()))
rtt_us = ktime_us_delta(ktime_get_real(),
last_ackt);
else if (ca_seq_rtt >= 0)
rtt_us = jiffies_to_usecs(ca_seq_rtt);
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
if (!tp->packets_out && tcp_is_sack(tp)) {
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
printk(KERN_DEBUG "Leak r=%u %d\n",
tp->retrans_out, icsk->icsk_ca_state);
}
static void tcp_ack_probe(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,

Arnaldo Carvalho de Melo
committed
min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
TCP_RTO_MAX);
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
static inline int tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
u32 ack_seq)
struct tcp_sock *tp = tcp_sk(sk);
u32 nwin = ntohs(tcp_hdr(skb)->window);
if (likely(!tcp_hdr(skb)->syn))
nwin <<= tp->rx_opt.snd_wscale;
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tcp_fast_path_check(sk);
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
tp->snd_una = ack;
return flag;
}
/* A very conservative spurious RTO response algorithm: reduce cwnd and
* continue in congestion avoidance.
*/
static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_cnt = 0;
TCP_ECN_queue_cwr(tp);
tcp_moderate_cwnd(tp);
}
/* A conservative spurious RTO response algorithm: reduce cwnd using
* rate halving and continue in congestion avoidance.
*/
static void tcp_ratehalving_spur_to_response(struct sock *sk)
{
tcp_enter_cwr(sk, 0);
}
static void tcp_undo_spur_to_response(struct sock *sk, int flag)
tcp_ratehalving_spur_to_response(sk);
else
tcp_undo_cwr(sk, true);
/* F-RTO spurious RTO detection algorithm (RFC4138)
*
* F-RTO affects during two new ACKs following RTO (well, almost, see inline
* comments). State (ACK number) is kept in frto_counter. When ACK advances
* window (but not to or beyond highest sequence sent before RTO):
* On First ACK, send two new segments out.
* On Second ACK, RTO was likely spurious. Do spurious response (response
* algorithm is not part of the F-RTO detection algorithm
* given in RFC4138 but can be selected separately).
* Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
* and TCP falls back to conventional RTO recovery. F-RTO allows overriding
* of Nagle, this is done using frto_counter states 2 and 3, when a new data
* segment of any size sent during F-RTO, state 2 is upgraded to 3.
*
* Rationale: if the RTO was spurious, new ACKs should arrive from the
* original window even after we transmit two new data segments.
*
* SACK version:
* on first step, wait until first cumulative ACK arrives, then move to
* the second step. In second step, the next ACK decides.
*
* F-RTO is implemented (mainly) in four functions:
* - tcp_use_frto() is used to determine if TCP is can use F-RTO
* - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
* called when tcp_use_frto() showed green light
* - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
* - tcp_enter_frto_loss() is called if there is not enough evidence
* to prove that the RTO is indeed spurious. It transfers the control
* from F-RTO to the conventional RTO recovery
*/
static int tcp_process_frto(struct sock *sk, int flag)
tcp_verify_left_out(tp);
/* Duplicate the behavior from Loss state (fastretrans_alert) */
inet_csk(sk)->icsk_retransmits = 0;
if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
tp->undo_marker = 0;
if (!before(tp->snd_una, tp->frto_highmark)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
if (!tcp_is_sackfrto(tp)) {
/* RFC4138 shortcoming in step 2; should also have case c):
* ACK isn't duplicate nor advances window, e.g., opposite dir
* data, winupdate
*/
if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
if (!(flag & FLAG_DATA_ACKED)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
flag);
return 1;
}
} else {
if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
return 1;
}
if ((tp->frto_counter >= 2) &&
(!(flag & FLAG_FORWARD_PROGRESS) ||
((flag & FLAG_DATA_SACKED) &&
!(flag & FLAG_ONLY_ORIG_SACKED)))) {
/* RFC4138 shortcoming (see comment above) */
if (!(flag & FLAG_FORWARD_PROGRESS) &&
(flag & FLAG_NOT_DUP))
return 1;
tcp_enter_frto_loss(sk, 3, flag);
return 1;
}
/* tcp_may_send_now needs to see updated state */
tp->frto_counter = 2;
if (!tcp_may_send_now(sk))
tcp_enter_frto_loss(sk, 2, flag);
} else {
switch (sysctl_tcp_frto_response) {
case 2:
tcp_undo_spur_to_response(sk, flag);
break;
case 1:
tcp_conservative_spur_to_response(tp);
break;
default:
tcp_ratehalving_spur_to_response(sk);
break;
tp->frto_counter = 0;
tp->undo_marker = 0;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
}
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
u32 prior_fackets;
int prior_sacked = tp->sacked_out;
int newly_acked_sacked = 0;
int frto_cwnd = 0;
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una))
goto old_ack;
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
goto invalid_ack;
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
if (sysctl_tcp_abc) {
if (icsk->icsk_ca_state < TCP_CA_CWR)
tp->bytes_acked += ack - prior_snd_una;
else if (icsk->icsk_ca_state == TCP_CA_Loss)
/* we assume just one segment left network */
tp->bytes_acked += min(ack - prior_snd_una,
tp->mss_cache);
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
tcp_ca_event(sk, CA_EVENT_FAST_ACK);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_time_stamp;
prior_packets = tp->packets_out;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
newly_acked_sacked = (prior_packets - prior_sacked) -
(tp->packets_out - tp->sacked_out);
if (tp->frto_counter)
frto_cwnd = tcp_process_frto(sk, flag);
/* Guarantee sacktag reordering detection against wrap-arounds */
if (before(tp->frto_highmark, tp->snd_una))
tp->frto_highmark = 0;
if (tcp_ack_is_dubious(sk, flag)) {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
tcp_cong_avoid(sk, ack, prior_in_flight);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
return 1;
no_queue:
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (tcp_send_head(sk))
invalid_ack:
SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return -1;
if (TCP_SKB_CB(skb)->sacked) {
if (icsk->icsk_ca_state == TCP_CA_Open)
tcp_try_keep_open(sk);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
u8 **hvpp, int estab)
struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
if (opsize > length)
return; /* don't parse partial options */
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS && th->syn && !estab) {
u16 in_mss = get_unaligned_be16(ptr);
if (in_mss) {
if (opt_rx->user_mss &&
opt_rx->user_mss < in_mss)
in_mss = opt_rx->user_mss;
opt_rx->mss_clamp = in_mss;
}
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn &&
!estab && sysctl_tcp_window_scaling) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
if (net_ratelimit())
printk(KERN_INFO "tcp_parse_options: Illegal window "
"scaling value %d >14 received.\n",
snd_wscale);
snd_wscale = 14;
opt_rx->snd_wscale = snd_wscale;
}
break;
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) &&
((estab && opt_rx->tstamp_ok) ||
(!estab && sysctl_tcp_timestamps))) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
!estab && sysctl_tcp_sack) {
opt_rx->sack_ok = 1;
tcp_sack_reset(opt_rx);
}
break;
case TCPOPT_SACK:
if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
opt_rx->sack_ok) {
TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
}
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
/*
* The MD5 Hash has already been
* checked (see tcp_v{4,6}_do_rcv()).
*/
break;
case TCPOPT_COOKIE:
/* This option is variable length.
*/
switch (opsize) {
case TCPOLEN_COOKIE_BASE:
/* not yet implemented */
break;
case TCPOLEN_COOKIE_PAIR:
/* not yet implemented */
break;
case TCPOLEN_COOKIE_MIN+0:
case TCPOLEN_COOKIE_MIN+2:
case TCPOLEN_COOKIE_MIN+4:
case TCPOLEN_COOKIE_MIN+6:
case TCPOLEN_COOKIE_MAX:
/* 16-bit multiple */
opt_rx->cookie_plus = opsize;
*hvpp = ptr;
default:
/* ignore option */
break;
ptr += opsize-2;
length -= opsize;
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
{
__be32 *ptr = (__be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
tp->rx_opt.rcv_tsecr = ntohl(*ptr);
return 1;
}
return 0;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
struct tcp_sock *tp, u8 **hvpp)
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
#ifdef CONFIG_TCP_MD5SIG
/*
* Parse MD5 Signature option
*/
u8 *tcp_parse_md5sig_option(struct tcphdr *th)
{
int length = (th->doff << 2) - sizeof (*th);
u8 *ptr = (u8*)(th + 1);
/* If the TCP option is too short, we can short cut */
if (length < TCPOLEN_MD5SIG)
return NULL;
while (length > 0) {
int opcode = *ptr++;
int opsize;
switch(opcode) {
case TCPOPT_EOL:
return NULL;
case TCPOPT_NOP:
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2 || opsize > length)
return NULL;
if (opcode == TCPOPT_MD5SIG)
return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
}
ptr += opsize - 2;
length -= opsize;
}
return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
{