     *	   hole was sent out.
     *	C. SACK arrives sacking SND.NXT at the moment, when the
     *	   segment was retransmitted.
     * 4. D-SACK added new rule: D-SACK changes any tag to S.
     *
 * It is pleasant to note that the state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by the order of our actions
 * when multiple events arrive simultaneously (see the function below).
     *
     * Reordering detection.
     * --------------------
 * The reordering metric is the maximal distance by which a packet can
 * be displaced in the packet stream. With SACKs we can estimate it:
     *
 * 1. SACK fills an old hole and the corresponding segment was never
 *    retransmitted -> reordering. Alas, we cannot use it
 *    when a segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for a retransmitted and already SACKed segment -> reordering.
 * Neither heuristic is used in the Loss state, where we cannot
 * account for retransmits accurately.
     */
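
/* Worked example of heuristic 1 above (segment numbers are illustrative):
 * say segments 1..5 are in flight and SACKs arrive for 3, 4 and 5,
 * leaving a hole at 2. If a later SACK covers segment 2 although it was
 * never retransmitted, the hole was filled by the original transmission:
 * segment 2 was not lost, it was displaced behind 5, i.e. by about three
 * positions, and that displacement feeds the reordering metric.
 */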
    
    static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
    			   struct tcp_sack_block_wire *sp, int num_sacks,
    			   u32 prior_snd_una)
    {
    	u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq));
    	u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq));
    	int dup_sack = 0;
    
    	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		dup_sack = 1;
		tcp_dsack_seen(tp);
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
    	} else if (num_sacks > 1) {
    		u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq));
    		u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq));
    
    		if (!after(end_seq_0, end_seq_1) &&
    		    !before(start_seq_0, start_seq_1)) {
			dup_sack = 1;
			tcp_dsack_seen(tp);
			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
    		}
    	}
    
    	/* D-SACK for already forgotten data... Do dumb counting. */
    	if (dup_sack &&
    	    !after(end_seq_0, prior_snd_una) &&
    	    after(end_seq_0, tp->undo_marker))
    		tp->undo_retrans--;
    
    	return dup_sack;
    }
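
/* Worked example of the two D-SACK checks above (both follow RFC 2883;
 * the sequence numbers are illustrative): if the ACK carries
 * ack_seq = 1000 and the first SACK block is [500, 700), the block lies
 * entirely below the cumulative ACK and therefore reports data received
 * twice: a D-SACK. Alternatively, with ack_seq = 400 and blocks
 * [500, 700) and [400, 900), the first block is contained in the second,
 * which is the other D-SACK encoding.
 */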
    
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned char *ptr = (skb_transport_header(ack_skb) +
			      TCP_SKB_CB(ack_skb)->sacked);
	struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
	struct sk_buff *cached_skb;
	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
	int reord = tp->packets_out;
	int prior_fackets;
	u32 lost_retrans = 0;
	int flag = 0;
	int found_dup_sack = 0;
	int cached_fack_count;
	int i;
	int first_sack_index;

	if (!tp->sacked_out)
		tp->fackets_out = 0;
	prior_fackets = tp->fackets_out;

	found_dup_sack = tcp_check_dsack(tp, ack_skb, sp,
					 num_sacks, prior_snd_una);
	if (found_dup_sack)
		flag |= FLAG_DSACKING_ACK;

    	/* Eliminate too old ACKs, but take into
    	 * account more or less fresh ones, they can
    	 * contain valid SACK info.
    	 */
    	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
    		return 0;
    
    
	/* SACK fastpath:
	 * if the only SACK change is the increase of the end_seq of
	 * the first block then only apply that SACK block
	 * and use retrans queue hinting, otherwise take the slow path. */
    	flag = 1;
    
    	for (i = 0; i < num_sacks; i++) {
    		__be32 start_seq = sp[i].start_seq;
    		__be32 end_seq = sp[i].end_seq;
    
		if (i == 0) {
			if (tp->recv_sack_cache[i].start_seq != start_seq)
    				flag = 0;
    		} else {
    			if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
    			    (tp->recv_sack_cache[i].end_seq != end_seq))
    				flag = 0;
    		}
    		tp->recv_sack_cache[i].start_seq = start_seq;
    		tp->recv_sack_cache[i].end_seq = end_seq;
    	}
    
    	/* Clear the rest of the cache sack blocks so they won't match mistakenly. */
    	for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
    		tp->recv_sack_cache[i].start_seq = 0;
    		tp->recv_sack_cache[i].end_seq = 0;
    	}
    
    	first_sack_index = 0;
    
    	if (flag)
    		num_sacks = 1;
    	else {
    		int j;
    		tp->fastpath_skb_hint = NULL;
    
		/* order SACK blocks to allow in order walk of the retrans queue */
		for (i = num_sacks-1; i > 0; i--) {
			for (j = 0; j < i; j++) {
				if (after(ntohl(sp[j].start_seq),
					  ntohl(sp[j+1].start_seq))) {
					struct tcp_sack_block_wire tmp;

					tmp = sp[j];
					sp[j] = sp[j+1];
					sp[j+1] = tmp;

					/* Track where the first SACK block goes to */
					if (j == first_sack_index)
						first_sack_index = j+1;
				}
			}
		}
	}
    
	/* clear flag as used for different purpose in following code */
	flag = 0;

	/* Use SACK fastpath hint if valid */
	cached_skb = tp->fastpath_skb_hint;
	cached_fack_count = tp->fastpath_cnt_hint;
	if (!cached_skb) {
		cached_skb = tcp_write_queue_head(sk);
		cached_fack_count = 0;
	}

    	for (i=0; i<num_sacks; i++, sp++) {
    		struct sk_buff *skb;
    		__u32 start_seq = ntohl(sp->start_seq);
    		__u32 end_seq = ntohl(sp->end_seq);
    		int fack_count;
    
    		int dup_sack = (found_dup_sack && (i == first_sack_index));
    
    		skb = cached_skb;
    		fack_count = cached_fack_count;
    
		/* Event "B" in the comment above. */
		if (after(end_seq, tp->high_seq))
			flag |= FLAG_DATA_LOST;

		tcp_for_write_queue_from(skb, sk) {
    			int in_sack, pcount;
    			u8 sacked;
    
			if (skb == tcp_send_head(sk))
				break;

    			cached_skb = skb;
    			cached_fack_count = fack_count;
    			if (i == first_sack_index) {
    				tp->fastpath_skb_hint = skb;
    				tp->fastpath_cnt_hint = fack_count;
    			}
    
			/* The retransmission queue is always in order, so
			 * we can short-circuit the walk early.
			 */
			if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				break;

			in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				!before(end_seq, TCP_SKB_CB(skb)->end_seq);

			pcount = tcp_skb_pcount(skb);

			if (pcount > 1 && !in_sack &&
			    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				unsigned int pkt_len;

				in_sack = !after(start_seq,
						 TCP_SKB_CB(skb)->seq);

				if (!in_sack)
					pkt_len = (start_seq -
						   TCP_SKB_CB(skb)->seq);
				else
					pkt_len = (end_seq -
						   TCP_SKB_CB(skb)->seq);
				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
					break;
				pcount = tcp_skb_pcount(skb);
			}
    
    			fack_count += pcount;
    
			sacked = TCP_SKB_CB(skb)->sacked;

    			/* Account D-SACK for retransmitted packet. */
    			if ((dup_sack && in_sack) &&
    			    (sacked & TCPCB_RETRANS) &&
    			    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
    				tp->undo_retrans--;
    
    			/* The frame is ACKed. */
    			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
    				if (sacked&TCPCB_RETRANS) {
    					if ((dup_sack && in_sack) &&
    					    (sacked&TCPCB_SACKED_ACKED))
    						reord = min(fack_count, reord);
    				} else {
    					/* If it was in a hole, we detected reordering. */
    					if (fack_count < prior_fackets &&
    					    !(sacked&TCPCB_SACKED_ACKED))
    						reord = min(fack_count, reord);
    				}
    
    				/* Nothing to do; acked frame is about to be dropped. */
    				continue;
    			}
    
    			if ((sacked&TCPCB_SACKED_RETRANS) &&
    			    after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
    			    (!lost_retrans || after(end_seq, lost_retrans)))
    				lost_retrans = end_seq;
    
    			if (!in_sack)
    				continue;
    
    			if (!(sacked&TCPCB_SACKED_ACKED)) {
    				if (sacked & TCPCB_SACKED_RETRANS) {
    					/* If the segment is not tagged as lost,
    					 * we do not clear RETRANS, believing
    					 * that retransmission is still in flight.
    					 */
    					if (sacked & TCPCB_LOST) {
    						TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
    						tp->lost_out -= tcp_skb_pcount(skb);
    						tp->retrans_out -= tcp_skb_pcount(skb);
    
						/* clear lost hint */
						tp->retransmit_skb_hint = NULL;
					}
    				} else {
    					/* New sack for not retransmitted frame,
    					 * which was in hole. It is reordering.
    					 */
    					if (!(sacked & TCPCB_RETRANS) &&
    					    fack_count < prior_fackets)
    						reord = min(fack_count, reord);
    
    					if (sacked & TCPCB_LOST) {
    						TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
    						tp->lost_out -= tcp_skb_pcount(skb);
    
						/* clear lost hint */
						tp->retransmit_skb_hint = NULL;
					}
    
    					/* SACK enhanced F-RTO detection.
    					 * Set flag if and only if non-rexmitted
    					 * segments below frto_highmark are
    					 * SACKed (RFC4138; Appendix B).
					 * Clearing is correct due to the in-order walk.
    					 */
    					if (after(end_seq, tp->frto_highmark)) {
    						flag &= ~FLAG_ONLY_ORIG_SACKED;
    					} else {
    						if (!(sacked & TCPCB_RETRANS))
    							flag |= FLAG_ONLY_ORIG_SACKED;
    					}
    
    				}
    
    				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
    				flag |= FLAG_DATA_SACKED;
    				tp->sacked_out += tcp_skb_pcount(skb);
    
    				if (fack_count > tp->fackets_out)
    					tp->fackets_out = fack_count;
    
				if (after(TCP_SKB_CB(skb)->seq,
				    tp->highest_sack))
					tp->highest_sack = TCP_SKB_CB(skb)->seq;

    			} else {
    				if (dup_sack && (sacked&TCPCB_RETRANS))
    					reord = min(fack_count, reord);
    			}
    
    			/* D-SACK. We can detect redundant retransmission
    			 * in S|R and plain R frames and clear it.
    			 * undo_retrans is decreased above, L|R frames
    			 * are accounted above as well.
    			 */
    			if (dup_sack &&
    			    (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
    				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
    				tp->retrans_out -= tcp_skb_pcount(skb);
    
				tp->retransmit_skb_hint = NULL;
			}
    		}
    	}
    
    	/* Check for lost retransmit. This superb idea is
    	 * borrowed from "ratehalving". Event "C".
    	 * Later note: FACK people cheated me again 8),
    	 * we have to account for reordering! Ugly,
    	 * but should help.
    	 */
    
	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
		struct sk_buff *skb;

		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;

    			if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
    				break;
    			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
    				continue;
			if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
			    after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
			    (tcp_is_fack(tp) ||
			     !before(lost_retrans,
				     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
				     tp->mss_cache))) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);

				/* clear lost hint */
				tp->retransmit_skb_hint = NULL;

    				if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
    					tp->lost_out += tcp_skb_pcount(skb);
    					TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    					flag |= FLAG_DATA_SACKED;
    					NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
    				}
    			}
    		}
    	}

	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);

    #if FASTRETRANS_DEBUG > 0
    	BUG_TRAP((int)tp->sacked_out >= 0);
    	BUG_TRAP((int)tp->lost_out >= 0);
    	BUG_TRAP((int)tp->retrans_out >= 0);
    	BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
    #endif
    	return flag;
    }

static void tcp_check_reno_reordering(struct sock *sk, const int addend)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 holes;
    
    	holes = max(tp->lost_out, 1U);
    	holes = min(holes, tp->packets_out);
    
    	if ((tp->sacked_out + holes) > tp->packets_out) {
    		tp->sacked_out = tp->packets_out - holes;
    		tcp_update_reordering(sk, tp->packets_out + addend, 0);
    	}
    }
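
/* Illustrative numbers for the clamp above: with packets_out = 10,
 * lost_out = 2 and sacked_out = 9, holes = 2 and 9 + 2 > 10, so sacked_out
 * is pulled back to 8 and the excess is treated as reordering of
 * (packets_out + addend) packets. The values are made up for illustration.
 */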
    
    /* Emulate SACKs for SACKless connection: account for a new dupack. */
    
    static void tcp_add_reno_sack(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	tp->sacked_out++;
	tcp_check_reno_reordering(sk, 0);
}
    
    /* Account for ACK, ACKing some data in Reno Recovery phase. */
    
    static void tcp_remove_reno_sacks(struct sock *sk, int acked)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (acked > 0) {
    		/* One ACK acked hole. The rest eat duplicate ACKs. */
    		if (acked-1 >= tp->sacked_out)
    			tp->sacked_out = 0;
    		else
    			tp->sacked_out -= acked-1;
    	}
	tcp_check_reno_reordering(sk, acked);
}
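
/* Worked example for the accounting above (numbers are illustrative):
 * if sacked_out = 5 and an ACK advances over acked = 3 segments, one of
 * them is assumed to have filled the hole and the remaining acked-1 = 2
 * consume emulated dupacks, leaving sacked_out = 3.
 */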
    
    static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
    {
    	tp->sacked_out = 0;
    }
    
/* F-RTO can only be used if TCP has never retransmitted anything other than
 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
 */
int tcp_use_frto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!sysctl_tcp_frto)
		return 0;

	if (IsSackFrto())
		return 1;

    	/* Avoid expensive walking of rexmit queue if possible */
    	if (tp->retrans_out > 1)
    		return 0;
    	skb = tcp_write_queue_head(sk);
    	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
    	tcp_for_write_queue_from(skb, sk) {
    		if (skb == tcp_send_head(sk))
    			break;
    
    		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
    			return 0;
    		/* Short-circuit when first non-SACKed skb has been checked */
    		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED))
    			break;
    	}
	return 1;
}

/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
 * recovery a bit and use heuristics in tcp_process_frto() to detect if
 * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
 * keep retrans_out counting accurate (with SACK F-RTO, other than head
 * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
 * bits are handled if the Loss state is really to be entered (in
 * tcp_enter_frto_loss).
 *
 * Do like tcp_enter_loss() would; when RTO expires the second time it
 * does:
 *  "Reduce ssthresh if it has not yet been made inside this window."
 */
    void tcp_enter_frto(struct sock *sk)
    {
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
	    tp->snd_una == tp->high_seq ||
	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
	     !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
    		/* Our state is too optimistic in ssthresh() call because cwnd
    		 * is not reduced until tcp_enter_frto_loss() when previous FRTO
    		 * recovery has not yet completed. Pattern would be this: RTO,
    		 * Cumulative ACK, RTO (2xRTO for the same segment does not end
    		 * up here twice).
    		 * RFC4138 should be more specific on what to do, even though
    		 * RTO is quite unlikely to occur after the first Cumulative ACK
    		 * due to back-off and complexity of triggering events ...
    		 */
    		if (tp->frto_counter) {
    			u32 stored_cwnd;
    			stored_cwnd = tp->snd_cwnd;
    			tp->snd_cwnd = 2;
    			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
    			tp->snd_cwnd = stored_cwnd;
    		} else {
    			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
    		}
    		/* ... in theory, cong.control module could do "any tricks" in
    		 * ssthresh(), which means that ca_state, lost bits and lost_out
    		 * counter would have to be faked before the call occurs. We
    		 * consider that too expensive, unlikely and hacky, so modules
		 * using these in ssthresh() must deal with these incompatibility
		 * issues if they receive CA_EVENT_FRTO and frto_counter != 0
    		 */
		tcp_ca_event(sk, CA_EVENT_FRTO);
	}
    
    	tp->undo_marker = tp->snd_una;
    	tp->undo_retrans = 0;

	skb = tcp_write_queue_head(sk);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
	}

    	/* Earlier loss recovery underway (see RFC4138; Appendix B).
    	 * The last condition is necessary at least in tp->frto_counter case.
    	 */
    	if (IsSackFrto() && (tp->frto_counter ||
    	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
    	    after(tp->high_seq, tp->snd_una)) {
    		tp->frto_highmark = tp->high_seq;
    	} else {
    		tp->frto_highmark = tp->snd_nxt;
    	}
    
    	tcp_set_ca_state(sk, TCP_CA_Disorder);
    	tp->high_seq = tp->snd_nxt;
	tp->frto_counter = 1;
    }
    
    /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
     * which indicates that we should follow the traditional RTO recovery,
     * i.e. mark everything lost and do go-back-N retransmission.
     */

static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	tp->lost_out = 0;
	tp->retrans_out = 0;

	tcp_for_write_queue(skb, sk) {
    		if (skb == tcp_send_head(sk))
    			break;
    
    		/*
    		 * Count the retransmission made on RTO correctly (only when
    		 * waiting for the first ACK and did not get it)...
    		 */
    		if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
    
    			/* For some reason this R-bit might get cleared? */
    			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
    				tp->retrans_out += tcp_skb_pcount(skb);
    
    			/* ...enter this if branch just for the first segment */
    			flag |= FLAG_DATA_ACKED;
    		} else {
    			TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
    		}
    		/* Don't lost mark skbs that were fwd transmitted after RTO */
    		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) &&
    		    !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
    			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    			tp->lost_out += tcp_skb_pcount(skb);
    		}
    	}

	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
    	tp->snd_cwnd_cnt = 0;
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    	tp->undo_marker = 0;
    	tp->frto_counter = 0;
    
    	tp->reordering = min_t(unsigned int, tp->reordering,
    					     sysctl_tcp_reordering);
	tcp_set_ca_state(sk, TCP_CA_Loss);
	tp->high_seq = tp->frto_highmark;
	TCP_ECN_queue_cwr(tp);

	clear_all_retrans_hints(tp);
    }
    
    void tcp_clear_retrans(struct tcp_sock *tp)
    {
    	tp->retrans_out = 0;
    
    	tp->fackets_out = 0;
    	tp->sacked_out = 0;
    	tp->lost_out = 0;
    
    	tp->undo_marker = 0;
    	tp->undo_retrans = 0;
    }
    
    /* Enter Loss state. If "how" is not zero, forget all SACK information
     * and reset tags completely, otherwise preserve SACKs. If receiver
     * dropped its ofo queue, we will know this due to reneging detection.
     */
    void tcp_enter_loss(struct sock *sk, int how)
    {
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt = 0;

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tcp_ca_event(sk, CA_EVENT_LOSS);
	}
    	tp->snd_cwnd	   = 1;
    	tp->snd_cwnd_cnt   = 0;
    	tp->snd_cwnd_stamp = tcp_time_stamp;

	tp->bytes_acked = 0;
    	tcp_clear_retrans(tp);
    
    	/* Push undo marker, if it was plain RTO and nothing
    	 * was retransmitted. */
    	if (!how)
    		tp->undo_marker = tp->snd_una;

	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;

    		cnt += tcp_skb_pcount(skb);
    		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
    			tp->undo_marker = 0;
    		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
    		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
    			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
    			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    			tp->lost_out += tcp_skb_pcount(skb);
    		} else {
    			tp->sacked_out += tcp_skb_pcount(skb);
    			tp->fackets_out = cnt;
    		}
    	}

	tp->reordering = min_t(unsigned int, tp->reordering,
					     sysctl_tcp_reordering);
	tcp_set_ca_state(sk, TCP_CA_Loss);
    	tp->high_seq = tp->snd_nxt;
    	TCP_ECN_queue_cwr(tp);
    
    	/* Abort FRTO algorithm if one is in progress */
    	tp->frto_counter = 0;

	clear_all_retrans_hints(tp);
}

static int tcp_check_sack_reneging(struct sock *sk)
    {
    	struct sk_buff *skb;
    
    	/* If ACK arrived pointing to a remembered SACK,
    	 * it means that our remembered SACKs do not reflect
    	 * real state of receiver i.e.
    	 * receiver _host_ is heavily congested (or buggy).
    	 * Do processing similar to RTO timeout.
    	 */
	if ((skb = tcp_write_queue_head(sk)) != NULL &&
	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
		struct inet_connection_sock *icsk = inet_csk(sk);

		NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);

		tcp_enter_loss(sk, 1);
		icsk->icsk_retransmits++;
		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  icsk->icsk_rto, TCP_RTO_MAX);
		return 1;
    	}
    	return 0;
    }

static inline int tcp_fackets_out(struct tcp_sock *tp)
{
	return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}

static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
}

static inline int tcp_head_timedout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	return tp->packets_out &&
	       tcp_skb_timedout(sk, tcp_write_queue_head(sk));
}
    
    /* Linux NewReno/SACK/FACK/ECN state machine.
     * --------------------------------------
     *
     * "Open"	Normal state, no dubious events, fast path.
 * "Disorder"   In all respects it is "Open",
 *		but requires a bit more attention. It is entered when
 *		we see some SACKs or dupacks. It is split from "Open"
 *		mainly to move some processing from the fast path to the slow one.
     * "CWR"	CWND was reduced due to some Congestion Notification event.
     *		It can be ECN, ICMP source quench, local device congestion.
     * "Recovery"	CWND was reduced, we are fast-retransmitting.
     * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
     *
     * tcp_fastretrans_alert() is entered:
     * - each incoming ACK, if state is not "Open"
     * - when arrived ACK is unusual, namely:
     *	* SACK
     *	* Duplicate ACK.
     *	* ECN ECE.
     *
     * Counting packets in flight is pretty simple.
     *
     *	in_flight = packets_out - left_out + retrans_out
     *
     *	packets_out is SND.NXT-SND.UNA counted in packets.
     *
     *	retrans_out is number of retransmitted segments.
     *
 *	left_out is the number of segments which left the network but are not yet ACKed.
     *
     *		left_out = sacked_out + lost_out
     *
 *     sacked_out: Packets which arrived at the receiver out of order
 *		   and hence not cumulatively ACKed. With SACKs this number
 *		   is simply the amount of SACKed data. Even without SACKs
 *		   it is easy to give a pretty reliable estimate of this
 *		   number by counting duplicate ACKs.
     *
     *       lost_out: Packets lost by network. TCP has no explicit
     *		   "loss notification" feedback from network (for now).
     *		   It means that this number can be only _guessed_.
     *		   Actually, it is the heuristics to predict lossage that
     *		   distinguishes different algorithms.
     *
     *	F.e. after RTO, when all the queue is considered as lost,
     *	lost_out = packets_out and in_flight = retrans_out.
     *
     *		Essentially, we have now two algorithms counting
     *		lost packets.
     *
     *		FACK: It is the simplest heuristics. As soon as we decided
     *		that something is lost, we decide that _all_ not SACKed
     *		packets until the most forward SACK are lost. I.e.
     *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
 *		It is an absolutely correct estimate, if the network does not
 *		reorder packets. And it loses any connection to reality when reordering
     *		takes place. We use FACK by default until reordering
     *		is suspected on the path to this destination.
     *
     *		NewReno: when Recovery is entered, we assume that one segment
     *		is lost (classic Reno). While we are in Recovery and
     *		a partial ACK arrives, we assume that one more packet
 *		is lost (NewReno). These heuristics are the same in NewReno
 *		and SACK.
     *
     *  Imagine, that's all! Forget about all this shamanism about CWND inflation
     *  deflation etc. CWND is real congestion window, never inflated, changes
     *  only according to classic VJ rules.
     *
     * Really tricky (and requiring careful tuning) part of algorithm
     * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
     * The first determines the moment _when_ we should reduce CWND and,
     * hence, slow down forward transmission. In fact, it determines the moment
     * when we decide that hole is caused by loss, rather than by a reorder.
     *
     * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
     * holes, caused by lost packets.
     *
     * And the most logically complicated part of algorithm is undo
     * heuristics. We detect false retransmits due to both too early
     * fast retransmit (reordering) and underestimated RTO, analyzing
     * timestamps and D-SACKs. When we detect that some segments were
     * retransmitted by mistake and CWND reduction was wrong, we undo
     * window reduction and abort recovery phase. This logic is hidden
     * inside several functions named tcp_try_undo_<something>.
     */
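
/* Worked example of the bookkeeping above (values are illustrative):
 * with packets_out = 10, sacked_out = 3, lost_out = 2 and retrans_out = 1,
 * left_out = sacked_out + lost_out = 5 and in_flight = 10 - 5 + 1 = 6
 * segments are still presumed in the network. After an RTO, when the whole
 * queue is considered lost, lost_out = packets_out and in_flight degrades
 * to retrans_out, exactly as stated above.
 */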
    
    /* This function decides, when we should leave Disordered state
     * and enter Recovery phase, reducing congestion window.
     *
     * Main question: may we further continue forward transmission
     * with the same cwnd?
     */
static int tcp_time_to_recover(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 packets_out;

	/* Do not perform any recovery during FRTO algorithm */
	if (tp->frto_counter)
		return 0;

    	/* Trick#1: The loss is proven. */
    	if (tp->lost_out)
    		return 1;
    
    	/* Not-A-Trick#2 : Classic rule... */
    	if (tcp_fackets_out(tp) > tp->reordering)
    		return 1;
    
    	/* Trick#3 : when we use RFC2988 timer restart, fast
    	 * retransmit can be triggered by timeout of queue head.
    	 */
	if (tcp_head_timedout(sk))
		return 1;
    
    	/* Trick#4: It is still not OK... But will it be useful to delay
    	 * recovery more?
    	 */
    	packets_out = tp->packets_out;
    	if (packets_out <= tp->reordering &&
    	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
	    !tcp_may_send_now(sk)) {
    		/* We have nothing to send. This connection is limited
    		 * either by receiver window or by application.
    		 */
    		return 1;
    	}
    
    	return 0;
    }
    
    
    /* RFC: This is from the original, I doubt that this is necessary at all:
     * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
     * retransmitted past LOST markings in the first place? I'm not fully sure
     * about undo and end of connection cases, which can cause R without L?
     */
    static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
    				       struct sk_buff *skb)
    {
    	if ((tp->retransmit_skb_hint != NULL) &&
    	    before(TCP_SKB_CB(skb)->seq,
    	    TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
		tp->retransmit_skb_hint = NULL;
}

    /* Mark head of queue up as lost. */
static void tcp_mark_head_lost(struct sock *sk,
			       int packets, u32 high_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt;

	BUG_TRAP(packets <= tp->packets_out);
	if (tp->lost_skb_hint) {
		skb = tp->lost_skb_hint;
		cnt = tp->lost_cnt_hint;
	} else {
		skb = tcp_write_queue_head(sk);
		cnt = 0;
	}

    	tcp_for_write_queue_from(skb, sk) {
    		if (skb == tcp_send_head(sk))
    			break;
    
    		/* TODO: do this better */
    		/* this is not the most efficient way to do this... */
    		tp->lost_skb_hint = skb;
    		tp->lost_cnt_hint = cnt;
    		cnt += tcp_skb_pcount(skb);
    		if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
			break;
    		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
    			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    			tp->lost_out += tcp_skb_pcount(skb);
			tcp_verify_retransmit_hint(tp, skb);
    		}
    	}
    }
    
    /* Account newly detected lost packet(s) */

static void tcp_update_scoreboard(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_fack(tp)) {
		int lost = tp->fackets_out - tp->reordering;
    		if (lost <= 0)
    			lost = 1;
		tcp_mark_head_lost(sk, lost, tp->high_seq);
	} else {
		tcp_mark_head_lost(sk, 1, tp->high_seq);
    	}
    
	/* New heuristics: it is possible only after we switched
	 * to restarting the timer each time something is ACKed.
	 * Hence, we can detect timed out packets during fast
	 * retransmit without falling to slow start.
	 */

	if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) {
		struct sk_buff *skb;

		skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
			: tcp_write_queue_head(sk);

    		tcp_for_write_queue_from(skb, sk) {
    			if (skb == tcp_send_head(sk))
    				break;
    
    			if (!tcp_skb_timedout(sk, skb))
    				break;
    
    			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out += tcp_skb_pcount(skb);
				tcp_verify_retransmit_hint(tp, skb);
    			}
    		}

		tp->scoreboard_skb_hint = skb;
    	}
    }
    
    /* CWND moderation, preventing bursts due to too big ACKs
     * in dubious situations.
     */
    static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
    {
    	tp->snd_cwnd = min(tp->snd_cwnd,
    			   tcp_packets_in_flight(tp)+tcp_max_burst(tp));
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    }
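
/* F.e. if 4 segments are in flight and tcp_max_burst() permits a burst
 * of 3, snd_cwnd is clamped to at most 7, so even a huge cumulative ACK
 * cannot release more than 3 back-to-back segments. The numbers are
 * illustrative only.
 */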
    
    
/* Lower bound on congestion window is slow start threshold
 * unless congestion avoidance choice decides to override it.
 */
    static inline u32 tcp_cwnd_min(const struct sock *sk)
    {
    	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
    
    	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
    }

/* Decrease cwnd each second ack. */
static void tcp_cwnd_down(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int decr = tp->snd_cwnd_cnt + 1;

	if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) ||
	    (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) {
		tp->snd_cwnd_cnt = decr&1;
		decr >>= 1;

		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
			tp->snd_cwnd -= decr;

		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
		tp->snd_cwnd_stamp = tcp_time_stamp;
	}
    }
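
/* F.e. starting from snd_cwnd_cnt = 0, the first qualifying ACK makes
 * decr = 1, which leaves snd_cwnd_cnt = 1 and decr = 0 after the halving,
 * so nothing is subtracted; the second ACK makes decr = 2, which halves
 * to 1 and resets the counter, so snd_cwnd drops by one: one decrement
 * per two ACKs, as the comment above promises.
 */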
    
    /* Nothing was retransmitted or returned timestamp is less
     * than timestamp of the first retransmission.
     */
    static inline int tcp_packet_delayed(struct tcp_sock *tp)
    {
    	return !tp->retrans_stamp ||
    		(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
    		 (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
    }
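
/* E.g. if the peer echoes a timestamp (rcv_tsecr) taken from a transmit
 * that predates the first retransmission (retrans_stamp), the ACK must
 * have been triggered by the original skb, hinting that the retransmit
 * was spurious; the undo procedures below build on exactly this test.
 */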
    
    /* Undo procedures. */
    
    #if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, const char *msg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);

	printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
	       msg,
	       NIPQUAD(inet->daddr), ntohs(inet->dport),
	       tp->snd_cwnd, tcp_left_out(tp),
	       tp->snd_ssthresh, tp->prior_ssthresh,
	       tp->packets_out);
    }
    #else
    #define DBGUNDO(x...) do { } while (0)
    #endif
    
static void tcp_undo_cwr(struct sock *sk, const int undo)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->prior_ssthresh) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

    		if (icsk->icsk_ca_ops->undo_cwnd)
    			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);