tcp_input.c
    				return;
    			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
    				length--;
    				continue;
    			default:
    				opsize=*ptr++;
    				if (opsize < 2) /* "silly options" */
    					return;
    				if (opsize > length)
    					return;	/* don't parse partial options */
    	  			switch(opcode) {
    				case TCPOPT_MSS:
    					if(opsize==TCPOLEN_MSS && th->syn && !estab) {
    						u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
    						if (in_mss) {
    							if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
    								in_mss = opt_rx->user_mss;
    							opt_rx->mss_clamp = in_mss;
    						}
    					}
    					break;
    				case TCPOPT_WINDOW:
    					if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
    						if (sysctl_tcp_window_scaling) {
    							__u8 snd_wscale = *(__u8 *) ptr;
    							opt_rx->wscale_ok = 1;
    							if (snd_wscale > 14) {
    								if(net_ratelimit())
    									printk(KERN_INFO "tcp_parse_options: Illegal window "
    									       "scaling value %d >14 received.\n",
    									       snd_wscale);
    								snd_wscale = 14;
    							}
    							opt_rx->snd_wscale = snd_wscale;
    						}
    					break;
    				case TCPOPT_TIMESTAMP:
    					if(opsize==TCPOLEN_TIMESTAMP) {
    						if ((estab && opt_rx->tstamp_ok) ||
    						    (!estab && sysctl_tcp_timestamps)) {
    							opt_rx->saw_tstamp = 1;
    							opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
    							opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
    						}
    					}
    					break;
    				case TCPOPT_SACK_PERM:
    					if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
    						if (sysctl_tcp_sack) {
    							opt_rx->sack_ok = 1;
    							tcp_sack_reset(opt_rx);
    						}
    					}
    					break;
    
    				case TCPOPT_SACK:
    					if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
    					   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
    					   opt_rx->sack_ok) {
    						TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
    					}
    	  			};
    	  			ptr+=opsize-2;
    	  			length-=opsize;
    	  	};
    	}
    }
    
    /* Fast parse options. This hopes to only see timestamps.
     * If it is wrong it falls back on tcp_parse_options().
     */
    static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
    					 struct tcp_sock *tp)
    {
    	if (th->doff == sizeof(struct tcphdr)>>2) {
    		tp->rx_opt.saw_tstamp = 0;
    		return 0;
    	} else if (tp->rx_opt.tstamp_ok &&
    		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
    		__u32 *ptr = (__u32 *)(th + 1);
    		if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
    				  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
    			tp->rx_opt.saw_tstamp = 1;
    			++ptr;
    			tp->rx_opt.rcv_tsval = ntohl(*ptr);
    			++ptr;
    			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
    			return 1;
    		}
    	}
    	tcp_parse_options(skb, &tp->rx_opt, 1);
    	return 1;
    }
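
    /* For illustration (not used by the code above): in the fast path the only
     * options expected are two NOPs followed by the timestamp option, so the
     * first 32-bit option word is, in network byte order,
     *
     *	(TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP
     *	= (1 << 24) | (1 << 16) | (8 << 8) | 10 = 0x0101080a,
     *
     * and the two following words carry TSval and TSecr.  Any other layout falls
     * back to the full tcp_parse_options() walk.
     */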
    
    static inline void tcp_store_ts_recent(struct tcp_sock *tp)
    {
    	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
    	tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
    }
    
    static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
    {
    	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
		/* PAWS bug workaround wrt. ACK frames: the extra PAWS-discard
		 * check below makes sure this can only happen
		 * for pure ACK frames.  -DaveM
		 *
		 * Not only that; it also occurs for expired timestamps.
    		 */
    
    		if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
    		   xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
    			tcp_store_ts_recent(tp);
    	}
    }
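
    /* Illustrative note (added for clarity): the (s32)(a - b) >= 0 test above is
     * the usual modular-arithmetic way to ask "is a not older than b?" under
     * 32-bit wraparound.  E.g. with rcv_tsval = 0x00000005 and ts_recent =
     * 0xfffffff0, the unsigned difference is 0x15, the signed result is +21,
     * and the newer (wrapped) timestamp is accepted.
     */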
    
    /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
     *
     * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
     * it can pass through the stack. So, the following predicate verifies that
     * this segment is not used for anything but congestion avoidance or
     * fast retransmit. Moreover, we are even able to eliminate most of such
     * second order effects, if we apply some small "replay" window (~RTO)
     * to timestamp space.
     *
     * All these measures still do not guarantee that we reject wrapped ACKs
     * on high-bandwidth networks, where sequence space is recycled quickly,
     * but they guarantee that such events will be very rare and will not affect
     * the connection seriously. This doesn't look nice, but alas, PAWS is really
     * a buggy extension.
     *
     * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
     * states that events when retransmit arrives after original data are rare.
     * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
     * the biggest problem on large power networks even with minor reordering.
     * OK, let's give it a small replay window. If the peer clock is even 1 Hz,
     * it is safe up to a bandwidth of 18 Gigabit/sec. 8) ]
     */
    
    static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
    {
    	struct tcphdr *th = skb->h.th;
    	u32 seq = TCP_SKB_CB(skb)->seq;
    	u32 ack = TCP_SKB_CB(skb)->ack_seq;
    
    	return (/* 1. Pure ACK with correct sequence number. */
    		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
    
    		/* 2. ... and duplicate ACK. */
    		ack == tp->snd_una &&
    
    		/* 3. ... and does not update window. */
    		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
    
    		/* 4. ... and sits in replay window. */
    		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
    }
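
    /* Worked example for check 4 (added for illustration; assumes tp->rto is in
     * jiffies, so (tp->rto * 1024) / HZ is roughly the RTO in milliseconds):
     * with HZ = 1000 and rto = 200 jiffies, a timestamp lagging ts_recent by up
     * to about 205 peer ticks still satisfies the replay-window check; together
     * with checks 1-3 this keeps such a duplicate ACK from being PAWS-discarded.
     */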
    
    static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
    {
    	return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
    		xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
    		!tcp_disordered_ack(tp, skb));
    }
    
    /* Check segment sequence number for validity.
     *
     * Segment controls are considered valid if the segment
     * fits within the window after truncation to the window. Acceptability
     * of data (and SYN, FIN, of course) is checked separately.
     * See tcp_data_queue(), for example.
     *
     * Also, controls (RST is the main one) are accepted using RCV.WUP instead
     * of RCV.NXT. The peer has not yet advanced its SND.UNA when we
     * delayed the ACK, so his SND.UNA <= our RCV.WUP.
     * (borrowed from freebsd)
     */
    
    static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
    {
    	return	!before(end_seq, tp->rcv_wup) &&
    		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
    }
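
    /* Illustration: with rcv_wup = 1000, rcv_nxt = 1500 and a receive window of
     * 1000 bytes, segments with end_seq >= 1000 and seq <= 2500 pass this check;
     * finer acceptability of their data is decided later, e.g. in
     * tcp_data_queue().
     */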
    
    /* When we get a reset we do this. */
    static void tcp_reset(struct sock *sk)
    {
    	/* We want the right error as BSD sees it (and indeed as we do). */
    	switch (sk->sk_state) {
    		case TCP_SYN_SENT:
    			sk->sk_err = ECONNREFUSED;
    			break;
    		case TCP_CLOSE_WAIT:
    			sk->sk_err = EPIPE;
    			break;
    		case TCP_CLOSE:
    			return;
    		default:
    			sk->sk_err = ECONNRESET;
    	}
    
    	if (!sock_flag(sk, SOCK_DEAD))
    		sk->sk_error_report(sk);
    
    	tcp_done(sk);
    }
    
    /*
     * 	Process the FIN bit. This now behaves as it is supposed to work
     *	and the FIN takes effect when it is validly part of sequence
     *	space, not earlier, while we still have holes.
     *
     *	If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
     *	(and thence onto LAST-ACK and finally, CLOSE; we never enter
     *	TIME-WAIT)
     *
     *	If we are in FINWAIT-1, a received FIN indicates a simultaneous
     *	close and we go into CLOSING (and later onto TIME-WAIT)
     *
     *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
     */
    static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	tcp_schedule_ack(tp);
    
    	sk->sk_shutdown |= RCV_SHUTDOWN;
    	sock_set_flag(sk, SOCK_DONE);
    
    	switch (sk->sk_state) {
    		case TCP_SYN_RECV:
    		case TCP_ESTABLISHED:
    			/* Move to CLOSE_WAIT */
    			tcp_set_state(sk, TCP_CLOSE_WAIT);
    			tp->ack.pingpong = 1;
    			break;
    
    		case TCP_CLOSE_WAIT:
    		case TCP_CLOSING:
    			/* Received a retransmission of the FIN, do
    			 * nothing.
    			 */
    			break;
    		case TCP_LAST_ACK:
    			/* RFC793: Remain in the LAST-ACK state. */
    			break;
    
    		case TCP_FIN_WAIT1:
    			/* This case occurs when a simultaneous close
			 * happens; we must ACK the received FIN and
    			 * enter the CLOSING state.
    			 */
    			tcp_send_ack(sk);
    			tcp_set_state(sk, TCP_CLOSING);
    			break;
    		case TCP_FIN_WAIT2:
    			/* Received a FIN -- send ACK and enter TIME_WAIT. */
    			tcp_send_ack(sk);
    			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
    			break;
    		default:
    			/* Only TCP_LISTEN and TCP_CLOSE are left, in these
    			 * cases we should never reach this piece of code.
    			 */
    			printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
    			       __FUNCTION__, sk->sk_state);
    			break;
    	};
    
	/* It _is_ possible that we have something out-of-order _after_ FIN.
    	 * Probably, we should reset in this case. For now drop them.
    	 */
    	__skb_queue_purge(&tp->out_of_order_queue);
    	if (tp->rx_opt.sack_ok)
    		tcp_sack_reset(&tp->rx_opt);
    	sk_stream_mem_reclaim(sk);
    
    	if (!sock_flag(sk, SOCK_DEAD)) {
    		sk->sk_state_change(sk);
    
    		/* Do not send POLL_HUP for half duplex close. */
    		if (sk->sk_shutdown == SHUTDOWN_MASK ||
    		    sk->sk_state == TCP_CLOSE)
    			sk_wake_async(sk, 1, POLL_HUP);
    		else
    			sk_wake_async(sk, 1, POLL_IN);
    	}
    }
    
    static __inline__ int
    tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
    {
    	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
    		if (before(seq, sp->start_seq))
    			sp->start_seq = seq;
    		if (after(end_seq, sp->end_seq))
    			sp->end_seq = end_seq;
    		return 1;
    	}
    	return 0;
    }
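
    /* Illustration: tcp_sack_extend() merges overlapping or adjacent ranges.
     * With sp = [1000,2000) and a new block [2000,3000), seq equals sp->end_seq
     * so the ranges touch and sp grows to [1000,3000); a disjoint block such as
     * [2001,3000) would leave sp untouched and the function would return 0.
     */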
    
    static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
    {
    	if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
    		if (before(seq, tp->rcv_nxt))
    			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
    		else
    			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);
    
    		tp->rx_opt.dsack = 1;
    		tp->duplicate_sack[0].start_seq = seq;
    		tp->duplicate_sack[0].end_seq = end_seq;
    		tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
    	}
    }
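
    /* Note on the eff_sacks clamp above (added for clarity): TCP option space
     * is at most 40 bytes.  Each SACK block costs 8 bytes plus the 2-byte SACK
     * option header, and the aligned timestamp option costs another 12, so at
     * most 3 blocks fit when timestamps are in use and 4 when they are not --
     * hence the "4 - tstamp_ok" term.
     */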
    
    static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
    {
    	if (!tp->rx_opt.dsack)
    		tcp_dsack_set(tp, seq, end_seq);
    	else
    		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
    }
    
    static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
    	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
    		tcp_enter_quickack_mode(tp);
    
    		if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
    			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    
    			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
    				end_seq = tp->rcv_nxt;
    			tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
    		}
    	}
    
    	tcp_send_ack(sk);
    }
    
    /* These routines update the SACK block as out-of-order packets arrive or
     * in-order packets close up the sequence space.
     */
    static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
    {
    	int this_sack;
    	struct tcp_sack_block *sp = &tp->selective_acks[0];
    	struct tcp_sack_block *swalk = sp+1;
    
	/* See if the recent change to the first SACK eats into
	 * or hits the sequence space of other SACK blocks; if so, coalesce.
	 */
    	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
    		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
    			int i;
    
    			/* Zap SWALK, by moving every further SACK up by one slot.
    			 * Decrease num_sacks.
    			 */
    			tp->rx_opt.num_sacks--;
    			tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
    			for(i=this_sack; i < tp->rx_opt.num_sacks; i++)
    				sp[i] = sp[i+1];
    			continue;
    		}
    		this_sack++, swalk++;
    	}
    }
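
    /* Illustration: if selective_acks holds [100,300) [250,400) [500,600) and
     * the first block was just extended, the second block overlaps it, so it is
     * merged into [100,400); [500,600) then slides up into slot 1 and num_sacks
     * drops to 2.
     */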
    
    static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
    {
    	__u32 tmp;
    
    	tmp = sack1->start_seq;
    	sack1->start_seq = sack2->start_seq;
    	sack2->start_seq = tmp;
    
    	tmp = sack1->end_seq;
    	sack1->end_seq = sack2->end_seq;
    	sack2->end_seq = tmp;
    }
    
    static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct tcp_sack_block *sp = &tp->selective_acks[0];
    	int cur_sacks = tp->rx_opt.num_sacks;
    	int this_sack;
    
    	if (!cur_sacks)
    		goto new_sack;
    
    	for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
    		if (tcp_sack_extend(sp, seq, end_seq)) {
    			/* Rotate this_sack to the first one. */
    			for (; this_sack>0; this_sack--, sp--)
    				tcp_sack_swap(sp, sp-1);
    			if (cur_sacks > 1)
    				tcp_sack_maybe_coalesce(tp);
    			return;
    		}
    	}
    
    	/* Could not find an adjacent existing SACK, build a new one,
    	 * put it at the front, and shift everyone else down.  We
    	 * always know there is at least one SACK present already here.
    	 *
    	 * If the sack array is full, forget about the last one.
    	 */
    	if (this_sack >= 4) {
    		this_sack--;
    		tp->rx_opt.num_sacks--;
    		sp--;
    	}
    	for(; this_sack > 0; this_sack--, sp--)
    		*sp = *(sp-1);
    
    new_sack:
    	/* Build the new head SACK, and we're done. */
    	sp->start_seq = seq;
    	sp->end_seq = end_seq;
    	tp->rx_opt.num_sacks++;
    	tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
    }
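
    /* The rotation above keeps the most recently changed block in
     * selective_acks[0]: RFC 2018 asks the data receiver to list the block
     * containing the most recently received segment first, so the sender gets
     * the freshest information even when option space later becomes scarce.
     */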
    
    /* RCV.NXT advances, some SACKs should be eaten. */
    
    static void tcp_sack_remove(struct tcp_sock *tp)
    {
    	struct tcp_sack_block *sp = &tp->selective_acks[0];
    	int num_sacks = tp->rx_opt.num_sacks;
    	int this_sack;
    
    	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
    	if (skb_queue_len(&tp->out_of_order_queue) == 0) {
    		tp->rx_opt.num_sacks = 0;
    		tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
    		return;
    	}
    
    	for(this_sack = 0; this_sack < num_sacks; ) {
    		/* Check if the start of the sack is covered by RCV.NXT. */
    		if (!before(tp->rcv_nxt, sp->start_seq)) {
    			int i;
    
			/* RCV.NXT must cover the whole block! */
    			BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq));
    
    			/* Zap this SACK, by moving forward any other SACKS. */
    			for (i=this_sack+1; i < num_sacks; i++)
    				tp->selective_acks[i-1] = tp->selective_acks[i];
    			num_sacks--;
    			continue;
    		}
    		this_sack++;
    		sp++;
    	}
    	if (num_sacks != tp->rx_opt.num_sacks) {
    		tp->rx_opt.num_sacks = num_sacks;
    		tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
    	}
    }
    
    /* This one checks to see if we can put data from the
     * out_of_order queue into the receive_queue.
     */
    static void tcp_ofo_queue(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	__u32 dsack_high = tp->rcv_nxt;
    	struct sk_buff *skb;
    
    	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
    		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
    			break;
    
    		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
    			__u32 dsack = dsack_high;
    			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
    				dsack_high = TCP_SKB_CB(skb)->end_seq;
    			tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack);
    		}
    
    		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
    			SOCK_DEBUG(sk, "ofo packet was already received \n");
    			__skb_unlink(skb, skb->list);
    			__kfree_skb(skb);
    			continue;
    		}
    		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
    			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
    			   TCP_SKB_CB(skb)->end_seq);
    
    		__skb_unlink(skb, skb->list);
    		__skb_queue_tail(&sk->sk_receive_queue, skb);
    		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    		if(skb->h.th->fin)
    			tcp_fin(skb, sk, skb->h.th);
    	}
    }
    
    static int tcp_prune_queue(struct sock *sk);
    
    static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcphdr *th = skb->h.th;
    	struct tcp_sock *tp = tcp_sk(sk);
    	int eaten = -1;
    
    	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
    		goto drop;
    
    	__skb_pull(skb, th->doff*4);
    
    	TCP_ECN_accept_cwr(tp, skb);
    
    	if (tp->rx_opt.dsack) {
    		tp->rx_opt.dsack = 0;
    		tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
    						    4 - tp->rx_opt.tstamp_ok);
    	}
    
    	/*  Queue data for delivery to the user.
    	 *  Packets in sequence go to the receive queue.
    	 *  Out of sequence packets to the out_of_order_queue.
    	 */
    	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
    		if (tcp_receive_window(tp) == 0)
    			goto out_of_window;
    
    		/* Ok. In sequence. In window. */
    		if (tp->ucopy.task == current &&
    		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
    		    sock_owned_by_user(sk) && !tp->urg_data) {
    			int chunk = min_t(unsigned int, skb->len,
    							tp->ucopy.len);
    
    			__set_current_state(TASK_RUNNING);
    
    			local_bh_enable();
    			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
    				tp->ucopy.len -= chunk;
    				tp->copied_seq += chunk;
    				eaten = (chunk == skb->len && !th->fin);
    				tcp_rcv_space_adjust(sk);
    			}
    			local_bh_disable();
    		}
    
    		if (eaten <= 0) {
    queue_and_out:
    			if (eaten < 0 &&
    			    (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
    			     !sk_stream_rmem_schedule(sk, skb))) {
    				if (tcp_prune_queue(sk) < 0 ||
    				    !sk_stream_rmem_schedule(sk, skb))
    					goto drop;
    			}
    			sk_stream_set_owner_r(skb, sk);
    			__skb_queue_tail(&sk->sk_receive_queue, skb);
    		}
    		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    		if(skb->len)
    			tcp_event_data_recv(sk, tp, skb);
    		if(th->fin)
    			tcp_fin(skb, sk, th);
    
    		if (skb_queue_len(&tp->out_of_order_queue)) {
    			tcp_ofo_queue(sk);
    
			/* RFC 2581, section 4.2: SHOULD send an immediate ACK
			 * when a gap in the queue is filled.
			 */
    			if (!skb_queue_len(&tp->out_of_order_queue))
    				tp->ack.pingpong = 0;
    		}
    
    		if (tp->rx_opt.num_sacks)
    			tcp_sack_remove(tp);
    
    		tcp_fast_path_check(sk, tp);
    
    		if (eaten > 0)
    			__kfree_skb(skb);
    		else if (!sock_flag(sk, SOCK_DEAD))
    			sk->sk_data_ready(sk, 0);
    		return;
    	}
    
    	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
    		/* A retransmit, 2nd most common case.  Force an immediate ack. */
    		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
    		tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    
    out_of_window:
    		tcp_enter_quickack_mode(tp);
    		tcp_schedule_ack(tp);
    drop:
    		__kfree_skb(skb);
    		return;
    	}
    
    	/* Out of window. F.e. zero window probe. */
    	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
    		goto out_of_window;
    
    	tcp_enter_quickack_mode(tp);
    
    	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    		/* Partial packet, seq < rcv_next < end_seq */
    		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
    			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
    			   TCP_SKB_CB(skb)->end_seq);
    
    		tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
    		
		/* If the window is closed, drop the tail of the packet, but only
		 * after remembering the D-SACK for its head, set up on the
		 * previous line.
		 */
    		if (!tcp_receive_window(tp))
    			goto out_of_window;
    		goto queue_and_out;
    	}
    
    	TCP_ECN_check_ce(tp, skb);
    
    	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
    	    !sk_stream_rmem_schedule(sk, skb)) {
    		if (tcp_prune_queue(sk) < 0 ||
    		    !sk_stream_rmem_schedule(sk, skb))
    			goto drop;
    	}
    
    	/* Disable header prediction. */
    	tp->pred_flags = 0;
    	tcp_schedule_ack(tp);
    
    	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
    		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    
    	sk_stream_set_owner_r(skb, sk);
    
    	if (!skb_peek(&tp->out_of_order_queue)) {
    		/* Initial out of order segment, build 1 SACK. */
    		if (tp->rx_opt.sack_ok) {
    			tp->rx_opt.num_sacks = 1;
    			tp->rx_opt.dsack     = 0;
    			tp->rx_opt.eff_sacks = 1;
    			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
    			tp->selective_acks[0].end_seq =
    						TCP_SKB_CB(skb)->end_seq;
    		}
    		__skb_queue_head(&tp->out_of_order_queue,skb);
    	} else {
    		struct sk_buff *skb1 = tp->out_of_order_queue.prev;
    		u32 seq = TCP_SKB_CB(skb)->seq;
    		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    
    		if (seq == TCP_SKB_CB(skb1)->end_seq) {
    			__skb_append(skb1, skb);
    
    			if (!tp->rx_opt.num_sacks ||
    			    tp->selective_acks[0].end_seq != seq)
    				goto add_sack;
    
    			/* Common case: data arrive in order after hole. */
    			tp->selective_acks[0].end_seq = end_seq;
    			return;
    		}
    
    		/* Find place to insert this segment. */
    		do {
    			if (!after(TCP_SKB_CB(skb1)->seq, seq))
    				break;
    		} while ((skb1 = skb1->prev) !=
    			 (struct sk_buff*)&tp->out_of_order_queue);
    
		/* Does this skb overlap the previous one? */
    		if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
    		    before(seq, TCP_SKB_CB(skb1)->end_seq)) {
    			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    				/* All the bits are present. Drop. */
    				__kfree_skb(skb);
    				tcp_dsack_set(tp, seq, end_seq);
    				goto add_sack;
    			}
    			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
    				/* Partial overlap. */
    				tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
    			} else {
    				skb1 = skb1->prev;
    			}
    		}
    		__skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
    		
		/* And remove segments that are fully covered by the new one. */
    		while ((skb1 = skb->next) !=
    		       (struct sk_buff*)&tp->out_of_order_queue &&
    		       after(end_seq, TCP_SKB_CB(skb1)->seq)) {
    		       if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    			       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
    			       break;
    		       }
    		       __skb_unlink(skb1, skb1->list);
    		       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
    		       __kfree_skb(skb1);
    		}
    
    add_sack:
    		if (tp->rx_opt.sack_ok)
    			tcp_sack_new_ofo_skb(sk, seq, end_seq);
    	}
    }
    
    /* Collapse a contiguous sequence of skbs head..tail with
     * sequence numbers start..end.
     * Segments with FIN/SYN are not collapsed (only because this
     * simplifies the code).
     */
    static void
    tcp_collapse(struct sock *sk, struct sk_buff *head,
    	     struct sk_buff *tail, u32 start, u32 end)
    {
    	struct sk_buff *skb;
    
	/* First, check that the queue is collapsible and find
	 * the point where collapsing can be useful. */
    	for (skb = head; skb != tail; ) {
    		/* No new bits? It is possible on ofo queue. */
    		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
    			struct sk_buff *next = skb->next;
    			__skb_unlink(skb, skb->list);
    			__kfree_skb(skb);
    			NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
    			skb = next;
    			continue;
    		}
    
    		/* The first skb to collapse is:
    		 * - not SYN/FIN and
    		 * - bloated or contains data before "start" or
		 *   overlaps the next one.
    		 */
    		if (!skb->h.th->syn && !skb->h.th->fin &&
    		    (tcp_win_from_space(skb->truesize) > skb->len ||
    		     before(TCP_SKB_CB(skb)->seq, start) ||
    		     (skb->next != tail &&
    		      TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
    			break;
    
    		/* Decided to skip this, advance start seq. */
    		start = TCP_SKB_CB(skb)->end_seq;
    		skb = skb->next;
    	}
    	if (skb == tail || skb->h.th->syn || skb->h.th->fin)
    		return;
    
    	while (before(start, end)) {
    		struct sk_buff *nskb;
    		int header = skb_headroom(skb);
    		int copy = SKB_MAX_ORDER(header, 0);
    
		/* Header too big? This can happen with IPv6. */
    		if (copy < 0)
    			return;
    		if (end-start < copy)
    			copy = end-start;
    		nskb = alloc_skb(copy+header, GFP_ATOMIC);
    		if (!nskb)
    			return;
    		skb_reserve(nskb, header);
    		memcpy(nskb->head, skb->head, header);
    		nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
    		nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
    		nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
    		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
    		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
    		__skb_insert(nskb, skb->prev, skb, skb->list);
    		sk_stream_set_owner_r(nskb, sk);
    
    		/* Copy data, releasing collapsed skbs. */
    		while (copy > 0) {
    			int offset = start - TCP_SKB_CB(skb)->seq;
    			int size = TCP_SKB_CB(skb)->end_seq - start;
    
    			if (offset < 0) BUG();
    			if (size > 0) {
    				size = min(copy, size);
    				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
    					BUG();
    				TCP_SKB_CB(nskb)->end_seq += size;
    				copy -= size;
    				start += size;
    			}
    			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
    				struct sk_buff *next = skb->next;
    				__skb_unlink(skb, skb->list);
    				__kfree_skb(skb);
    				NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
    				skb = next;
    				if (skb == tail || skb->h.th->syn || skb->h.th->fin)
    					return;
    			}
    		}
    	}
    }
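
    /* Roughly what the loop above achieves (illustrative summary): the payload
     * of several small, "bloated" skbs (truesize much larger than len) is copied
     * into freshly allocated skbs holding up to about a page of data each, so
     * the same byte range occupies far less memory; the header room, cb[] and
     * header offsets are copied so the new skbs look like the old ones.
     */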
    
    /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
     * and tcp_collapse() them until all the queue is collapsed.
     */
    static void tcp_collapse_ofo_queue(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
    	struct sk_buff *head;
    	u32 start, end;
    
    	if (skb == NULL)
    		return;
    
    	start = TCP_SKB_CB(skb)->seq;
    	end = TCP_SKB_CB(skb)->end_seq;
    	head = skb;
    
    	for (;;) {
    		skb = skb->next;
    
		/* The segment is terminated when we see a gap or when
		 * we reach the end of the queue. */
    		if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
    		    after(TCP_SKB_CB(skb)->seq, end) ||
    		    before(TCP_SKB_CB(skb)->end_seq, start)) {
    			tcp_collapse(sk, head, skb, start, end);
    			head = skb;
    			if (skb == (struct sk_buff *)&tp->out_of_order_queue)
    				break;
    			/* Start new segment */
    			start = TCP_SKB_CB(skb)->seq;
    			end = TCP_SKB_CB(skb)->end_seq;
    		} else {
    			if (before(TCP_SKB_CB(skb)->seq, start))
    				start = TCP_SKB_CB(skb)->seq;
    			if (after(TCP_SKB_CB(skb)->end_seq, end))
    				end = TCP_SKB_CB(skb)->end_seq;
    		}
    	}
    }
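
    /* Illustration: if the ofo queue holds [100,200) [150,250) [400,500), the
     * first two overlap and form one run with start = 100, end = 250, which is
     * handed to tcp_collapse(); the gap before 400 then starts a new run
     * covering [400,500).
     */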
    
    /* Reduce allocated memory if we can, trying to get
     * the socket within its memory limits again.
     *
     * Return less than zero if we should start dropping frames
     * until the socket owning process reads some of the data
     * to stabilize the situation.
     */
    static int tcp_prune_queue(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk); 
    
    	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
    
    	NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
    
    	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
    		tcp_clamp_window(sk, tp);
    	else if (tcp_memory_pressure)
    		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
    
    	tcp_collapse_ofo_queue(sk);
    	tcp_collapse(sk, sk->sk_receive_queue.next,
    		     (struct sk_buff*)&sk->sk_receive_queue,
    		     tp->copied_seq, tp->rcv_nxt);
    	sk_stream_mem_reclaim(sk);
    
    	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    		return 0;
    
	/* Collapsing did not help, so destructive actions follow.
	 * This should never happen. */
    
    	/* First, purge the out_of_order queue. */
    	if (skb_queue_len(&tp->out_of_order_queue)) {
    		NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 
    				 skb_queue_len(&tp->out_of_order_queue));
    		__skb_queue_purge(&tp->out_of_order_queue);
    
		/* Reset SACK state.  A conforming SACK implementation will
		 * do the same at a timeout-based retransmit.  When a connection
		 * is in a sad state like this, we care only about the integrity
		 * of the connection, not about performance.
		 */
    		if (tp->rx_opt.sack_ok)
    			tcp_sack_reset(&tp->rx_opt);
    		sk_stream_mem_reclaim(sk);
    	}
    
    	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    		return 0;
    
    	/* If we are really being abused, tell the caller to silently
    	 * drop receive data on the floor.  It will get retransmitted
    	 * and hopefully then we'll have sufficient space.
    	 */
    	NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
    
    	/* Massive buffer overcommit. */
    	tp->pred_flags = 0;
    	return -1;
    }
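
    /* Summary of the escalation above (added for clarity): (1) clamp the window
     * or rcv_ssthresh, (2) collapse the ofo and receive queues, (3) purge the
     * ofo queue entirely and reset SACK state, and only if memory is still over
     * the limit (4) return -1 so the caller silently drops the incoming data.
     */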
    
    
    /* RFC2861, slow part. Adjust cwnd after it was not fully used during one RTO.
     * As additional protection, we do not touch cwnd in retransmission phases,
     * or if the application hit its sndbuf limit recently.
     */
    void tcp_cwnd_application_limited(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (tp->ca_state == TCP_CA_Open &&
    	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
    		/* Limited by application or receiver window. */
    		u32 win_used = max(tp->snd_cwnd_used, 2U);
    		if (win_used < tp->snd_cwnd) {
    			tp->snd_ssthresh = tcp_current_ssthresh(tp);
    			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
    		}
    		tp->snd_cwnd_used = 0;
    	}
    	tp->snd_cwnd_stamp = tcp_time_stamp;
    }
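
    /* Worked example of the RFC 2861 decay above: with snd_cwnd = 20 and
     * snd_cwnd_used = 8, win_used = 8 and the new snd_cwnd becomes
     * (20 + 8) >> 1 = 14; snd_ssthresh is refreshed from tcp_current_ssthresh()
     * first so the previous operating point is not forgotten.
     */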
    
    
    /* When an incoming ACK allowed us to free some skb from the write_queue,
     * we remember this event in the SOCK_QUEUE_SHRUNK flag and wake up the
     * socket on exit from the TCP input handler.
     *
     * PROBLEM: sndbuf expansion does not work well with largesend.
     */
    static void tcp_new_space(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (tp->packets_out < tp->snd_cwnd &&
    	    !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
    	    !tcp_memory_pressure &&
    	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
     		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
    			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
    		    demanded = max_t(unsigned int, tp->snd_cwnd,
    						   tp->reordering + 1);
    		sndmem *= 2*demanded;
    		if (sndmem > sk->sk_sndbuf)
    			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
    		tp->snd_cwnd_stamp = tcp_time_stamp;
    	}
    
    	sk->sk_write_space(sk);
    }
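
    /* Note on the sizing above: sndmem approximates the memory cost of one
     * maximally sized segment (clamped MSS plus maximum header room plus sk_buff
     * overhead); it is scaled by 2 * max(snd_cwnd, reordering + 1) so the send
     * buffer can hold roughly two windows of in-flight data, and the result is
     * still capped by sysctl_tcp_wmem[2].
     */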
    
    static inline void tcp_check_space(struct sock *sk)
    {
    	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
    		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
    		if (sk->sk_socket &&
    		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
    			tcp_new_space(sk);
    	}
    }
    
    static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
    	    tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
    	    tcp_write_xmit(sk, tp->nonagle))
    		tcp_check_probe_timer(sk, tp);
    }
    
    static __inline__ void tcp_data_snd_check(struct sock *sk)
    {
    	struct sk_buff *skb = sk->sk_send_head;
    
    	if (skb != NULL)
    		__tcp_data_snd_check(sk, skb);
    	tcp_check_space(sk);
    }
    
    /*
     * Check if sending an ack is needed.
     */
    static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)