Newer
Older
break;
case TCPOLEN_COOKIE_PAIR:
/* not yet implemented */
break;
case TCPOLEN_COOKIE_MIN+0:
case TCPOLEN_COOKIE_MIN+2:
case TCPOLEN_COOKIE_MIN+4:
case TCPOLEN_COOKIE_MIN+6:
case TCPOLEN_COOKIE_MAX:
/* 16-bit multiple */
opt_rx->cookie_plus = opsize;
*hvpp = ptr;
default:
/* ignore option */
break;
ptr += opsize-2;
length -= opsize;
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
const __be32 *ptr = (const __be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
tp->rx_opt.rcv_tsecr = ntohl(*ptr);
return 1;
}
return 0;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
static int tcp_fast_parse_options(const struct sk_buff *skb,
const struct tcphdr *th,
struct tcp_sock *tp, const u8 **hvpp)
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
#ifdef CONFIG_TCP_MD5SIG
/*
* Parse MD5 Signature option
*/
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
/* If the TCP option is too short, we can short cut */
if (length < TCPOLEN_MD5SIG)
return NULL;
while (length > 0) {
int opcode = *ptr++;
int opsize;
switch(opcode) {
case TCPOPT_EOL:
return NULL;
case TCPOPT_NOP:
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2 || opsize > length)
return NULL;
if (opcode == TCPOPT_MD5SIG)
return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
}
ptr += opsize - 2;
length -= opsize;
}
return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
tp->rx_opt.ts_recent_stamp = get_seconds();
}
static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
* for pure ACK frames. -DaveM
*
* Not only, also it occurs for expired timestamps.
*/
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
tcp_store_ts_recent(tp);
}
}
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
* it can pass through stack. So, the following predicate verifies that
* this segment is not used for anything but congestion avoidance or
* fast retransmit. Moreover, we even are able to eliminate most of such
* second order effects, if we apply some small "replay" window (~RTO)
* to timestamp space.
*
* All these measures still do not guarantee that we reject wrapped ACKs
* on networks with high bandwidth, when sequence space is recycled fastly,
* but it guarantees that such events will be very rare and do not affect
* connection seriously. This doesn't look nice, but alas, PAWS is really
* buggy extension.
*
* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
* states that events when retransmit arrives after original data are rare.
* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
* the biggest problem on large power networks even with minor reordering.
* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
return (/* 1. Pure ACK with correct sequence number. */
(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
/* 2. ... and duplicate ACK. */
ack == tp->snd_una &&
/* 3. ... and does not update window. */
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
static inline int tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
const struct tcp_sock *tp = tcp_sk(sk);
return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
!tcp_disordered_ack(sk, skb);
}
/* Check segment sequence number for validity.
*
* Segment controls are considered valid, if the segment
* fits to the window after truncation to the window. Acceptability
* of data (and SYN, FIN, of course) is checked separately.
* See tcp_data_queue(), for example.
*
* Also, controls (RST is main one) are accepted using RCV.WUP instead
* of RCV.NXT. Peer still did not advance his SND.UNA when we
* delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
* (borrowed from freebsd)
*/
static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
return !before(end_seq, tp->rcv_wup) &&
!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}
/* When we get a reset we do this. */
static void tcp_reset(struct sock *sk)
{
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
sk->sk_err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
sk->sk_err = EPIPE;
break;
case TCP_CLOSE:
return;
default:
sk->sk_err = ECONNRESET;
/* This barrier is coupled with smp_rmb() in tcp_poll() */
smp_wmb();
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
tcp_done(sk);
}
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
* space. Not before when we get holes.
*
* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
* (and thence onto LAST-ACK and finally, CLOSE, we never enter
* TIME-WAIT)
*
* If we are in FINWAIT-1, a received FIN indicates simultaneous
* close and we go into CLOSING (and later onto TIME-WAIT)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
static void tcp_fin(struct sock *sk)
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/* Received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*/
tcp_send_ack(sk);
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
__skb_queue_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
return 1;
}
return 0;
}
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
int mib_idx;
mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
}
}
static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
struct tcp_sock *tp = tcp_sk(sk);
tcp_dsack_set(sk, seq, end_seq);
else
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
end_seq = tp->rcv_nxt;
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
}
}
tcp_send_ack(sk);
}
/* These routines update the SACK block as out-of-order packets arrive or
* in-order packets close up the sequence space.
*/
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
int this_sack;
struct tcp_sack_block *sp = &tp->selective_acks[0];
struct tcp_sack_block *swalk = sp + 1;
/* See if the recent change to the first SACK eats into
* or hits the sequence space of other SACK blocks, if so coalesce.
*/
for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
int i;
/* Zap SWALK, by moving every further SACK up by one slot.
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
sp[i] = sp[i + 1];
continue;
}
this_sack++, swalk++;
}
}
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sack_block *sp = &tp->selective_acks[0];
int cur_sacks = tp->rx_opt.num_sacks;
int this_sack;
if (!cur_sacks)
goto new_sack;
for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) {
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
}
}
/* Could not find an adjacent existing SACK, build a new one,
* put it at the front, and shift everyone else down. We
* always know there is at least one SACK present already here.
*
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
}
new_sack:
/* Build the new head SACK, and we're done. */
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
}
/* RCV.NXT advances, some SACKs should be eaten. */
static void tcp_sack_remove(struct tcp_sock *tp)
{
struct tcp_sack_block *sp = &tp->selective_acks[0];
int num_sacks = tp->rx_opt.num_sacks;
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
if (skb_queue_empty(&tp->out_of_order_queue)) {
for (this_sack = 0; this_sack < num_sacks;) {
/* Check if the start of the sack is covered by RCV.NXT. */
if (!before(tp->rcv_nxt, sp->start_seq)) {
int i;
/* RCV.NXT must cover all the block! */
WARN_ON(before(tp->rcv_nxt, sp->end_seq));
/* Zap this SACK, by moving forward any other SACKS. */
for (i=this_sack+1; i < num_sacks; i++)
tp->selective_acks[i-1] = tp->selective_acks[i];
num_sacks--;
continue;
}
this_sack++;
sp++;
}
tp->rx_opt.num_sacks = num_sacks;
}
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
struct sk_buff *skb;
while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
__u32 dsack = dsack_high;
if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
dsack_high = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
__kfree_skb(skb);
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tcp_hdr(skb)->fin)
static int tcp_prune_ofo_queue(struct sock *sk);
static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, size)) {
if (tcp_prune_queue(sk) < 0)
return -1;
if (!sk_rmem_schedule(sk, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
if (!sk_rmem_schedule(sk, size))
return -1;
}
}
return 0;
}
/**
* tcp_try_coalesce - try to merge skb to prior one
* @sk: socket
* @to: prior buffer
* @from: buffer to add in queue
* @fragstolen: pointer to boolean
*
* Before queueing skb @from after @to, try to merge them
* to reduce overall memory use and queue lengths, if cost is small.
* Packets in ofo or receive queues can stay a long time.
* Better try to coalesce them right now to avoid future collapses.
* Returns true if caller should free @from instead of queueing it
static bool tcp_try_coalesce(struct sock *sk,
struct sk_buff *to,
struct sk_buff *from,
bool *fragstolen)
int i, delta, len = from->len;
*fragstolen = false;
if (tcp_hdr(from)->fin || skb_cloned(to))
if (len <= skb_tailroom(to)) {
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
if (skb_has_frag_list(to) || skb_has_frag_list(from))
return false;
if (skb_headlen(from) != 0) {
struct page *page;
unsigned int offset;
if (skb_shinfo(to)->nr_frags +
skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
return false;
if (skb_head_is_locked(from))
return false;
delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
page = virt_to_head_page(from->head);
offset = from->data - (unsigned char *)page_address(page);
skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
page, offset, skb_headlen(from));
*fragstolen = true;
} else {
if (skb_shinfo(to)->nr_frags +
skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
return false;
delta = from->truesize -
SKB_TRUESIZE(skb_end_pointer(from) - from->head);
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
WARN_ON_ONCE(delta < len);
memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
skb_shinfo(from)->frags,
skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
if (!skb_cloned(from))
skb_shinfo(from)->nr_frags = 0;
/* if the skb is cloned this does nothing since we set nr_frags to 0 */
for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
skb_frag_ref(from, i);
to->truesize += delta;
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
to->len += len;
to->data_len += len;
merge:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
return true;
static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen)
kmem_cache_free(skbuff_head_cache, skb);
else
__kfree_skb(skb);
}
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb1;
u32 seq, end_seq;
TCP_ECN_check_ce(tp, skb);
if (tcp_try_rmem_schedule(sk, skb->truesize)) {
/* TODO: should increment a counter */
__kfree_skb(skb);
return;
}
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
skb1 = skb_peek_tail(&tp->out_of_order_queue);
if (!skb1) {
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
}
__skb_queue_head(&tp->out_of_order_queue, skb);
goto end;
}
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
if (seq == TCP_SKB_CB(skb1)->end_seq) {
bool fragstolen;
if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
} else {
kfree_skb_partial(skb, fragstolen);
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
if (!tp->rx_opt.num_sacks ||
tp->selective_acks[0].end_seq != seq)
goto add_sack;
/* Common case: data arrive in order after hole. */
tp->selective_acks[0].end_seq = end_seq;
goto end;
}
/* Find place to insert this segment. */
while (1) {
if (!after(TCP_SKB_CB(skb1)->seq, seq))
break;
if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
skb1 = NULL;
break;
}
skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
}
/* Do skb overlap to previous one? */
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
__kfree_skb(skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
}
if (after(seq, TCP_SKB_CB(skb1)->seq)) {
/* Partial overlap. */
tcp_dsack_set(sk, seq,
TCP_SKB_CB(skb1)->end_seq);
} else {
if (skb_queue_is_first(&tp->out_of_order_queue,
skb1))
skb1 = NULL;
else
skb1 = skb_queue_prev(
&tp->out_of_order_queue,
skb1);
}
}
if (!skb1)
__skb_queue_head(&tp->out_of_order_queue, skb);
else
__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
/* And clean segments covered by new one as whole. */
while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
end_seq);
break;
}
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
__kfree_skb(skb1);
}
add_sack:
if (tcp_is_sack(tp))
tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
if (skb)
skb_set_owner_r(skb, sk);
}
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
__skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
}
return eaten;
}
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
struct tcphdr *th;
bool fragstolen;
if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
goto err;
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
if (!skb)
goto err;
th = (struct tcphdr *)skb_put(skb, sizeof(*th));
skb_reset_transport_header(skb);
memset(th, 0, sizeof(*th));
if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
return size;
err_free:
kfree_skb(skb);
err:
return -ENOMEM;
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
bool fragstolen = false;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
* Out of sequence packets to the out_of_order_queue.
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0)
goto out_of_window;
/* Ok. In sequence. In window. */
if (tp->ucopy.task == current &&
tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
sock_owned_by_user(sk) && !tp->urg_data) {
int chunk = min_t(unsigned int, skb->len,
__set_current_state(TASK_RUNNING);
local_bh_enable();
if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
local_bh_disable();
}
if (eaten <= 0) {
queue_and_out:
if (eaten < 0 &&
tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
tcp_event_data_recv(sk, skb);
if (!skb_queue_empty(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (skb_queue_empty(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pingpong = 0;
}
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
else if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return;
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
drop:
__kfree_skb(skb);
return;
}
/* Out of window. F.e. zero window probe. */
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
tcp_enter_quickack_mode(sk);
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
if (!tcp_receive_window(tp))
goto out_of_window;
goto queue_and_out;
}
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *list)
{
struct sk_buff *next = NULL;
if (!skb_queue_is_last(list, skb))
next = skb_queue_next(list, skb);
__skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
* If tail is NULL, this means until the end of the list.
*
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list,
struct sk_buff *head, struct sk_buff *tail,
u32 start, u32 end)
struct sk_buff *skb, *n;
bool end_of_skbs;
/* First, check that queue is collapsible and find
skb = head;
restart:
end_of_skbs = true;
skb_queue_walk_from_safe(list, skb, n) {
if (skb == tail)
break;
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list);
if (!skb)
break;
goto restart;
}
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
* overlaps to the next one.
*/
if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
before(TCP_SKB_CB(skb)->seq, start))) {
end_of_skbs = false;
}
if (!skb_queue_is_last(list, skb)) {
struct sk_buff *next = skb_queue_next(list, skb);
if (next != tail &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
end_of_skbs = false;
break;
}
}
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
return;
while (before(start, end)) {
struct sk_buff *nskb;
unsigned int header = skb_headroom(skb);
int copy = SKB_MAX_ORDER(header, 0);
/* Too big header? This can happen with IPv6. */
if (copy < 0)
return;