Newer
Older
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
*
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/
/*
* Changes: Pedro Roque : Retransmit queue handled by TCP.
* : Fragmentation on mtu decrease
* : Segment collapse on retransmit
* : AF independence
*
* Linus Torvalds : send_delayed_ack
* David S. Miller : Charge memory using the right skb
* during syn/ack processing.
* David S. Miller : Output engine completely rewritten.
* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
* Cacophonix Gaul : draft-minshall-nagle-01
* J Hadi Salim : ECN support
*
*/
#include <net/tcp.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
int sysctl_tcp_tso_win_divisor = 3;
static void update_send_head(struct sock *sk, struct tcp_sock *tp,
struct sk_buff *skb)
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
{
sk->sk_send_head = skb->next;
if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
sk->sk_send_head = NULL;
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tcp_packets_out_inc(sk, tp, skb);
}
/* SND.NXT, if window was not shrunk.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
* invalid. OK, let's make this for now:
*/
static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
{
if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
return tp->snd_nxt;
else
return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
*
* 1. It is independent of path mtu.
* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
* attached devices, because some buggy hosts are confused by
* large MSS.
* 4. We do not make 3, we advertise MSS, calculated from first
* hop device mtu, but allow to raise it to ip_rt_min_advmss.
* This may be overridden via information stored in routing table.
* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
* probably even Jumbo".
*/
static __u16 tcp_advertise_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
int mss = tp->advmss;
if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
mss = dst_metric(dst, RTAX_ADVMSS);
tp->advmss = mss;
}
return (__u16)mss;
}
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
* This is the first part of cwnd validation mechanism. */
static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
struct tcp_sock *tp = tcp_sk(sk);
s32 delta = tcp_time_stamp - tp->lsndtime;
u32 restart_cwnd = tcp_init_cwnd(tp, dst);
u32 cwnd = tp->snd_cwnd;
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
tp->snd_ssthresh = tcp_current_ssthresh(sk);
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->snd_cwnd_used = 0;
}
static void tcp_event_data_sent(struct tcp_sock *tp,
struct sk_buff *skb, struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
tcp_cwnd_restart(sk, __sk_dst_get(sk));
tp->lsndtime = now;
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
icsk->icsk_ack.pingpong = 1;
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
}
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
* NOTE: for smooth operation initial space offering should
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
int wscale_ok, __u8 *rcv_wscale)
{
unsigned int space = (__space < 0 ? 0 : __space);
/* If no clamp set the clamp to the max possible scaled window */
if (*window_clamp == 0)
(*window_clamp) = (65535 << 14);
space = min(*window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
space = (space / mss) * mss;
/* NOTE: offering an initial window larger than 32767
* will break some buggy TCP stacks. We try to be nice.
* If we are not window scaling, then this truncates
* our initial window offering to 32k. There should also
* be a sysctl option to stop being nice.
*/
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
(*rcv_wscale) = 0;
if (wscale_ok) {
/* Set window scaling on max possible window
* See RFC1323 for an explanation of the limit to 14
*/
space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
while (space > 65535 && (*rcv_wscale) < 14) {
space >>= 1;
(*rcv_wscale)++;
}
}
/* Set initial window to value enough for senders,
* following RFC2414. Senders, not following this RFC,
* will be satisfied with 2.
*/
if (mss > (1<<*rcv_wscale)) {
int init_cwnd = 4;
if (mss > 1460*3)
else if (mss > 1460)
init_cwnd = 3;
if (*rcv_wnd > init_cwnd*mss)
*rcv_wnd = init_cwnd*mss;
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
{
struct tcp_sock *tp = tcp_sk(sk);
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
/* Never shrink the offered window */
if(new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
* we will not be able to advertise a zero
* window in time. --DaveM
*
* Relax Will Robinson.
*/
new_win = cur_win;
}
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
if (!tp->rx_opt.rcv_wscale)
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
/* RFC1323 scaling applied */
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
if (new_win == 0)
tp->pred_flags = 0;
return new_win;
}
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
__u32 tstamp)
{
if (tp->rx_opt.tstamp_ok) {
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
*ptr++ = htonl(tstamp);
*ptr++ = htonl(tp->rx_opt.ts_recent);
}
if (tp->rx_opt.eff_sacks) {
struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
TCPOLEN_SACK_PERBLOCK)));
for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
*ptr++ = htonl(sp[this_sack].start_seq);
*ptr++ = htonl(sp[this_sack].end_seq);
}
if (tp->rx_opt.dsack) {
tp->rx_opt.dsack = 0;
tp->rx_opt.eff_sacks--;
}
}
}
/* Construct a tcp options header for a SYN or SYN_ACK packet.
* If this is every changed make sure to change the definition of
* MAX_SYN_SIZE to match the new maximum number of options that you
* can generate.
*/
static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
int offer_wscale, int wscale, __u32 tstamp,
__u32 ts_recent)
{
/* We always get an MSS option.
* The option bytes which will be seen in normal data
* packets should timestamps be used, must be in the MSS
* advertised. But we subtract them from tp->mss_cache so
* that calculations in tcp_sendmsg are simpler etc.
* So account for this fact here if necessary. If we
* don't do this correctly, as a receiver we won't
* recognize data packets as being full sized when we
* should, and thus we won't abide by the delayed ACK
* rules correctly.
* SACKs don't matter, we never delay an ACK when we
* have any of those going out.
*/
*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
if (ts) {
if(sack)
*ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
else
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
*ptr++ = htonl(tstamp); /* TSVAL */
*ptr++ = htonl(ts_recent); /* TSECR */
} else if(sack)
*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
if (offer_wscale)
*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
}
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
* All SKB's seen here are completely headerless. It is our
* job to build the TCP header, and pass the packet down to
* IP so it can do the same plus pass the packet off to the
* device.
*
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
int tcp_header_size;
struct tcphdr *th;
int sysctl_flags;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
/* If congestion control is doing timestamping, we must
* take such a timestamp before we potentially clone/copy.
*/
if (icsk->icsk_ca_ops->rtt_sample)
__net_timestamp(skb);
if (likely(clone_it)) {
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
}
inet = inet_sk(sk);
tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
tcp_header_size = tp->tcp_header_len;
#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4
sysctl_flags = 0;
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
if(sysctl_tcp_timestamps) {
tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
if (sysctl_tcp_window_scaling) {
tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
sysctl_flags |= SYSCTL_FLAG_WSCALE;
if (sysctl_tcp_sack) {
sysctl_flags |= SYSCTL_FLAG_SACK;
if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
} else if (unlikely(tp->rx_opt.eff_sacks)) {
/* A SACK is 2 pad bytes, a 2 byte header, plus
* 2 32-bit sequence numbers for each SACK block.
*/
tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
(tp->rx_opt.eff_sacks *
TCPOLEN_SACK_PERBLOCK));
}
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
skb_set_owner_w(skb, sk);
/* Build TCP header and checksum it. */
th->source = inet->sport;
th->dest = inet->dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->flags);
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(tp->rcv_wnd);
} else {
th->window = htons(tcp_select_window(sk));
}
th->check = 0;
th->urg_ptr = 0;
if (unlikely(tp->urg_mode &&
between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
th->urg_ptr = htons(tp->snd_up-tcb->seq);
th->urg = 1;
}
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
tcp_syn_build_options((__u32 *)(th + 1),
tcp_advertise_mss(sk),
(sysctl_flags & SYSCTL_FLAG_TSTAMPS),
(sysctl_flags & SYSCTL_FLAG_SACK),
(sysctl_flags & SYSCTL_FLAG_WSCALE),
tp->rx_opt.rcv_wscale,
tcb->when,
tp->rx_opt.ts_recent);
} else {
tcp_build_and_update_options((__u32 *)(th + 1),
tp, tcb->when);
TCP_ECN_send(sk, tp, skb, tcp_header_size);
}

Arnaldo Carvalho de Melo
committed
icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (likely(tcb->flags & TCPCB_FLAG_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, skb, sk);

Arnaldo Carvalho de Melo
committed
err = icsk->icsk_af_ops->queue_xmit(skb, 0);
if (unlikely(err <= 0))
return err;
tcp_enter_cwr(sk);
/* NET_XMIT_CN is special. It does not guarantee,
* that this packet is lost. It tells that device
* is about to start to drop packets or already
* drops some packets of the same priority and
* invokes us to send less aggressively.
*/
return err == NET_XMIT_CN ? 0 : err;
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
/* This routine just queue's the buffer
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
* otherwise socket can stall.
*/
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Advance write_seq and place onto the write_queue. */
tp->write_seq = TCP_SKB_CB(skb)->end_seq;
skb_header_release(skb);
__skb_queue_tail(&sk->sk_write_queue, skb);
sk_charge_skb(sk, skb);
/* Queue it, remembering where we must start sending. */
if (sk->sk_send_head == NULL)
sk->sk_send_head = skb;
}
static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
!(sk->sk_route_caps & NETIF_F_TSO)) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
skb_shinfo(skb)->tso_segs = 1;
skb_shinfo(skb)->tso_size = 0;
} else {
unsigned int factor;
factor = skb->len + (mss_now - 1);
factor /= mss_now;
skb_shinfo(skb)->tso_segs = factor;
}
}
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int nsize, old_factor;
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
if (skb_cloned(skb) &&
skb_is_nonlinear(skb) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
return -ENOMEM;
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
sk_charge_skb(sk, buff);
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
TCP_SKB_CB(buff)->flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
/* Copy and checksum data tail into the new buffer. */
buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
nsize, 0);
skb_trim(skb, len);
skb->csum = csum_block_sub(skb->csum, buff->csum, len);
} else {
skb->ip_summed = CHECKSUM_HW;
skb_split(skb, buff, len);
}
buff->ip_summed = skb->ip_summed;
/* Looks stupid, but our code really uses when of
* skbs, which it never sent before. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
buff->tstamp = skb->tstamp;
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(sk, skb, mss_now);
tcp_set_skb_tso_segs(sk, buff, mss_now);
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
int diff = old_factor - tcp_skb_pcount(skb) -
tcp_skb_pcount(buff);
tp->packets_out -= diff;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= diff;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= diff;
if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
tp->lost_out -= diff;
tp->left_out -= diff;
}
/* Adjust Reno SACK estimate. */
if (!tp->rx_opt.sack_ok) {
tp->sacked_out -= diff;
if ((int)tp->sacked_out < 0)
tp->sacked_out = 0;
tcp_sync_left_out(tp);
}
tp->fackets_out -= diff;
if ((int)tp->fackets_out < 0)
tp->fackets_out = 0;
}
skb_header_release(buff);
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
return 0;
}
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
* eventually). The difference is that pulled data not copied, but
* immediately discarded.
*/
static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
{
int i, k, eat;
eat = len;
k = 0;
for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
if (skb_shinfo(skb)->frags[i].size <= eat) {
put_page(skb_shinfo(skb)->frags[i].page);
eat -= skb_shinfo(skb)->frags[i].size;
} else {
skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
if (eat) {
skb_shinfo(skb)->frags[k].page_offset += eat;
skb_shinfo(skb)->frags[k].size -= eat;
eat = 0;
}
k++;
}
}
skb_shinfo(skb)->nr_frags = k;
skb->tail = skb->data;
skb->data_len -= len;
skb->len = skb->data_len;
return skb->tail;
}
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
if (skb_cloned(skb) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
return -ENOMEM;
if (len <= skb_headlen(skb)) {
__skb_pull(skb, len);
} else {
if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
return -ENOMEM;
}
TCP_SKB_CB(skb)->seq += len;
skb->ip_summed = CHECKSUM_HW;
skb->truesize -= len;
sk->sk_wmem_queued -= len;
sk->sk_forward_alloc += len;
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
/* Any change of skb->len requires recalculation of tso
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
return 0;
}
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
It is minimum of user_mss and mss received with SYN.
inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
tp->mss_cache is current effective sending mss, including
all tcp options except for SACKs. It is evaluated,
taking into account current pmtu, but never exceeds
tp->rx_opt.mss_clamp.
NOTE1. rfc1122 clearly states that advertised MSS
DOES NOT include either tcp or ip options.
NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
are READ ONLY outside this function. --ANK (980731)
*/
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* Calculate base mss without TCP options:
It is MMS_S - sizeof(tcphdr) of rfc1122
*/
int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -

Arnaldo Carvalho de Melo
committed
sizeof(struct tcphdr));
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
/* Now subtract optional transport overhead */
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
mss_now = 48;
/* Now subtract TCP options size, not including SACKs */
mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
/* Bound mss with half of window */
if (tp->max_window && mss_now > (tp->max_window>>1))
mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
return mss_now;
}
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*
* LARGESEND note: !urg_mode is overkill, only frames up to snd_up
* cannot be large. However, taking into account rare use of URG, this
* is not a big flaw.
*/
unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
u16 xmit_size_goal;
int doing_tso = 0;
mss_now = tp->mss_cache;
if (large_allowed &&
(sk->sk_route_caps & NETIF_F_TSO) &&
!tp->urg_mode)
doing_tso = 1;
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
if (tp->rx_opt.eff_sacks)
mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
(tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));

Arnaldo Carvalho de Melo
committed
xmit_size_goal = (65535 -
inet_csk(sk)->icsk_af_ops->net_header_len -
inet_csk(sk)->icsk_ext_hdr_len -
tp->tcp_header_len);
if (tp->max_window &&
(xmit_size_goal > (tp->max_window >> 1)))
xmit_size_goal = max((tp->max_window >> 1),
68U - tp->tcp_header_len);
xmit_size_goal -= (xmit_size_goal % mss_now);
tp->xmit_size_goal = xmit_size_goal;
/* Congestion window validation. (RFC2861) */
static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
{
__u32 packets_out = tp->packets_out;
if (packets_out >= tp->snd_cwnd) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
} else {
/* Network starves. */
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
}
}
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
{
u32 window, cwnd_len;
window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
cwnd_len = mss_now * cwnd;
return min(window, cwnd_len);
}
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
{
u32 in_flight, cwnd;
/* Don't be strict about the congestion window for the final FIN. */
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
return 1;
in_flight = tcp_packets_in_flight(tp);
cwnd = tp->snd_cwnd;
if (in_flight < cwnd)
return (cwnd - in_flight);
return 0;
}
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs ||
(tso_segs > 1 &&
skb_shinfo(skb)->tso_size != mss_now)) {
tcp_set_skb_tso_segs(sk, skb, mss_now);
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
}
static inline int tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml,tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
/* Return 0, if packet can be sent now without violation Nagle's rules:
* 1. It is full sized.
* 2. Or it contains FIN. (already checked by caller)
* 3. Or TCP_NODELAY was set.
* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
* With Minshall's modification: all sent small packets are ACKed.
*/
static inline int tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned mss_now, int nonagle)
{
return (skb->len < mss_now &&
((nonagle&TCP_NAGLE_CORK) ||
(!nonagle &&
tp->packets_out &&
tcp_minshall_check(tp))));
}
/* Return non-zero if the Nagle test allows this packet to be
* sent now.
*/
static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
* write_queue (they have no chances to get new data).
*
* This is implemented in the callers, where they modify the 'nonagle'
* argument based upon the location of SKB in the send queue.
*/
if (nonagle & TCP_NAGLE_PUSH)
return 1;
/* Don't use the nagle rule for urgent data (or for the final FIN). */
if (tp->urg_mode ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
return 1;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
return 1;
return 0;
}
/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (skb->len > cur_mss)
end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
return !after(end_seq, tp->snd_una + tp->snd_wnd);
}
/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
* should be put on the wire right now. If so, it returns the number of
* packets allowed by the congestion window.
*/
static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
unsigned int cur_mss, int nonagle)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
return 0;
cwnd_quota = tcp_cwnd_test(tp, skb);
if (cwnd_quota &&
!tcp_snd_wnd_test(tp, skb, cur_mss))
cwnd_quota = 0;
return cwnd_quota;
}
static inline int tcp_skb_is_last(const struct sock *sk,
const struct sk_buff *skb)
{
return skb->next == (struct sk_buff *)&sk->sk_write_queue;
}
int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
{
struct sk_buff *skb = sk->sk_send_head;
return (skb &&
tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
(tcp_skb_is_last(sk, skb) ?
TCP_NAGLE_PUSH :
tp->nonagle)));
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
* which is put after SKB on the list. It is very much like
* tcp_fragment() except that it may make several kinds of assumptions
* in order to speed up the splitting operation. In particular, we
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
{
struct sk_buff *buff;
int nlen = skb->len - len;
u16 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now);
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
if (unlikely(buff == NULL))
return -ENOMEM;
buff->truesize = nlen;
skb->truesize -= nlen;
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
TCP_SKB_CB(buff)->flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
skb_split(skb, buff, len);
/* Fix up tso_factor for both original and new SKB. */