    /*
     * INET		An implementation of the TCP/IP protocol suite for the LINUX
     *		operating system.  INET is implemented using the  BSD Socket
     *		interface as the means of communication with the user level.
     *
     *		Implementation of the Transmission Control Protocol(TCP).
     *
     * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *		Mark Evans, <evansmp@uhura.aston.ac.uk>
     *		Corey Minyard <wf-rch!minyard@relay.EU.net>
     *		Florian La Roche, <flla@stud.uni-sb.de>
     *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
     *		Linus Torvalds, <torvalds@cs.helsinki.fi>
     *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     *		Matthew Dillon, <dillon@apollo.west.oic.com>
     *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     *		Jorge Cwik, <jorge@laser.satlink.net>
     *
     * Fixes:
     *		Alan Cox	:	Numerous verify_area() calls
     *		Alan Cox	:	Set the ACK bit on a reset
     *		Alan Cox	:	Stopped it crashing if it closed while
     *					sk->inuse=1 and was trying to connect
     *					(tcp_err()).
     *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed were wrong and the
     *					socket was looked up backwards. Nobody
     *					tested any icmp error code obviously.
     *		Alan Cox	:	tcp_err() now handled properly. It
     *					wakes people on errors. poll
     *					behaves and the icmp error race
     *					has gone by moving it into sock.c
     *		Alan Cox	:	tcp_send_reset() fixed to work for
     *					everything not just packets for
     *					unknown sockets.
     *		Alan Cox	:	tcp option processing.
     *		Alan Cox	:	Reset tweaked (still not 100%) [Had
     *					syn rule wrong]
     *		Herp Rosmanith  :	More reset fixes
     *		Alan Cox	:	No longer acks invalid rst frames.
     *					Acking any kind of RST is right out.
     *		Alan Cox	:	Sets an ignore me flag on an rst
     *					receive otherwise odd bits of prattle
     *					escape still
     *		Alan Cox	:	Fixed another acking RST frame bug.
     *					Should stop LAN workplace lockups.
     *		Alan Cox	: 	Some tidyups using the new skb list
     *					facilities
     *		Alan Cox	:	sk->keepopen now seems to work
     *		Alan Cox	:	Pulls options out correctly on accepts
     *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
     *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
     *					bit to skb ops.
     *		Alan Cox	:	Tidied tcp_data to avoid a potential
     *					nasty.
     *		Alan Cox	:	Added some better commenting, as the
     *					tcp is hard to follow
     *		Alan Cox	:	Removed incorrect check for 20 * psh
     *	Michael O'Reilly	:	ack < copied bug fix.
     *	Johannes Stille		:	Misc tcp fixes (not all in yet).
     *		Alan Cox	:	FIN with no memory -> CRASH
     *		Alan Cox	:	Added socket option proto entries.
     *					Also added awareness of them to accept.
     *		Alan Cox	:	Added TCP options (SOL_TCP)
     *		Alan Cox	:	Switched wakeup calls to callbacks,
     *					so the kernel can layer network
     *					sockets.
     *		Alan Cox	:	Use ip_tos/ip_ttl settings.
     *		Alan Cox	:	Handle FIN (more) properly (we hope).
     *		Alan Cox	:	RST frames sent on unsynchronised
     *					state ack error.
     *		Alan Cox	:	Put in missing check for SYN bit.
     *		Alan Cox	:	Added tcp_select_window() aka NET2E
     *					window non shrink trick.
     *		Alan Cox	:	Added a couple of small NET2E timer
     *					fixes
     *		Charles Hedrick :	TCP fixes
     *		Toomas Tamm	:	TCP window fixes
     *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
     *		Charles Hedrick	:	Rewrote most of it to actually work
     *		Linus		:	Rewrote tcp_read() and URG handling
     *					completely
     *		Gerhard Koerting:	Fixed some missing timer handling
     *		Matthew Dillon  :	Reworked TCP machine states as per RFC
     *		Gerhard Koerting:	PC/TCP workarounds
     *		Adam Caldwell	:	Assorted timer/timing errors
     *		Matthew Dillon	:	Fixed another RST bug
     *		Alan Cox	:	Move to kernel side addressing changes.
     *		Alan Cox	:	Beginning work on TCP fastpathing
     *					(not yet usable)
     *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
     *		Alan Cox	:	TCP fast path debugging
     *		Alan Cox	:	Window clamping
     *		Michael Riepe	:	Bug in tcp_check()
     *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
     *					TCP code (Be very nice to this man if
     *					tcp finally works 100%) 8)
     *		Alan Cox	:	BSD accept semantics.
     *		Alan Cox	:	Reset on closedown bug.
     *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
     *		Michael Pall	:	Handle poll() after URG properly in
     *					all cases.
     *		Michael Pall	:	Undo the last fix in tcp_read_urg()
     *					(multi URG PUSH broke rlogin).
     *		Michael Pall	:	Fix the multi URG PUSH problem in
     *					tcp_readable(), poll() after URG
     *					works now.
     *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
     *					BSD api.
     *		Alan Cox	:	Changed the semantics of sk->socket to
     *					fix a race and a signal problem with
     *					accept() and async I/O.
     *		Alan Cox	:	Relaxed the rules on tcp_sendto().
     *		Yury Shevchuk	:	Really fixed accept() blocking problem.
     *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
     *					clients/servers which listen in on
     *					fixed ports.
     *		Alan Cox	:	Cleaned the above up and shrank it to
     *					a sensible code size.
     *		Alan Cox	:	Self connect lockup fix.
     *		Alan Cox	:	No connect to multicast.
     *		Ross Biro	:	Close unaccepted children on master
     *					socket close.
     *		Alan Cox	:	Reset tracing code.
     *		Alan Cox	:	Spurious resets on shutdown.
     *		Alan Cox	:	Giant 15 minute/60 second timer error
     *		Alan Cox	:	Small whoops in polling before an
     *					accept.
     *		Alan Cox	:	Kept the state trace facility since
     *					it's handy for debugging.
     *		Alan Cox	:	More reset handler fixes.
     *		Alan Cox	:	Started rewriting the code based on
     *					the RFC's for other useful protocol
     *					references see: Comer, KA9Q NOS, and
     *					for a reference on the difference
     *					between specifications and how BSD
     *					works see the 4.4lite source.
     *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
     *					close.
     *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
     *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
     *		Alan Cox	:	Reimplemented timers as per the RFC
     *					and using multiple timers for sanity.
     *		Alan Cox	:	Small bug fixes, and a lot of new
     *					comments.
     *		Alan Cox	:	Fixed dual reader crash by locking
     *					the buffers (much like datagram.c)
     *		Alan Cox	:	Fixed stuck sockets in probe. A probe
     *					now gets fed up of retrying without
     *					(even a no space) answer.
     *		Alan Cox	:	Extracted closing code better
     *		Alan Cox	:	Fixed the closing state machine to
     *					resemble the RFC.
     *		Alan Cox	:	More 'per spec' fixes.
     *		Jorge Cwik	:	Even faster checksumming.
     *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
     *					only frames. At least one pc tcp stack
     *					generates them.
     *		Alan Cox	:	Cache last socket.
     *		Alan Cox	:	Per route irtt.
     *		Matt Day	:	poll()->select() match BSD precisely on error
     *		Alan Cox	:	New buffers
     *		Marc Tamsky	:	Various sk->prot->retransmits and
     *					sk->retransmits misupdating fixed.
     *					Fixed tcp_write_timeout: stuck close,
     *					and TCP syn retries gets used now.
     *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
     *					ack if state is TCP_CLOSED.
     *		Alan Cox	:	Look up device on a retransmit - routes may
     *					change. Doesn't yet cope with MSS shrink right
     *					but it's a start!
     *		Marc Tamsky	:	Closing in closing fixes.
     *		Mike Shaver	:	RFC1122 verifications.
     *		Alan Cox	:	rcv_saddr errors.
     *		Alan Cox	:	Block double connect().
     *		Alan Cox	:	Small hooks for enSKIP.
     *		Alexey Kuznetsov:	Path MTU discovery.
     *		Alan Cox	:	Support soft errors.
     *		Alan Cox	:	Fix MTU discovery pathological case
     *					when the remote claims no mtu!
     *		Marc Tamsky	:	TCP_CLOSE fix.
     *		Colin (G3TNE)	:	Send a reset on syn ack replies in
     *					window but wrong (fixes NT lpd problems)
     *		Pedro Roque	:	Better TCP window handling, delayed ack.
     *		Joerg Reuter	:	No modification of locked buffers in
     *					tcp_do_retransmit()
     *		Eric Schenk	:	Changed receiver side silly window
     *					avoidance algorithm to BSD style
     *					algorithm. This doubles throughput
     *					against machines running Solaris,
     *					and seems to result in general
     *					improvement.
     *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
     *	Willy Konynenberg	:	Transparent proxying support.
     *	Mike McLagan		:	Routing by source
     *		Keith Owens	:	Do proper merging with partial SKB's in
     *					tcp_do_sendmsg to avoid burstiness.
     *		Eric Schenk	:	Fix fast close down bug with
     *					shutdown() followed by close().
     *		Andi Kleen 	:	Make poll agree with SIGIO
     *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
     *					lingertime == 0 (RFC 793 ABORT Call)
     *	Hirokazu Takahashi	:	Use copy_from_user() instead of
     *					csum_and_copy_from_user() if possible.
     *
     *		This program is free software; you can redistribute it and/or
     *		modify it under the terms of the GNU General Public License
     *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
     *
     * Description of States:
     *
     *	TCP_SYN_SENT		sent a connection request, waiting for ack
     *
     *	TCP_SYN_RECV		received a connection request, sent ack,
     *				waiting for final ack in three-way handshake.
     *
     *	TCP_ESTABLISHED		connection established
     *
     *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
     *				transmission of remaining buffered data
     *
     *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
     *				to shutdown
     *
     *	TCP_CLOSING		both sides have shutdown but we still have
     *				data we have to finish sending
     *
     *	TCP_TIME_WAIT		timeout to catch resent junk before entering
     *				closed, can only be entered from FIN_WAIT2
     *				or CLOSING.  Required because the other end
     *				may not have gotten our last ACK causing it
     *				to retransmit the data packet (which we ignore)
     *
     *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
     *				us to finish writing our data and to shutdown
     *				(we have to close() to move on to LAST_ACK)
     *
 *	TCP_LAST_ACK		our side has shutdown after remote has
     *				shutdown.  There may still be data in our
     *				buffer that we have to finish sending
     *
     *	TCP_CLOSE		socket is finished
     */
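
/*
 * Informal sketch of the common paths through these states (per
 * RFC 793; not an exhaustive transition diagram):
 *
 *	active close:	ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 ->
 *			TIME_WAIT -> CLOSE
 *	passive close:	ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *	simultaneous:	ESTABLISHED -> FIN_WAIT1 -> CLOSING ->
 *			TIME_WAIT -> CLOSE
 */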
    
    #include <linux/config.h>
    #include <linux/module.h>
    #include <linux/types.h>
    #include <linux/fcntl.h>
    #include <linux/poll.h>
    #include <linux/init.h>
    #include <linux/smp_lock.h>
    #include <linux/fs.h>
    #include <linux/random.h>
    #include <linux/bootmem.h>
    
    #include <net/icmp.h>
    #include <net/tcp.h>
    #include <net/xfrm.h>
    #include <net/ip.h>
    
    
    #include <asm/uaccess.h>
    #include <asm/ioctls.h>
    
    int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
    
    DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
    
    kmem_cache_t *tcp_bucket_cachep;
    kmem_cache_t *tcp_timewait_cachep;
    
    atomic_t tcp_orphan_count = ATOMIC_INIT(0);
    
    int sysctl_tcp_mem[3];
    int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
    int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
    
    EXPORT_SYMBOL(sysctl_tcp_mem);
    EXPORT_SYMBOL(sysctl_tcp_rmem);
    EXPORT_SYMBOL(sysctl_tcp_wmem);
    
    atomic_t tcp_memory_allocated;	/* Current allocated memory. */
    atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
    
    EXPORT_SYMBOL(tcp_memory_allocated);
    EXPORT_SYMBOL(tcp_sockets_allocated);
    
    /*
     * Pressure flag: try to collapse.
     * Technical note: it is used by multiple contexts non atomically.
 * All of sk_stream_mem_schedule() is of this nature: the accounting
 * is strict, actions are advisory and have some latency.
     */
    int tcp_memory_pressure;
    
    EXPORT_SYMBOL(tcp_memory_pressure);
    
    void tcp_enter_memory_pressure(void)
    {
    	if (!tcp_memory_pressure) {
    		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
    		tcp_memory_pressure = 1;
    	}
    }
    
    EXPORT_SYMBOL(tcp_enter_memory_pressure);
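
/* The pressure flag is advisory: it is set here and cleared again by
 * the generic stream memory accounting (see sk_stream_mem_reclaim())
 * once allocated TCP memory drops back below sysctl_tcp_mem[0].
 */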
    
    /*
 * LISTEN is a special case for poll().
     */
    static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
    					       poll_table *wait)
{
	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ?
	       (POLLIN | POLLRDNORM) : 0;
}
    
    /*
     *	Wait for a TCP event.
     *
     *	Note that we don't need to lock the socket, as the upper poll layers
     *	take care of normal races (between the test and the event) and we don't
     *	go look at any of the socket buffers directly.
     */
    unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
    {
    	unsigned int mask;
    	struct sock *sk = sock->sk;
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	poll_wait(file, sk->sk_sleep, wait);
    	if (sk->sk_state == TCP_LISTEN)
    		return tcp_listen_poll(sk, wait);
    
    	/* Socket is not locked. We are protected from async events
    	   by poll logic and correct handling of state changes
	   made by other threads is impossible in any case.
    	 */
    
    	mask = 0;
    	if (sk->sk_err)
    		mask = POLLERR;
    
    	/*
    	 * POLLHUP is certainly not done right. But poll() doesn't
    	 * have a notion of HUP in just one direction, and for a
    	 * socket the read side is more interesting.
    	 *
    	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWRNORM flags, so somebody should check this
    	 * all. But careful, it tends to be safer to return too many
    	 * bits than too few, and you can easily break real applications
    	 * if you don't tell them that something has hung up!
    	 *
    	 * Check-me.
    	 *
	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making it impossible to poll()
	 * for write in state CLOSE_WAIT. One solution is evident --- to set
	 * POLLHUP if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look at how Solaris and DUX
	 * solve this dilemma. I would prefer POLLHUP to be maskable; then
	 * we could set it on SND_SHUTDOWN. BTW the examples given in
	 * Stevens' books assume exactly this behaviour, which explains
	 * why POLLHUP is incompatible with POLLOUT.	--ANK
    	 *
    	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
    	 * blocking on fresh not-connected or disconnected socket. --ANK
    	 */
    	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
    		mask |= POLLHUP;
    	if (sk->sk_shutdown & RCV_SHUTDOWN)
    		mask |= POLLIN | POLLRDNORM;
    
    	/* Connected? */
    	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition. If the read of tp below is
		 * reordered above the read of sk->sk_state, we can be
		 * illegally awakened in SYN_* states. */
    		if ((tp->rcv_nxt != tp->copied_seq) &&
    		    (tp->urg_seq != tp->copied_seq ||
    		     tp->rcv_nxt != tp->copied_seq + 1 ||
    		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
    			mask |= POLLIN | POLLRDNORM;
    
    		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
    			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
    				mask |= POLLOUT | POLLWRNORM;
    			} else {  /* send SIGIO later */
    				set_bit(SOCK_ASYNC_NOSPACE,
    					&sk->sk_socket->flags);
    				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
    
    				/* Race breaker. If space is freed after
    				 * wspace test but before the flags are set,
    				 * IO signal will be lost.
    				 */
    				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
    					mask |= POLLOUT | POLLWRNORM;
    			}
    		}
    
    		if (tp->urg_data & TCP_URG_VALID)
    			mask |= POLLPRI;
    	}
    	return mask;
    }
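
/*
 * Userspace view of the above (illustrative sketch only, not part of
 * this file): a reader distinguishes normal data, urgent data and
 * hangup from the returned mask:
 *
 *	struct pollfd pfd = { .fd = sock_fd, .events = POLLIN | POLLPRI };
 *
 *	if (poll(&pfd, 1, timeout_ms) > 0) {
 *		if (pfd.revents & POLLPRI)
 *			;	// urgent data pending (see tcp_recv_urg)
 *		if (pfd.revents & (POLLIN | POLLRDNORM))
 *			;	// read() will not block
 *		if (pfd.revents & POLLHUP)
 *			;	// both directions are shut down
 *	}
 */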
    
    int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int answ;
    
    	switch (cmd) {
    	case SIOCINQ:
    		if (sk->sk_state == TCP_LISTEN)
    			return -EINVAL;
    
    		lock_sock(sk);
    		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
    			answ = 0;
    		else if (sock_flag(sk, SOCK_URGINLINE) ||
    			 !tp->urg_data ||
    			 before(tp->urg_seq, tp->copied_seq) ||
    			 !before(tp->urg_seq, tp->rcv_nxt)) {
    			answ = tp->rcv_nxt - tp->copied_seq;
    
    			/* Subtract 1, if FIN is in queue. */
    			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
    				answ -=
    		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
    		} else
    			answ = tp->urg_seq - tp->copied_seq;
    		release_sock(sk);
    		break;
    	case SIOCATMARK:
    		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
    		break;
    	case SIOCOUTQ:
    		if (sk->sk_state == TCP_LISTEN)
    			return -EINVAL;
    
    		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
    			answ = 0;
    		else
    			answ = tp->write_seq - tp->snd_una;
    		break;
    	default:
    		return -ENOIOCTLCMD;
	}
    
    	return put_user(answ, (int __user *)arg);
    }
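
/*
 * For reference (illustrative, userspace side): SIOCINQ reports bytes
 * readable without blocking, SIOCOUTQ bytes written but not yet acked
 * by the peer (write_seq - snd_una), and SIOCATMARK whether the next
 * read starts at the urgent-data mark:
 *
 *	int n;
 *	ioctl(sock_fd, SIOCINQ, &n);
 *	ioctl(sock_fd, SIOCOUTQ, &n);
 *	ioctl(sock_fd, SIOCATMARK, &n);
 */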
    
    
    int tcp_listen_start(struct sock *sk)
    {
    	struct inet_sock *inet = inet_sk(sk);
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
    
    	if (rc != 0)
		return rc;

    	sk->sk_max_ack_backlog = 0;
    	sk->sk_ack_backlog = 0;
    	tcp_delack_init(tp);
    
    	/* There is race window here: we announce ourselves listening,
    	 * but this transition is still not validated by get_port().
    	 * It is OK, because this socket enters to hash table only
    	 * after validation is complete.
    	 */
    	sk->sk_state = TCP_LISTEN;
    	if (!sk->sk_prot->get_port(sk, inet->num)) {
    		inet->sport = htons(inet->num);
    
    		sk_dst_reset(sk);
    		sk->sk_prot->hash(sk);
    
    		return 0;
    	}
    
    	sk->sk_state = TCP_CLOSE;
    
    	reqsk_queue_destroy(&tp->accept_queue);
    
    	return -EADDRINUSE;
    }
    
    /*
     *	This routine closes sockets which have been at least partially
     *	opened, but not yet accepted.
     */
    
    static void tcp_listen_stop (struct sock *sk)
    {
	struct tcp_sock *tp = tcp_sk(sk);
	struct listen_sock *lopt;
	struct request_sock *acc_req;
	struct request_sock *req;
	int i;
    
    	tcp_delete_keepalive_timer(sk);
    
	/* make all the listen_opt local to us */
	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);

    	if (lopt->qlen) {
    		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
    			while ((req = lopt->syn_table[i]) != NULL) {
    				lopt->syn_table[i] = req->dl_next;
				lopt->qlen--;
				reqsk_free(req);

    		/* Following specs, it would be better either to send FIN
    		 * (and enter FIN-WAIT-1, it is normal close)
    		 * or to send active reset (abort).
    		 * Certainly, it is pretty dangerous while synflood, but it is
    		 * bad justification for our negligence 8)
    		 * To be honest, we are not able to make either
    		 * of the variants now.			--ANK
    		 */
    			}
    		}
    	}
    	BUG_TRAP(!lopt->qlen);
    
    	kfree(lopt);
    
    	while ((req = acc_req) != NULL) {
    		struct sock *child = req->sk;
    
    		acc_req = req->dl_next;
    
    		local_bh_disable();
    		bh_lock_sock(child);
    		BUG_TRAP(!sock_owned_by_user(child));
    		sock_hold(child);
    
    		tcp_disconnect(child, O_NONBLOCK);
    
    		sock_orphan(child);
    
    		atomic_inc(&tcp_orphan_count);
    
    		tcp_destroy_sock(child);
    
    		bh_unlock_sock(child);
    		local_bh_enable();
    		sock_put(child);
    
    		sk_acceptq_removed(sk);
    
    	}
    	BUG_TRAP(!sk->sk_ack_backlog);
    }
    
    static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
    {
    	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
    	tp->pushed_seq = tp->write_seq;
    }
    
/* Has the write side moved more than half of the largest window the
 * peer has ever advertised past the last PSH mark?  If so, force a
 * push so the receiver is not left sitting on buffered data.
 */
static inline int forced_push(struct tcp_sock *tp)
    {
    	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
    }
    
/* Append a fresh skb to the tail of the write queue and charge it
 * against the socket's send-buffer accounting.
 */
static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
			      struct sk_buff *skb)
    {
    	skb->csum = 0;
    	TCP_SKB_CB(skb)->seq = tp->write_seq;
    	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
    	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
    	TCP_SKB_CB(skb)->sacked = 0;
    	skb_header_release(skb);
    	__skb_queue_tail(&sk->sk_write_queue, skb);
    	sk_charge_skb(sk, skb);
    	if (!sk->sk_send_head)
    		sk->sk_send_head = skb;
    	else if (tp->nonagle&TCP_NAGLE_PUSH)
    		tp->nonagle &= ~TCP_NAGLE_PUSH; 
    }
    
    static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
    				struct sk_buff *skb)
    {
    	if (flags & MSG_OOB) {
    		tp->urg_mode = 1;
    		tp->snd_up = tp->write_seq;
    		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
    	}
    }
    
/* Flush queued data.  MSG_MORE from the caller is treated as corking:
 * the last partial frame is held back until the sender indicates it
 * has no more data to add.
 */
static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
			    int mss_now, int nonagle)
    {
    	if (sk->sk_send_head) {
    		struct sk_buff *skb = sk->sk_write_queue.prev;
    		if (!(flags & MSG_MORE) || forced_push(tp))
    			tcp_mark_push(tp, skb);
    		tcp_mark_urg(tp, flags, skb);
    		__tcp_push_pending_frames(sk, tp, mss_now,
    					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
    	}
    }
    
/* Queue up to @psize bytes described by @pages/@poffset for transmit,
 * coalescing into the tail skb's last page fragment when possible and
 * pushing out full segments as they reach mss_now.
 */
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
				size_t psize, int flags)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int mss_now;
    	int err;
    	ssize_t copied;
    	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
    
    	/* Wait for a connection to finish. */
    	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
    		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
    			goto out_err;
    
    	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
    
    	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
    	copied = 0;
    
    	err = -EPIPE;
    	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
    		goto do_error;
    
    	while (psize > 0) {
    		struct sk_buff *skb = sk->sk_write_queue.prev;
    		struct page *page = pages[poffset / PAGE_SIZE];
    		int copy, i, can_coalesce;
    		int offset = poffset % PAGE_SIZE;
    		int size = min_t(size_t, psize, PAGE_SIZE - offset);
    
    		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
    new_segment:
    			if (!sk_stream_memory_free(sk))
    				goto wait_for_sndbuf;
    
    			skb = sk_stream_alloc_pskb(sk, 0, 0,
    						   sk->sk_allocation);
    			if (!skb)
    				goto wait_for_memory;
    
    			skb_entail(sk, tp, skb);
    			copy = mss_now;
    		}
    
    		if (copy > size)
    			copy = size;
    
    		i = skb_shinfo(skb)->nr_frags;
    		can_coalesce = skb_can_coalesce(skb, i, page, offset);
    		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
    			tcp_mark_push(tp, skb);
    			goto new_segment;
    		}
    		if (sk->sk_forward_alloc < copy &&
    		    !sk_stream_mem_schedule(sk, copy, 0))
    			goto wait_for_memory;
    		
    		if (can_coalesce) {
    			skb_shinfo(skb)->frags[i - 1].size += copy;
    		} else {
    			get_page(page);
    			skb_fill_page_desc(skb, i, page, offset, copy);
    		}
    
    		skb->len += copy;
    		skb->data_len += copy;
    		skb->truesize += copy;
    		sk->sk_wmem_queued += copy;
    		sk->sk_forward_alloc -= copy;
    		skb->ip_summed = CHECKSUM_HW;
    		tp->write_seq += copy;
    		TCP_SKB_CB(skb)->end_seq += copy;
    		skb_shinfo(skb)->tso_segs = 0;
    
    		if (!copied)
    			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
    
    		copied += copy;
    		poffset += copy;
    		if (!(psize -= copy))
    			goto out;
    
    		if (skb->len != mss_now || (flags & MSG_OOB))
    			continue;
    
    		if (forced_push(tp)) {
    			tcp_mark_push(tp, skb);
    			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
    		} else if (skb == sk->sk_send_head)
    			tcp_push_one(sk, mss_now);
    		continue;
    
    wait_for_sndbuf:
    		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
    wait_for_memory:
    		if (copied)
    			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
    
    		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
    			goto do_error;
    
    		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
    	}
    
    out:
    	if (copied)
    		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
    	return copied;
    
    do_error:
    	if (copied)
    		goto out;
    out_err:
    	return sk_stream_error(sk, flags, err);
    }
    
    ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
    		     size_t size, int flags)
    {
    	ssize_t res;
    	struct sock *sk = sock->sk;
    
    #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
    
    	if (!(sk->sk_route_caps & NETIF_F_SG) ||
    	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
    		return sock_no_sendpage(sock, page, offset, size, flags);
    
    #undef TCP_ZC_CSUM_FLAGS
    
    	lock_sock(sk);
    	TCP_CHECK_TIMER(sk);
    	res = do_tcp_sendpages(sk, &page, offset, size, flags);
    	TCP_CHECK_TIMER(sk);
    	release_sock(sk);
    	return res;
    }
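
/*
 * Note: tcp_sendpage() is the socket's ->sendpage() op, reached e.g.
 * via sendfile(2).  The zero-copy path requires a route whose device
 * can do scatter-gather and checksum the paged data itself (or needs
 * no checksum at all); otherwise sock_no_sendpage() falls back to an
 * ordinary copy that can be checksummed on the CPU.
 */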
    
    #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
    #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
    
    static inline int select_size(struct sock *sk, struct tcp_sock *tp)
    {
	int tmp = tp->mss_cache_std;

	/* With a scatter-gather capable route the data can live entirely
	 * in page fragments, so no linear payload area is needed. */
	if (sk->sk_route_caps & NETIF_F_SG)
		tmp = 0;

    	return tmp;
    }
    
    int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
    		size_t size)
    {
    	struct iovec *iov;
    	struct tcp_sock *tp = tcp_sk(sk);
    	struct sk_buff *skb;
    	int iovlen, flags;
    	int mss_now;
    	int err, copied;
    	long timeo;
    
    	lock_sock(sk);
    	TCP_CHECK_TIMER(sk);
    
    	flags = msg->msg_flags;
    	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
    
    	/* Wait for a connection to finish. */
    	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
    		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
    			goto out_err;
    
    	/* This should be in poll */
    	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
    
    	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
    
    	/* Ok commence sending. */
    	iovlen = msg->msg_iovlen;
    	iov = msg->msg_iov;
    	copied = 0;
    
    	err = -EPIPE;
    	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
    		goto do_error;
    
    	while (--iovlen >= 0) {
    		int seglen = iov->iov_len;
    		unsigned char __user *from = iov->iov_base;
    
    		iov++;
    
    		while (seglen > 0) {
    			int copy;
    
    			skb = sk->sk_write_queue.prev;
    
    			if (!sk->sk_send_head ||
    			    (copy = mss_now - skb->len) <= 0) {
    
    new_segment:
    				/* Allocate new segment. If the interface is SG,
    				 * allocate skb fitting to single page.
    				 */
    				if (!sk_stream_memory_free(sk))
    					goto wait_for_sndbuf;
    
    				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
    							   0, sk->sk_allocation);
    				if (!skb)
    					goto wait_for_memory;
    
    				/*
    				 * Check whether we can use HW checksum.
    				 */
    				if (sk->sk_route_caps &
    				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
    				     NETIF_F_HW_CSUM))
    					skb->ip_summed = CHECKSUM_HW;
    
    				skb_entail(sk, tp, skb);
    				copy = mss_now;
    			}
    
    			/* Try to append data to the end of skb. */
    			if (copy > seglen)
    				copy = seglen;
    
    			/* Where to copy to? */
    			if (skb_tailroom(skb) > 0) {
    				/* We have some space in skb head. Superb! */
    				if (copy > skb_tailroom(skb))
    					copy = skb_tailroom(skb);
    				if ((err = skb_add_data(skb, from, copy)) != 0)
    					goto do_fault;
    			} else {
    				int merge = 0;
    				int i = skb_shinfo(skb)->nr_frags;
    				struct page *page = TCP_PAGE(sk);
    				int off = TCP_OFF(sk);
    
    				if (skb_can_coalesce(skb, i, page, off) &&
    				    off != PAGE_SIZE) {
    					/* We can extend the last page
    					 * fragment. */
    					merge = 1;
    				} else if (i == MAX_SKB_FRAGS ||
    					   (!i &&
    					   !(sk->sk_route_caps & NETIF_F_SG))) {
    					/* Need to add new fragment and cannot
    					 * do this because interface is non-SG,
    					 * or because all the page slots are
    					 * busy. */
    					tcp_mark_push(tp, skb);
    					goto new_segment;
    				} else if (page) {
    					if (off == PAGE_SIZE) {
    						put_page(page);
    						TCP_PAGE(sk) = page = NULL;
    					}
    				}
    
    				if (!page) {
    					/* Allocate new cache page. */
    					if (!(page = sk_stream_alloc_page(sk)))
    						goto wait_for_memory;
    					off = 0;
    				}
    
    				if (copy > PAGE_SIZE - off)
    					copy = PAGE_SIZE - off;
    
    				/* Time to copy data. We are close to
    				 * the end! */
    				err = skb_copy_to_page(sk, from, skb, page,
    						       off, copy);
    				if (err) {
    					/* If this page was new, give it to the
    					 * socket so it does not get leaked.
    					 */
    					if (!TCP_PAGE(sk)) {
    						TCP_PAGE(sk) = page;
    						TCP_OFF(sk) = 0;
    					}
    					goto do_error;
    				}
    
    				/* Update the skb. */
    				if (merge) {
    					skb_shinfo(skb)->frags[i - 1].size +=
    									copy;
    				} else {
    					skb_fill_page_desc(skb, i, page, off, copy);
    					if (TCP_PAGE(sk)) {
    						get_page(page);
    					} else if (off + copy < PAGE_SIZE) {
    						get_page(page);
    						TCP_PAGE(sk) = page;
    					}
    				}
    
    				TCP_OFF(sk) = off + copy;
    			}
    
    			if (!copied)
    				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
    
    			tp->write_seq += copy;
    			TCP_SKB_CB(skb)->end_seq += copy;
    			skb_shinfo(skb)->tso_segs = 0;
    
    			from += copy;
    			copied += copy;
    			if ((seglen -= copy) == 0 && iovlen == 0)
    				goto out;
    
    			if (skb->len != mss_now || (flags & MSG_OOB))
    				continue;
    
    			if (forced_push(tp)) {
    				tcp_mark_push(tp, skb);
    				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
    			} else if (skb == sk->sk_send_head)
    				tcp_push_one(sk, mss_now);
    			continue;
    
    wait_for_sndbuf:
    			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
    wait_for_memory:
    			if (copied)
    				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
    
    			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
    				goto do_error;
    
    			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
    		}
    	}
    
    out:
    	if (copied)
    		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
    	TCP_CHECK_TIMER(sk);
    	release_sock(sk);
    	return copied;
    
    do_fault:
    	if (!skb->len) {
    		if (sk->sk_send_head == skb)
    			sk->sk_send_head = NULL;
    		__skb_unlink(skb, skb->list);
    		sk_stream_free_skb(sk, skb);
    	}
    
    do_error:
    	if (copied)
    		goto out;
    out_err:
    	err = sk_stream_error(sk, flags, err);
    	TCP_CHECK_TIMER(sk);
    	release_sock(sk);
    	return err;
    }
    
    /*
     *	Handle reading urgent data. BSD has very simple semantics for
     *	this, no blocking and very strange errors 8)
     */
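
/* Userspace sketch (illustrative only): with SO_OOBINLINE clear, the
 * urgent byte is read out of band and the call never blocks:
 *
 *	char c;
 *	recv(sock_fd, &c, 1, MSG_OOB);
 *
 * With SO_OOBINLINE set, this path returns -EINVAL and the byte is
 * instead delivered in the normal data stream.
 */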
    
    static int tcp_recv_urg(struct sock *sk, long timeo,
    			struct msghdr *msg, int len, int flags,
    			int *addr_len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	/* No URG data to read. */
    	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
    	    tp->urg_data == TCP_URG_READ)
    		return -EINVAL;	/* Yes this is right ! */
    
    	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
    		return -ENOTCONN;
    
    	if (tp->urg_data & TCP_URG_VALID) {
    		int err = 0;
    		char c = tp->urg_data;