    		sk->sk_backlog_rcv(sk, skb);
    	local_bh_enable();
    
    	/* Clear memory counter. */
    	tp->ucopy.memory = 0;
    }
    
    static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
    {
    	struct sk_buff *skb;
    	u32 offset;
    
    	skb_queue_walk(&sk->sk_receive_queue, skb) {
    		offset = seq - TCP_SKB_CB(skb)->seq;
    		if (skb->h.th->syn)
    			offset--;
    		if (offset < skb->len || skb->h.th->fin) {
    			*off = offset;
    			return skb;
    		}
    	}
    	return NULL;
    }
    
    /*
     * This routine provides an alternative to tcp_recvmsg() for routines
     * that would like to handle copying from skbuffs directly in 'sendfile'
     * fashion.
     * Note:
     *	- It is assumed that the socket was locked by the caller.
     *	- The routine does not block.
     *	- At present, there is no support for reading OOB data
     *	  or for 'peeking' the socket using this routine
     *	  (although both would be easy to implement).
     */
    int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
    		  sk_read_actor_t recv_actor)
    {
    	struct sk_buff *skb;
    	struct tcp_sock *tp = tcp_sk(sk);
    	u32 seq = tp->copied_seq;
    	u32 offset;
    	int copied = 0;
    
    	if (sk->sk_state == TCP_LISTEN)
    		return -ENOTCONN;
    	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
    		if (offset < skb->len) {
    			size_t used, len;
    
    			len = skb->len - offset;
    			/* Stop reading if we hit a patch of urgent data */
    			if (tp->urg_data) {
    				u32 urg_offset = tp->urg_seq - seq;
    				if (urg_offset < len)
    					len = urg_offset;
    				if (!len)
    					break;
    			}
    			used = recv_actor(desc, skb, offset, len);
    			if (used <= len) {
    				seq += used;
    				copied += used;
    				offset += used;
    			}
    			if (offset != skb->len)
    				break;
    		}
		if (skb->h.th->fin) {
			sk_eat_skb(sk, skb, 0);
			++seq;
			break;
		}
    
		sk_eat_skb(sk, skb, 0);
    		if (!desc->count)
    			break;
    	}
    	tp->copied_seq = seq;
    
    	tcp_rcv_space_adjust(sk);
    
    	/* Clean up data we have read: This will do ACK frames. */
	if (copied)
		tcp_cleanup_rbuf(sk, copied);
	return copied;
    }
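/*
 * Illustrative sketch (not part of the original file): the shape of a
 * recv_actor callback as tcp_read_sock() expects it.  The function names
 * and the use of desc->count here are assumptions made for the example;
 * real in-tree callers do more work per chunk.
 */
#if 0	/* example only, kept out of the build */
static int example_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t used = min_t(size_t, len, desc->count);

	/* Consume 'used' bytes of payload starting at 'offset' in skb.
	 * Returning less than 'len' makes tcp_read_sock() stop after this
	 * chunk; driving desc->count to zero also terminates its loop.
	 */
	desc->count -= used;
	return used;
}

static int example_read_available(struct sock *sk, size_t budget)
{
	read_descriptor_t desc = { .count = budget };

	/* The caller must hold the socket lock; tcp_read_sock() does not
	 * block and returns the number of bytes handed to the actor.
	 */
	return tcp_read_sock(sk, &desc, example_recv_actor);
}
#endif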
    
    /*
     *	This routine copies from a sock struct into the user buffer.
     *
     *	Technical note: in 2.3 we work on _locked_ socket, so that
     *	tricks with *seq access order and skb->users are not required.
     *	Probably, code can be easily improved even more.
     */
    
    int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
    		size_t len, int nonblock, int flags, int *addr_len)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    	int copied = 0;
    	u32 peek_seq;
    	u32 *seq;
    	unsigned long used;
    	int err;
    	int target;		/* Read at least this many bytes */
    	long timeo;
    	struct task_struct *user_recv = NULL;
    
    	int copied_early = 0;
    
    
    	lock_sock(sk);
    
    	TCP_CHECK_TIMER(sk);
    
    	err = -ENOTCONN;
    	if (sk->sk_state == TCP_LISTEN)
    		goto out;
    
    	timeo = sock_rcvtimeo(sk, nonblock);
    
    	/* Urgent data needs to be handled specially. */
    	if (flags & MSG_OOB)
    		goto recv_urg;
    
    	seq = &tp->copied_seq;
    	if (flags & MSG_PEEK) {
    		peek_seq = tp->copied_seq;
    		seq = &peek_seq;
    	}
    
    	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
    
    
    #ifdef CONFIG_NET_DMA
    	tp->ucopy.dma_chan = NULL;
    	preempt_disable();
    	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
    	    !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
    		preempt_enable_no_resched();
    		tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
    	} else
    		preempt_enable_no_resched();
    #endif
    
    
    	do {
    		struct sk_buff *skb;
    		u32 offset;
    
    		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
    		if (tp->urg_data && tp->urg_seq == *seq) {
    			if (copied)
    				break;
    			if (signal_pending(current)) {
    				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
    				break;
    			}
    		}
    
    		/* Next get a buffer. */
    
    		skb = skb_peek(&sk->sk_receive_queue);
    		do {
    			if (!skb)
    				break;
    
    			/* Now that we have two receive queues this
    			 * shouldn't happen.
    			 */
    			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
    				printk(KERN_INFO "recvmsg bug: copied %X "
    				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
    				break;
    			}
    			offset = *seq - TCP_SKB_CB(skb)->seq;
    			if (skb->h.th->syn)
    				offset--;
    			if (offset < skb->len)
    				goto found_ok_skb;
    			if (skb->h.th->fin)
    				goto found_fin_ok;
    			BUG_TRAP(flags & MSG_PEEK);
    			skb = skb->next;
    		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
    
		/* Well, if we have backlog, try to process it now. */
    
    		if (copied >= target && !sk->sk_backlog.tail)
    			break;
    
    		if (copied) {
    			if (sk->sk_err ||
    			    sk->sk_state == TCP_CLOSE ||
    			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
    			    !timeo ||
    			    signal_pending(current) ||
    			    (flags & MSG_PEEK))
    				break;
    		} else {
    			if (sock_flag(sk, SOCK_DONE))
    				break;
    
    			if (sk->sk_err) {
    				copied = sock_error(sk);
    				break;
    			}
    
    			if (sk->sk_shutdown & RCV_SHUTDOWN)
    				break;
    
    			if (sk->sk_state == TCP_CLOSE) {
    				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when the user tries to
					 * read from a never-connected socket.
					 */
    					copied = -ENOTCONN;
    					break;
    				}
    				break;
    			}
    
    			if (!timeo) {
    				copied = -EAGAIN;
    				break;
    			}
    
    			if (signal_pending(current)) {
    				copied = sock_intr_errno(timeo);
    				break;
    			}
    		}
    
    
		tcp_cleanup_rbuf(sk, copied);

		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
    			/* Install new reader */
    			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
    				user_recv = current;
    				tp->ucopy.task = user_recv;
    				tp->ucopy.iov = msg->msg_iov;
    			}
    
    			tp->ucopy.len = len;
    
    			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
    				 (flags & (MSG_PEEK | MSG_TRUNC)));
    
			/* Ugly... If the prequeue is not empty, we have to
			 * process it before releasing the socket, otherwise
			 * the order will be broken at the second iteration.
			 * A more elegant solution is required!!!
    			 *
    			 * Look: we have the following (pseudo)queues:
    			 *
    			 * 1. packets in flight
    			 * 2. backlog
    			 * 3. prequeue
    			 * 4. receive_queue
    			 *
    			 * Each queue can be processed only if the next ones
    			 * are empty. At this point we have empty receive_queue.
    			 * But prequeue _can_ be not empty after 2nd iteration,
    			 * when we jumped to start of loop because backlog
    			 * processing added something to receive_queue.
    			 * We cannot release_sock(), because backlog contains
    			 * packets arrived _after_ prequeued ones.
    			 *
			 * In short, the algorithm is clear --- process all
			 * the queues in order. We could do it more directly,
			 * requeueing packets from the backlog to the prequeue
			 * if it is not empty. That is more elegant, but eats
			 * cycles, unfortunately.
    			 */
    
			if (!skb_queue_empty(&tp->ucopy.prequeue))
    				goto do_prequeue;
    
    			/* __ Set realtime policy in scheduler __ */
    		}
    
    		if (copied >= target) {
    			/* Do not sleep, just process backlog. */
    			release_sock(sk);
    			lock_sock(sk);
    		} else
    			sk_wait_data(sk, &timeo);
    
    
    #ifdef CONFIG_NET_DMA
    		tp->ucopy.wakeup = 0;
    #endif
    
    
    		if (user_recv) {
    			int chunk;
    
    			/* __ Restore normal policy in scheduler __ */
    
    			if ((chunk = len - tp->ucopy.len) != 0) {
    				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
    				len -= chunk;
    				copied += chunk;
    			}
    
			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
    do_prequeue:
    				tcp_prequeue_process(sk);
    
    				if ((chunk = len - tp->ucopy.len) != 0) {
    					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
    					len -= chunk;
    					copied += chunk;
    				}
    			}
    		}
    		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
    			if (net_ratelimit())
    				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
    				       current->comm, current->pid);
    			peek_seq = tp->copied_seq;
    		}
    		continue;
    
    	found_ok_skb:
    		/* Ok so how much can we use? */
    		used = skb->len - offset;
    		if (len < used)
    			used = len;
    
    		/* Do we have urgent data here? */
    		if (tp->urg_data) {
    			u32 urg_offset = tp->urg_seq - *seq;
    			if (urg_offset < used) {
    				if (!urg_offset) {
    					if (!sock_flag(sk, SOCK_URGINLINE)) {
    						++*seq;
    						offset++;
    						used--;
    						if (!used)
    							goto skip_copy;
    					}
    				} else
    					used = urg_offset;
    			}
    		}
    
    		if (!(flags & MSG_TRUNC)) {
    
    #ifdef CONFIG_NET_DMA
    			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
    				tp->ucopy.dma_chan = get_softnet_dma();
    
    			if (tp->ucopy.dma_chan) {
    				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
    					tp->ucopy.dma_chan, skb, offset,
    					msg->msg_iov, used,
    					tp->ucopy.pinned_list);
    
    				if (tp->ucopy.dma_cookie < 0) {
    
    					printk(KERN_ALERT "dma_cookie < 0\n");
    
    					/* Exception. Bailout! */
    					if (!copied)
    						copied = -EFAULT;
    					break;
    				}
    				if ((offset + used) == skb->len)
    					copied_early = 1;
    
    			} else
    #endif
    			{
    				err = skb_copy_datagram_iovec(skb, offset,
    						msg->msg_iov, used);
    				if (err) {
    					/* Exception. Bailout! */
    					if (!copied)
    						copied = -EFAULT;
    					break;
    				}
    
    			}
    		}
    
    		*seq += used;
    		copied += used;
    		len -= used;
    
    		tcp_rcv_space_adjust(sk);
    
    skip_copy:
    		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
    			tp->urg_data = 0;
    			tcp_fast_path_check(sk, tp);
    		}
    		if (used + offset < skb->len)
    			continue;
    
    		if (skb->h.th->fin)
    			goto found_fin_ok;
    
    		if (!(flags & MSG_PEEK)) {
    			sk_eat_skb(sk, skb, copied_early);
    			copied_early = 0;
    		}
    
    		continue;
    
    	found_fin_ok:
    		/* Process the FIN. */
    		++*seq;
    
    		if (!(flags & MSG_PEEK)) {
    			sk_eat_skb(sk, skb, copied_early);
    			copied_early = 0;
    		}
    
    		break;
    	} while (len > 0);
    
    	if (user_recv) {
    
    		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
    
    			int chunk;
    
    			tp->ucopy.len = copied > 0 ? len : 0;
    
    			tcp_prequeue_process(sk);
    
    			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
    				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
    				len -= chunk;
    				copied += chunk;
    			}
    		}
    
    		tp->ucopy.task = NULL;
    		tp->ucopy.len = 0;
    	}
    
    
    #ifdef CONFIG_NET_DMA
    	if (tp->ucopy.dma_chan) {
    		struct sk_buff *skb;
    		dma_cookie_t done, used;
    
    		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
    
    		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
    		                                 tp->ucopy.dma_cookie, &done,
    		                                 &used) == DMA_IN_PROGRESS) {
    			/* do partial cleanup of sk_async_wait_queue */
    			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
    			       (dma_async_is_complete(skb->dma_cookie, done,
    			                              used) == DMA_SUCCESS)) {
    				__skb_dequeue(&sk->sk_async_wait_queue);
    				kfree_skb(skb);
    			}
    		}
    
    		/* Safe to free early-copied skbs now */
    		__skb_queue_purge(&sk->sk_async_wait_queue);
    		dma_chan_put(tp->ucopy.dma_chan);
    		tp->ucopy.dma_chan = NULL;
    	}
    	if (tp->ucopy.pinned_list) {
    		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
    		tp->ucopy.pinned_list = NULL;
    	}
    #endif
    
    
    	/* According to UNIX98, msg_name/msg_namelen are ignored
    	 * on connected socket. I was just happy when found this 8) --ANK
    	 */
    
	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

    	TCP_CHECK_TIMER(sk);
    	release_sock(sk);
    	return copied;
    
    out:
    	TCP_CHECK_TIMER(sk);
    	release_sock(sk);
    	return err;
    
    recv_urg:
    	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
    	goto out;
    }
    
    /*
     *	State processing on a close. This implements the state shift for
     *	sending our FIN frame. Note that we only send a FIN for some
     *	states. A shutdown() may have already sent the FIN, or we may be
     *	closed.
     */
    
    
static const unsigned char new_state[16] = {
      /* current state:        new state:      action:	*/
      /* (Invalid)		*/ TCP_CLOSE,
      /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
      /* TCP_SYN_SENT	*/ TCP_CLOSE,
      /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
      /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
      /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
      /* TCP_TIME_WAIT	*/ TCP_CLOSE,
      /* TCP_CLOSE		*/ TCP_CLOSE,
      /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
      /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
      /* TCP_LISTEN		*/ TCP_CLOSE,
      /* TCP_CLOSING	*/ TCP_CLOSING,
    };
    
    static int tcp_close_state(struct sock *sk)
    {
    	int next = (int)new_state[sk->sk_state];
    	int ns = next & TCP_STATE_MASK;
    
    	tcp_set_state(sk, ns);
    
    	return next & TCP_ACTION_FIN;
    }
    
    /*
     *	Shutdown the sending side of a connection. Much like close except
     *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
     */
    
    void tcp_shutdown(struct sock *sk, int how)
    {
    	/*	We need to grab some memory, and put together a FIN,
    	 *	and then put it into the queue to be sent.
    	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
    	 */
    	if (!(how & SEND_SHUTDOWN))
    		return;
    
    	/* If we've already sent a FIN, or it's a closed state, skip this. */
    	if ((1 << sk->sk_state) &
    	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
    	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
    		/* Clear out any half completed packets.  FIN if needed. */
    		if (tcp_close_state(sk))
    			tcp_send_fin(sk);
    	}
    }
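/*
 * Illustrative sketch (not part of the original file): tcp_shutdown()
 * with how == SEND_SHUTDOWN is what userspace reaches via
 * shutdown(fd, SHUT_WR).  The snippet below is plain userspace C, kept
 * out of the build, showing the resulting half-close pattern: our FIN
 * goes out, but we keep reading until the peer is done.
 */
#if 0	/* userspace example only */
#include <sys/socket.h>
#include <unistd.h>

static void half_close_and_drain(int fd)
{
	char buf[4096];

	shutdown(fd, SHUT_WR);			/* queue our FIN */
	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* drain the peer's remaining data */
	close(fd);				/* peer's FIN (or error) seen */
}
#endif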
    
    void tcp_close(struct sock *sk, long timeout)
    {
    	struct sk_buff *skb;
    	int data_was_unread = 0;
    
    	int state;
    
    
    	lock_sock(sk);
    	sk->sk_shutdown = SHUTDOWN_MASK;
    
    	if (sk->sk_state == TCP_LISTEN) {
    		tcp_set_state(sk, TCP_CLOSE);
    
    		/* Special case. */
    
    		inet_csk_listen_stop(sk);
    
    
    		goto adjudge_to_death;
    	}
    
    	/*  We need to flush the recv. buffs.  We do this only on the
    	 *  descriptor close, not protocol-sourced closes, because the
    	 *  reader process may not have drained the data yet!
    	 */
    	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
    		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
    			  skb->h.th->fin;
    		data_was_unread += len;
    		__kfree_skb(skb);
    	}
    
    	sk_stream_mem_reclaim(sk);
    
    	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
    	 * 3.10, we send a RST here because data was lost.  To
    	 * witness the awful effects of the old behavior of always
    	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
    	 * a bulk GET in an FTP client, suspend the process, wait
    	 * for the client to advertise a zero window, then kill -9
    	 * the FTP client, wheee...  Note: timeout is always zero
    	 * in such a case.
    	 */
    	if (data_was_unread) {
    		/* Unread data was tossed, zap the connection. */
    		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
    		tcp_set_state(sk, TCP_CLOSE);
    		tcp_send_active_reset(sk, GFP_KERNEL);
    	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
    		/* Check zero linger _after_ checking for unread data. */
    		sk->sk_prot->disconnect(sk, 0);
    		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
    	} else if (tcp_close_state(sk)) {
    		/* We FIN if the application ate all the data before
    		 * zapping the connection.
    		 */
    
		/* RED-PEN. Formally speaking, we have broken the TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when the FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * E.g. the "RFC state" is ESTABLISHED
		 * if the Linux state is FIN-WAIT-1 but the FIN is still not sent.
		 *
		 * The visible deviations are that we sometimes enter the
		 * time-wait state when it is not really required (harmless),
		 * and do not send active resets when the specs require them
		 * (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when they look like
		 * CLOSING or LAST_ACK to Linux).
		 * Probably, I missed some more holelets.
    		 * 						--ANK
    		 */
    		tcp_send_fin(sk);
    	}
    
    	sk_stream_wait_close(sk, timeout);
    
    adjudge_to_death:
    
    	state = sk->sk_state;
    	sock_hold(sk);
    	sock_orphan(sk);
    	atomic_inc(sk->sk_prot->orphan_count);
    
    
    	/* It is the last release_sock in its life. It will remove backlog. */
    	release_sock(sk);
    
    
    	/* Now socket is owned by kernel and we acquire BH lock
    	   to finish close. No need to check for user refs.
    	 */
    	local_bh_disable();
    	bh_lock_sock(sk);
    	BUG_TRAP(!sock_owned_by_user(sk));
    
    
    	/* Have we already been destroyed by a softirq or backlog? */
    	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
    		goto out;
    
    
	/*	This is a (useful) BSD violation of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left on this
	 *	end. We use a 3 minute timeout (about the same as BSD) and
	 *	then kill our end. If they send after that then tough - BUT:
	 *	long enough that we won't make the old 4*rto = almost no
	 *	time - whoops reset mistake.
	 *
	 *	Nope, it was not a mistake. It is really the desired
	 *	behaviour, e.g. on HTTP servers, where such sockets are
	 *	useless but consume significant resources. Let's do it with
	 *	the special linger2 option.			--ANK
    	 */
    
    	if (sk->sk_state == TCP_FIN_WAIT2) {
    		struct tcp_sock *tp = tcp_sk(sk);
    		if (tp->linger2 < 0) {
    			tcp_set_state(sk, TCP_CLOSE);
    			tcp_send_active_reset(sk, GFP_ATOMIC);
    			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
    		} else {
    
    			const int tmo = tcp_fin_time(sk);
    
    
    			if (tmo > TCP_TIMEWAIT_LEN) {
    
    				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
    
    			} else {
    				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
    				goto out;
    			}
    		}
    	}
    	if (sk->sk_state != TCP_CLOSE) {
    		sk_stream_mem_reclaim(sk);
    
		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
    		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
    		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
    			if (net_ratelimit())
    				printk(KERN_INFO "TCP: too many of orphaned "
    				       "sockets\n");
    			tcp_set_state(sk, TCP_CLOSE);
    			tcp_send_active_reset(sk, GFP_ATOMIC);
    			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
    		}
    	}
    
	if (sk->sk_state == TCP_CLOSE)
		inet_csk_destroy_sock(sk);

    	/* Otherwise, socket is reprieved until protocol close. */
    
    out:
    	bh_unlock_sock(sk);
    	local_bh_enable();
    	sock_put(sk);
    }
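/*
 * Illustrative sketch (not part of the original file): one userspace
 * way of reaching the reset paths in tcp_close() above -- closing with
 * SO_LINGER set to a zero timeout (closing with unread data queued has
 * the same effect).  The helper name is made up for the example.
 */
#if 0	/* userspace example only */
#include <sys/socket.h>
#include <unistd.h>

static void abortive_close(int fd)
{
	/* l_onoff = 1, l_linger = 0: disconnect immediately on close(),
	 * sending a RST instead of going through the FIN handshake.
	 */
	struct linger lin = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
	close(fd);
}
#endif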
    
    /* These states need RST on ABORT according to RFC793 */
    
    static inline int tcp_need_reset(int state)
    {
    	return (1 << state) &
    	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
    		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
    }
    
    int tcp_disconnect(struct sock *sk, int flags)
    {
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
    	struct tcp_sock *tp = tcp_sk(sk);
    	int err = 0;
    	int old_state = sk->sk_state;
    
    	if (old_state != TCP_CLOSE)
    		tcp_set_state(sk, TCP_CLOSE);
    
    	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
    	} else if (tcp_need_reset(old_state) ||
    		   (tp->snd_nxt != tp->write_seq &&
    		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
    
		/* The last check adjusts for discrepancy of Linux wrt. RFC
    		 * states
    		 */
    		tcp_send_active_reset(sk, gfp_any());
    		sk->sk_err = ECONNRESET;
    	} else if (old_state == TCP_SYN_SENT)
    		sk->sk_err = ECONNRESET;
    
    	tcp_clear_xmit_timers(sk);
    	__skb_queue_purge(&sk->sk_receive_queue);
    	sk_stream_writequeue_purge(sk);
    	__skb_queue_purge(&tp->out_of_order_queue);
    
    #ifdef CONFIG_NET_DMA
    	__skb_queue_purge(&sk->sk_async_wait_queue);
    #endif
    
    
    	inet->dport = 0;
    
    	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
    		inet_reset_saddr(sk);
    
    	sk->sk_shutdown = 0;
    	sock_reset_flag(sk, SOCK_DONE);
    	tp->srtt = 0;
    	if ((tp->write_seq += tp->max_window + 2) == 0)
    		tp->write_seq = 1;
    
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
    	tp->packets_out = 0;
    	tp->snd_ssthresh = 0x7fffffff;
    	tp->snd_cwnd_cnt = 0;
    
	tp->bytes_acked = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
    	sk->sk_send_head = NULL;
    	tp->rx_opt.saw_tstamp = 0;
    	tcp_sack_reset(&tp->rx_opt);
    	__sk_dst_reset(sk);
    
    
	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
    
    	sk->sk_error_report(sk);
    	return err;
    }
    
    /*
     *	Socket option code for TCP.
     */
    
    static int do_tcp_setsockopt(struct sock *sk, int level,
    		int optname, char __user *optval, int optlen)
    
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
	struct inet_connection_sock *icsk = inet_csk(sk);
    	int val;
    	int err = 0;
    
    
    	/* This is a string value all the others are int's */
    	if (optname == TCP_CONGESTION) {
    		char name[TCP_CA_NAME_MAX];
    
    		if (optlen < 1)
    			return -EINVAL;
    
    		val = strncpy_from_user(name, optval,
    					min(TCP_CA_NAME_MAX-1, optlen));
    		if (val < 0)
    			return -EFAULT;
    		name[val] = 0;
    
		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}

    	if (optlen < sizeof(int))
    		return -EINVAL;
    
    	if (get_user(val, (int __user *)optval))
    		return -EFAULT;
    
    	lock_sock(sk);
    
    	switch (optname) {
    	case TCP_MAXSEG:
    		/* Values greater than interface MTU won't take effect. However
    		 * at the point when this call is done we typically don't yet
    		 * know which interface is going to be used */
    		if (val < 8 || val > MAX_TCP_WINDOW) {
    			err = -EINVAL;
    			break;
    		}
    		tp->rx_opt.user_mss = val;
    		break;
    
    	case TCP_NODELAY:
    		if (val) {
    			/* TCP_NODELAY is weaker than TCP_CORK, so that
    			 * this option on corked socket is remembered, but
    			 * it is not activated until cork is cleared.
    			 *
    			 * However, when TCP_NODELAY is set we make
    			 * an explicit push, which overrides even TCP_CORK
    			 * for currently queued segments.
    			 */
    			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
    			tcp_push_pending_frames(sk, tp);
    		} else {
    			tp->nonagle &= ~TCP_NAGLE_OFF;
    		}
    		break;
    
    	case TCP_CORK:
    		/* When set indicates to always queue non-full frames.
    		 * Later the user clears this option and we transmit
    		 * any pending partial frames in the queue.  This is
    		 * meant to be used alongside sendfile() to get properly
    		 * filled frames when the user (for example) must write
    		 * out headers with a write() call first and then use
    		 * sendfile to send out the data parts.
    		 *
    		 * TCP_CORK can be set together with TCP_NODELAY and it is
    		 * stronger than TCP_NODELAY.
    		 */
    		if (val) {
    			tp->nonagle |= TCP_NAGLE_CORK;
    		} else {
    			tp->nonagle &= ~TCP_NAGLE_CORK;
    			if (tp->nonagle&TCP_NAGLE_OFF)
    				tp->nonagle |= TCP_NAGLE_PUSH;
    			tcp_push_pending_frames(sk, tp);
    		}
    		break;
    
    	case TCP_KEEPIDLE:
    		if (val < 1 || val > MAX_TCP_KEEPIDLE)
    			err = -EINVAL;
    		else {
    			tp->keepalive_time = val * HZ;
    			if (sock_flag(sk, SOCK_KEEPOPEN) &&
    			    !((1 << sk->sk_state) &
    			      (TCPF_CLOSE | TCPF_LISTEN))) {
    				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
    				if (tp->keepalive_time > elapsed)
    					elapsed = tp->keepalive_time - elapsed;
    				else
    					elapsed = 0;
    
				inet_csk_reset_keepalive_timer(sk, elapsed);
    			}
    		}
    		break;
    	case TCP_KEEPINTVL:
    		if (val < 1 || val > MAX_TCP_KEEPINTVL)
    			err = -EINVAL;
    		else
    			tp->keepalive_intvl = val * HZ;
    		break;
    	case TCP_KEEPCNT:
    		if (val < 1 || val > MAX_TCP_KEEPCNT)
    			err = -EINVAL;
    		else
    			tp->keepalive_probes = val;
    		break;
    	case TCP_SYNCNT:
    		if (val < 1 || val > MAX_TCP_SYNCNT)
    			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
    		break;
    
    	case TCP_LINGER2:
    		if (val < 0)
    			tp->linger2 = -1;
    		else if (val > sysctl_tcp_fin_timeout / HZ)
    			tp->linger2 = 0;
    		else
    			tp->linger2 = val * HZ;
    		break;
    
    	case TCP_DEFER_ACCEPT:
    
		icsk->icsk_accept_queue.rskq_defer_accept = 0;
    		if (val > 0) {
    			/* Translate value in seconds to number of
    			 * retransmits */
    
			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
			       val > ((TCP_TIMEOUT_INIT / HZ) <<
				       icsk->icsk_accept_queue.rskq_defer_accept))
				icsk->icsk_accept_queue.rskq_defer_accept++;
			icsk->icsk_accept_queue.rskq_defer_accept++;
    		}
    		break;
    
    	case TCP_WINDOW_CLAMP:
    		if (!val) {
    			if (sk->sk_state != TCP_CLOSE) {
    				err = -EINVAL;
    				break;
    			}
    			tp->window_clamp = 0;
    		} else
    			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
    						SOCK_MIN_RCVBUF / 2 : val;
    		break;
    
    	case TCP_QUICKACK:
		if (!val) {
			icsk->icsk_ack.pingpong = 1;
		} else {
			icsk->icsk_ack.pingpong = 0;
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				tcp_cleanup_rbuf(sk, 1);
				if (!(val & 1))
					icsk->icsk_ack.pingpong = 1;
    			}
    		}
    		break;
    
    	default:
    		err = -ENOPROTOOPT;
    		break;
    	};
    	release_sock(sk);
    	return err;
    }
    
    
    int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
    		   int optlen)
    {
    	struct inet_connection_sock *icsk = inet_csk(sk);
    
    	if (level != SOL_TCP)
    		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
    						     optval, optlen);
    	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
    }
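/*
 * Illustrative sketch (not part of the original file): how the options
 * handled by do_tcp_setsockopt() above are typically driven from
 * userspace.  The TCP_CORK/sendfile pattern follows the comment in the
 * TCP_CORK case; "cubic" is merely an assumed congestion control name
 * and the helper name is made up for the example.
 */
#if 0	/* userspace example only */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <string.h>
#include <unistd.h>

static void cork_headers_then_body(int fd, const void *hdr, size_t hdrlen,
				   int filefd, size_t filelen)
{
	int one = 1, zero = 0;

	/* Pick a congestion control algorithm (string-valued option). */
	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", strlen("cubic"));

	/* Hold back partial frames while the header is queued... */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &one, sizeof(one));
	write(fd, hdr, hdrlen);
	sendfile(fd, filefd, NULL, filelen);

	/* ...then uncork so any remaining partial frame is pushed out. */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &zero, sizeof(zero));
}
#endif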
    
    #ifdef CONFIG_COMPAT
    
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_setsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

    /* Return information about state of tcp endpoint in API format. */
    void tcp_get_info(struct sock *sk, struct tcp_info *info)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
	const struct inet_connection_sock *icsk = inet_csk(sk);
    	u32 now = tcp_time_stamp;
    
    	memset(info, 0, sizeof(*info));
    
    	info->tcpi_state = sk->sk_state;
    
	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;
    
    	if (tp->rx_opt.tstamp_ok)
    		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
    	if (tp->rx_opt.sack_ok)
    		info->tcpi_options |= TCPI_OPT_SACK;
    	if (tp->rx_opt.wscale_ok) {
    		info->tcpi_options |= TCPI_OPT_WSCALE;
    		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
    		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
    	} 
    
    	if (tp->ecn_flags&TCP_ECN_OK)
    		info->tcpi_options |= TCPI_OPT_ECN;