    /* tcp_ipv4.c (excerpt): IPv4 TCP socket init/destroy hooks and /proc TCP socket listing. */
    
    /*
     * Address-family operations table for TCP over IPv4.
     * tcp_v4_init_sock() installs it as tp->af_specific.
     */
    struct tcp_func ipv4_specific = {
    	.queue_xmit	=	ip_queue_xmit,
    	.send_check	=	tcp_v4_send_check,
    	.rebuild_header	=	tcp_v4_rebuild_header,
    	.conn_request	=	tcp_v4_conn_request,
    	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
    	.remember_stamp	=	tcp_v4_remember_stamp,
    	.net_header_len	=	sizeof(struct iphdr),
    	.setsockopt	=	ip_setsockopt,
    	.getsockopt	=	ip_getsockopt,
    	.addr2sockaddr	=	v4_addr2sockaddr,
    	.sockaddr_len	=	sizeof(struct sockaddr_in),
    };
    
    /* NOTE: A lot of things set to zero explicitly by call to
     *       sk_alloc() so need not be done here.
     */
    static int tcp_v4_init_sock(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	skb_queue_head_init(&tp->out_of_order_queue);
    	tcp_init_xmit_timers(sk);
    	tcp_prequeue_init(tp);
    
    	tp->rto  = TCP_TIMEOUT_INIT;
    	tp->mdev = TCP_TIMEOUT_INIT;
    
    	/* So many TCP implementations out there (incorrectly) count the
    	 * initial SYN frame in their delayed-ACK and congestion control
    	 * algorithms that we must have the following bandaid to talk
    	 * efficiently to them.  -DaveM
    	 */
    	tp->snd_cwnd = 2;
    
    	/* See draft-stevens-tcpca-spec-01 for discussion of the
    	 * initialization of these values.
    	 */
    	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
    	tp->snd_cwnd_clamp = ~0;
    
    	tp->mss_cache = 536;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	tp->reordering = sysctl_tcp_reordering;
    
    	tp->ca_ops = &tcp_init_congestion_ops;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	sk->sk_state = TCP_CLOSE;
    
    	sk->sk_write_space = sk_stream_write_space;
    	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
    
    	tp->af_specific = &ipv4_specific;
    
    	sk->sk_sndbuf = sysctl_tcp_wmem[1];
    	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
    
    	atomic_inc(&tcp_sockets_allocated);
    
    	return 0;
    }
    
    /* Release the TCP-specific resources of @sk: timers, queued skbs, the
     * bind bucket reference and the cached sendmsg page.  Installed as the
     * ->destroy hook of tcp_prot.
     *
     * Always returns 0.
     *
     * NOTE(review): tcp_v4_init_sock() sets tp->ca_ops, but nothing here
     * releases congestion-control state -- confirm whether a cleanup call
     * is expected at this point.
     */
    int tcp_v4_destroy_sock(struct sock *sk)
    {
    	struct tcp_sock *tp = tcp_sk(sk);
    
    	tcp_clear_xmit_timers(sk);
    
    	/* Clean up the write buffer. */
    	sk_stream_writequeue_purge(sk);
    
    	/* Cleans up our, hopefully empty, out_of_order_queue. */
    	__skb_queue_purge(&tp->out_of_order_queue);
    
    	/* Clean prequeue, it must be empty really */
    	__skb_queue_purge(&tp->ucopy.prequeue);
    
    	/* Clean up a referenced TCP bind bucket. */
    	if (tp->bind_hash)
    		tcp_put_port(sk);
    
    	/*
    	 * If sendmsg cached page exists, toss it.
    	 */
    	if (sk->sk_sndmsg_page) {
    		__free_page(sk->sk_sndmsg_page);
    		sk->sk_sndmsg_page = NULL;
    	}
    
    	/* Balances the atomic_inc() in tcp_v4_init_sock(). */
    	atomic_dec(&tcp_sockets_allocated);
    
    	return 0;
    }
    
    EXPORT_SYMBOL(tcp_v4_destroy_sock);
    
    #ifdef CONFIG_PROC_FS
    /* Proc filesystem TCP sock list dumping. */
    
    /* Return the first TIME_WAIT bucket on @head, or NULL if the chain is empty. */
    static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
    {
    	if (hlist_empty(head))
    		return NULL;
    
    	/* head->first is an hlist node, so use the hlist accessor, as tw_next() does. */
    	return hlist_entry(head->first, struct tcp_tw_bucket, tw_node);
    }
    
    /* Return the TIME_WAIT bucket chained after @tw, or NULL at the end of the chain. */
    static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
    {
    	if (!tw->tw_node.next)
    		return NULL;
    
    	return hlist_entry(tw->tw_node.next, typeof(*tw), tw_node);
    }
    
    static void *listening_get_next(struct seq_file *seq, void *cur)
    {
    	struct tcp_sock *tp;
    	struct hlist_node *node;
    	struct sock *sk = cur;
    	struct tcp_iter_state* st = seq->private;
    
    	if (!sk) {
    		st->bucket = 0;
    		sk = sk_head(&tcp_listening_hash[0]);
    		goto get_sk;
    	}
    
    	++st->num;
    
    	if (st->state == TCP_SEQ_STATE_OPENREQ) {
    
    		struct request_sock *req = cur;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	       	tp = tcp_sk(st->syn_wait_sk);
    		req = req->dl_next;
    		while (1) {
    			while (req) {
    
    				if (req->rsk_ops->family == st->family) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    					cur = req;
    					goto out;
    				}
    				req = req->dl_next;
    			}
    			if (++st->sbucket >= TCP_SYNQ_HSIZE)
    				break;
    get_req:
    
    			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    		sk	  = sk_next(st->syn_wait_sk);
    		st->state = TCP_SEQ_STATE_LISTENING;
    
    		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	} else {
    	       	tp = tcp_sk(sk);
    
    		read_lock_bh(&tp->accept_queue.syn_wait_lock);
    		if (reqsk_queue_len(&tp->accept_queue))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			goto start_req;
    
    		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		sk = sk_next(sk);
    	}
    get_sk:
    	sk_for_each_from(sk, node) {
    		if (sk->sk_family == st->family) {
    			cur = sk;
    			goto out;
    		}
    	       	tp = tcp_sk(sk);
    
    		read_lock_bh(&tp->accept_queue.syn_wait_lock);
    		if (reqsk_queue_len(&tp->accept_queue)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    start_req:
    			st->uid		= sock_i_uid(sk);
    			st->syn_wait_sk = sk;
    			st->state	= TCP_SEQ_STATE_OPENREQ;
    			st->sbucket	= 0;
    			goto get_req;
    		}
    
    		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    	if (++st->bucket < TCP_LHTABLE_SIZE) {
    		sk = sk_head(&tcp_listening_hash[st->bucket]);
    		goto get_sk;
    	}
    	cur = NULL;
    out:
    	return cur;
    }
    
    /* Return the listening-table entry at offset *pos, or NULL if the table
     * holds fewer entries.  *pos is decremented once per entry skipped, so on
     * a NULL return it holds the offset still to be consumed by the caller.
     */
    static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
    {
    	void *entry;
    
    	for (entry = listening_get_next(seq, NULL); entry && *pos; --*pos)
    		entry = listening_get_next(seq, entry);
    
    	return entry;
    }
    
    /* Find the first entry of the established hash table whose family matches
     * st->family, scanning each bucket's socket chain and then the chain at
     * index bucket + tcp_ehash_size, which holds tcp_tw_bucket entries.
     *
     * Returns the socket or tcp_tw_bucket found, with st->bucket and
     * st->state describing it and the bucket's read lock still held, or NULL
     * (no lock held) when the whole table has been walked.
     */
    static void *established_get_first(struct seq_file *seq)
    {
    	struct tcp_iter_state* iter = seq->private;
    	void *match = NULL;
    
    	for (iter->bucket = 0; iter->bucket < tcp_ehash_size; ++iter->bucket) {
    		struct sock *sk;
    		struct hlist_node *node;
    		struct tcp_tw_bucket *tw;
    
    		/* We can reschedule _before_ having picked the target: */
    		cond_resched_softirq();
    
    		read_lock(&tcp_ehash[iter->bucket].lock);
    		sk_for_each(sk, node, &tcp_ehash[iter->bucket].chain) {
    			if (sk->sk_family == iter->family) {
    				match = sk;
    				goto out;
    			}
    		}
    
    		iter->state = TCP_SEQ_STATE_TIME_WAIT;
    		tw_for_each(tw, node,
    			    &tcp_ehash[iter->bucket + tcp_ehash_size].chain) {
    			if (tw->tw_family == iter->family) {
    				match = tw;
    				goto out;
    			}
    		}
    
    		/* Nothing in this bucket: drop its lock and reset the state. */
    		read_unlock(&tcp_ehash[iter->bucket].lock);
    		iter->state = TCP_SEQ_STATE_ESTABLISHED;
    	}
    out:
    	return match;
    }
    
    /* Advance the /proc iterator over the established hash table.
     *
     * @cur is the previously returned entry: a socket when st->state is
     * TCP_SEQ_STATE_ESTABLISHED, a tcp_tw_bucket when it is
     * TCP_SEQ_STATE_TIME_WAIT.  The read lock of tcp_ehash[st->bucket] is
     * held on entry.
     *
     * Returns the next entry matching st->family with the lock of its bucket
     * held, or NULL (no lock held) once every bucket has been walked.
     */
    static void *established_get_next(struct seq_file *seq, void *cur)
    {
    	struct sock *sk = cur;
    	struct tcp_tw_bucket *tw;
    	struct hlist_node *node;
    	struct tcp_iter_state* st = seq->private;
    
    	++st->num;
    
    	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
    		tw = cur;
    		tw = tw_next(tw);
    get_tw:
    		/* Also entered from below, with tw at the head of the chain. */
    		while (tw && tw->tw_family != st->family) {
    			tw = tw_next(tw);
    		}
    		if (tw) {
    			cur = tw;
    			goto out;
    		}
    		/* Bucket fully consumed: release it before moving on. */
    		read_unlock(&tcp_ehash[st->bucket].lock);
    		st->state = TCP_SEQ_STATE_ESTABLISHED;
    
    		/* We can reschedule between buckets: */
    		cond_resched_softirq();
    
    		if (++st->bucket < tcp_ehash_size) {
    			read_lock(&tcp_ehash[st->bucket].lock);
    			sk = sk_head(&tcp_ehash[st->bucket].chain);
    		} else {
    			cur = NULL;
    			goto out;
    		}
    	} else
    		sk = sk_next(sk);
    
    	sk_for_each_from(sk, node) {
    		if (sk->sk_family == st->family)
    			goto found;
    	}
    
    	/* Socket chain exhausted: continue with the chain at
    	 * bucket + tcp_ehash_size, read under the same bucket lock.
    	 */
    	st->state = TCP_SEQ_STATE_TIME_WAIT;
    	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
    	goto get_tw;
    found:
    	cur = sk;
    out:
    	return cur;
    }
    
    /* Return the established-table entry at offset @pos, or NULL if the table
     * holds fewer matching entries.
     */
    static void *established_get_idx(struct seq_file *seq, loff_t pos)
    {
    	void *entry;
    
    	for (entry = established_get_first(seq); entry && pos; --pos)
    		entry = established_get_next(seq, entry);
    
    	return entry;
    }
    
    /* Position the iterator on the entry at offset @pos, counting listening
     * entries first and established entries after them.
     *
     * The listen lock is taken first; if the listening table cannot satisfy
     * the offset it is released, bottom halves are disabled and the search
     * continues in the established table with whatever offset remains.
     */
    static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
    {
    	struct tcp_iter_state* iter = seq->private;
    	void *entry;
    
    	tcp_listen_lock();
    	iter->state = TCP_SEQ_STATE_LISTENING;
    	entry = listening_get_idx(seq, &pos);
    	if (entry)
    		return entry;
    
    	tcp_listen_unlock();
    	local_bh_disable();
    	iter->state = TCP_SEQ_STATE_ESTABLISHED;
    	return established_get_idx(seq, pos);
    }
    
    /* seq_file ->start: reset the iterator state, then return SEQ_START_TOKEN
     * for position 0 or the entry at index *pos - 1 otherwise.
     */
    static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
    {
    	struct tcp_iter_state* iter = seq->private;
    
    	iter->state = TCP_SEQ_STATE_LISTENING;
    	iter->num = 0;
    
    	if (!*pos)
    		return SEQ_START_TOKEN;
    
    	return tcp_get_idx(seq, *pos - 1);
    }
    
    /* seq_file ->next: return the entry following @v and bump *pos.
     *
     * When the listening table runs out, the listen lock is dropped, bottom
     * halves are disabled and iteration continues with the established table.
     */
    static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    {
    	struct tcp_iter_state* iter = seq->private;
    	void *next = NULL;
    
    	if (v == SEQ_START_TOKEN) {
    		next = tcp_get_idx(seq, 0);
    	} else {
    		switch (iter->state) {
    		case TCP_SEQ_STATE_OPENREQ:
    		case TCP_SEQ_STATE_LISTENING:
    			next = listening_get_next(seq, v);
    			if (next)
    				break;
    			tcp_listen_unlock();
    			local_bh_disable();
    			iter->state = TCP_SEQ_STATE_ESTABLISHED;
    			next = established_get_first(seq);
    			break;
    		case TCP_SEQ_STATE_ESTABLISHED:
    		case TCP_SEQ_STATE_TIME_WAIT:
    			next = established_get_next(seq, v);
    			break;
    		}
    	}
    
    	++*pos;
    	return next;
    }
    
    /* seq_file ->stop: release whatever locks the iterator still holds for
     * the state it stopped in.
     *
     * @v is the last entry handed out: NULL when iteration ran off the end,
     * SEQ_START_TOKEN when only the header was produced.
     */
    static void tcp_seq_stop(struct seq_file *seq, void *v)
    {
    	struct tcp_iter_state* st = seq->private;
    
    	switch (st->state) {
    	case TCP_SEQ_STATE_OPENREQ:
    		if (v) {
    			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
    
    			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
    		}
    		/* fall through: the listen lock is released below as well */
    	case TCP_SEQ_STATE_LISTENING:
    		/* The listen lock is only taken once past the start token. */
    		if (v != SEQ_START_TOKEN)
    			tcp_listen_unlock();
    		break;
    	case TCP_SEQ_STATE_TIME_WAIT:
    	case TCP_SEQ_STATE_ESTABLISHED:
    		/* A non-NULL entry was returned with its bucket lock held. */
    		if (v)
    			read_unlock(&tcp_ehash[st->bucket].lock);
    		local_bh_enable();
    		break;
    	}
    }
    
    /* ->open for the TCP /proc files: allocate a zeroed tcp_iter_state, fill
     * in its seq_operations from the tcp_seq_afinfo stored in the proc entry
     * and attach it to the seq_file as ->private.
     *
     * Returns 0 on success, -EINVAL if the proc entry carries no afinfo,
     * -ENOMEM if the allocation fails, or the error returned by seq_open().
     */
    static int tcp_seq_open(struct inode *inode, struct file *file)
    {
    	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
    	struct tcp_iter_state *iter;
    	struct seq_file *seq;
    	int err;
    
    	if (unlikely(afinfo == NULL))
    		return -EINVAL;
    
    	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
    	if (!iter)
    		return -ENOMEM;
    	memset(iter, 0, sizeof(*iter));
    
    	iter->family		= afinfo->family;
    	iter->seq_ops.start	= tcp_seq_start;
    	iter->seq_ops.next	= tcp_seq_next;
    	iter->seq_ops.show	= afinfo->seq_show;
    	iter->seq_ops.stop	= tcp_seq_stop;
    
    	err = seq_open(file, &iter->seq_ops);
    	if (err) {
    		/* seq_open() failed, so the state is still ours to free. */
    		kfree(iter);
    		return err;
    	}
    
    	seq	     = file->private_data;
    	seq->private = iter;
    	return 0;
    }
    
    /* Fill in @afinfo->seq_fops with the generic seq_file handlers and create
     * the proc entry named @afinfo->name, storing @afinfo in its ->data.
     *
     * Returns 0 on success, -EINVAL for a NULL @afinfo, or -ENOMEM if the
     * proc entry cannot be created.
     */
    int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
    {
    	struct proc_dir_entry *entry;
    
    	if (!afinfo)
    		return -EINVAL;
    
    	afinfo->seq_fops->owner		= afinfo->owner;
    	afinfo->seq_fops->open		= tcp_seq_open;
    	afinfo->seq_fops->read		= seq_read;
    	afinfo->seq_fops->llseek	= seq_lseek;
    	afinfo->seq_fops->release	= seq_release_private;
    
    	entry = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
    	if (!entry)
    		return -ENOMEM;
    
    	entry->data = afinfo;
    	return 0;
    }
    
    /* Remove the proc entry named @afinfo->name and zero the file_operations
     * that tcp_proc_register() filled in.  A NULL @afinfo is ignored.
     */
    void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
    {
    	if (afinfo) {
    		proc_net_remove(afinfo->name);
    		memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
    	}
    }
    
    
    static void get_openreq4(struct sock *sk, struct request_sock *req,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			 char *tmpbuf, int i, int uid)
    {
    
    	const struct inet_request_sock *ireq = inet_rsk(req);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	int ttd = req->expires - jiffies;
    
    	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
    		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
    		i,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		ntohs(inet_sk(sk)->sport),
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		TCP_SYN_RECV,
    		0, 0, /* could print option size, but that is af dependent. */
    		1,    /* timers active (only the expire timer) */
    		jiffies_to_clock_t(ttd),
    		req->retrans,
    		uid,
    		0,  /* non standard timer */
    		0, /* open_requests have no inode */
    		atomic_read(&sk->sk_refcnt),
    		req);
    }
    
    static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
    {
    	int timer_active;
    	unsigned long timer_expires;
    	struct tcp_sock *tp = tcp_sk(sp);
    	struct inet_sock *inet = inet_sk(sp);
    	unsigned int dest = inet->daddr;
    	unsigned int src = inet->rcv_saddr;
    	__u16 destp = ntohs(inet->dport);
    	__u16 srcp = ntohs(inet->sport);
    
    	if (tp->pending == TCP_TIME_RETRANS) {
    		timer_active	= 1;
    		timer_expires	= tp->timeout;
    	} else if (tp->pending == TCP_TIME_PROBE0) {
    		timer_active	= 4;
    		timer_expires	= tp->timeout;
    	} else if (timer_pending(&sp->sk_timer)) {
    		timer_active	= 2;
    		timer_expires	= sp->sk_timer.expires;
    	} else {
    		timer_active	= 0;
    		timer_expires = jiffies;
    	}
    
    	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
    			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
    		i, src, srcp, dest, destp, sp->sk_state,
    		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
    		timer_active,
    		jiffies_to_clock_t(timer_expires - jiffies),
    		tp->retransmits,
    		sock_i_uid(sp),
    		tp->probes_out,
    		sock_i_ino(sp),
    		atomic_read(&sp->sk_refcnt), sp,
    		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
    		tp->snd_cwnd,
    		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
    }
    
    /* Format one /proc line for the TIME_WAIT bucket @tw into @tmpbuf.
     *
     * @i is the entry number printed in the first column.  The timer column
     * is always 3 and the remaining time is clamped at zero.
     */
    static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
    {
    	int ttd = tw->tw_ttd - jiffies;
    	unsigned int raddr = tw->tw_daddr;
    	unsigned int laddr = tw->tw_rcv_saddr;
    	__u16 rport = ntohs(tw->tw_dport);
    	__u16 lport = ntohs(tw->tw_sport);
    
    	if (ttd < 0)
    		ttd = 0;
    
    	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
    		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
    		i, laddr, lport, raddr, rport, tw->tw_substate, 0, 0,
    		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
    		atomic_read(&tw->tw_refcnt), tw);
    }
    
    #define TMPSZ 150
    
    /* seq_file ->show for the IPv4 TCP table: print the column header for
     * SEQ_START_TOKEN, otherwise format @v according to the iterator state
     * and emit it left-justified in a field of TMPSZ - 1 characters.
     *
     * Always returns 0.
     */
    static int tcp4_seq_show(struct seq_file *seq, void *v)
    {
    	char tmpbuf[TMPSZ + 1];
    	struct tcp_iter_state* iter;
    
    	if (v == SEQ_START_TOKEN) {
    		seq_printf(seq, "%-*s\n", TMPSZ - 1,
    			   "  sl  local_address rem_address   st tx_queue "
    			   "rx_queue tr tm->when retrnsmt   uid  timeout "
    			   "inode");
    		return 0;
    	}
    
    	iter = seq->private;
    	switch (iter->state) {
    	case TCP_SEQ_STATE_LISTENING:
    	case TCP_SEQ_STATE_ESTABLISHED:
    		get_tcp4_sock(v, tmpbuf, iter->num);
    		break;
    	case TCP_SEQ_STATE_OPENREQ:
    		get_openreq4(iter->syn_wait_sk, v, tmpbuf, iter->num, iter->uid);
    		break;
    	case TCP_SEQ_STATE_TIME_WAIT:
    		get_timewait4_sock(v, tmpbuf, iter->num);
    		break;
    	}
    
    	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
    	return 0;
    }
    
    /* Left empty here; tcp_proc_register() fills in the handlers. */
    static struct file_operations tcp4_seq_fops;
    /* Descriptor of the IPv4 "tcp" proc file, passed to tcp_proc_register(). */
    static struct tcp_seq_afinfo tcp4_seq_afinfo = {
    	.owner		= THIS_MODULE,
    	.name		= "tcp",
    	.family		= AF_INET,
    	.seq_show	= tcp4_seq_show,
    	.seq_fops	= &tcp4_seq_fops,
    };
    
    /* Register the IPv4 "tcp" proc file; returns tcp_proc_register()'s result. */
    int __init tcp4_proc_init(void)
    {
    	return tcp_proc_register(&tcp4_seq_afinfo);
    }
    
    /* Unregister the IPv4 "tcp" proc file registered by tcp4_proc_init(). */
    void tcp4_proc_exit(void)
    {
    	tcp_proc_unregister(&tcp4_seq_afinfo);
    }
    #endif /* CONFIG_PROC_FS */
    
    /*
     * Protocol operations table for TCP over IPv4.  The ->init and ->destroy
     * hooks are tcp_v4_init_sock() and tcp_v4_destroy_sock().
     */
    struct proto tcp_prot = {
    	.name			= "TCP",
    	.owner			= THIS_MODULE,
    	.close			= tcp_close,
    	.connect		= tcp_v4_connect,
    	.disconnect		= tcp_disconnect,
    	.accept			= tcp_accept,
    	.ioctl			= tcp_ioctl,
    	.init			= tcp_v4_init_sock,
    	.destroy		= tcp_v4_destroy_sock,
    	.shutdown		= tcp_shutdown,
    	.setsockopt		= tcp_setsockopt,
    	.getsockopt		= tcp_getsockopt,
    	.sendmsg		= tcp_sendmsg,
    	.recvmsg		= tcp_recvmsg,
    	.backlog_rcv		= tcp_v4_do_rcv,
    	.hash			= tcp_v4_hash,
    	.unhash			= tcp_unhash,
    	.get_port		= tcp_v4_get_port,
    	.enter_memory_pressure	= tcp_enter_memory_pressure,
    	.sockets_allocated	= &tcp_sockets_allocated,
    	.memory_allocated	= &tcp_memory_allocated,
    	.memory_pressure	= &tcp_memory_pressure,
    	.sysctl_mem		= sysctl_tcp_mem,
    	.sysctl_wmem		= sysctl_tcp_wmem,
    	.sysctl_rmem		= sysctl_tcp_rmem,
    	.max_header		= MAX_TCP_HEADER,
    	.obj_size		= sizeof(struct tcp_sock),
    	.rsk_prot		= &tcp_request_sock_ops,
    };
    
    
    
    /* Create the kernel-internal TCP control socket (tcp_socket) at boot.
     *
     * Panics if the socket cannot be created.  @ops is not used here.
     */
    void __init tcp_v4_init(struct net_proto_family *ops)
    {
    	struct sock *ctl_sk;
    	int err;
    
    	err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
    	if (err < 0)
    		panic("Failed to create the TCP control socket.\n");
    
    	ctl_sk = tcp_socket->sk;
    	ctl_sk->sk_allocation = GFP_ATOMIC;
    	inet_sk(ctl_sk)->uc_ttl = -1;
    
    	/* Unhash it so that IP input processing does not even
    	 * see it, we do not wish this socket to see incoming
    	 * packets.
    	 */
    	ctl_sk->sk_prot->unhash(ctl_sk);
    }
    
    /* Symbols exported from this file for use by other kernel code. */
    EXPORT_SYMBOL(ipv4_specific);
    EXPORT_SYMBOL(tcp_bind_hash);
    EXPORT_SYMBOL(tcp_bucket_create);
    EXPORT_SYMBOL(tcp_hashinfo);
    EXPORT_SYMBOL(tcp_inherit_port);
    EXPORT_SYMBOL(tcp_listen_wlock);
    EXPORT_SYMBOL(tcp_port_rover);
    EXPORT_SYMBOL(tcp_prot);
    EXPORT_SYMBOL(tcp_put_port);
    EXPORT_SYMBOL(tcp_unhash);
    EXPORT_SYMBOL(tcp_v4_conn_request);
    EXPORT_SYMBOL(tcp_v4_connect);
    EXPORT_SYMBOL(tcp_v4_do_rcv);
    EXPORT_SYMBOL(tcp_v4_rebuild_header);
    EXPORT_SYMBOL(tcp_v4_remember_stamp);
    EXPORT_SYMBOL(tcp_v4_send_check);
    EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
    
    /* The proc registration helpers only exist when procfs is built in. */
    #ifdef CONFIG_PROC_FS
    EXPORT_SYMBOL(tcp_proc_register);
    EXPORT_SYMBOL(tcp_proc_unregister);
    #endif
    EXPORT_SYMBOL(sysctl_local_port_range);
    EXPORT_SYMBOL(sysctl_tcp_low_latency);
    EXPORT_SYMBOL(sysctl_tcp_tw_reuse);