Skip to content
Snippets Groups Projects
socket.c 49 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * NET		An implementation of the SOCKET network access protocol.
     *
     * Version:	@(#)socket.c	1.1.93	18/02/95
     *
     * Authors:	Orest Zborowski, <obz@Kodak.COM>
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *
     * Fixes:
     *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
     *					shutdown()
     *		Alan Cox	:	verify_area() fixes
     *		Alan Cox	:	Removed DDI
     *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
     *		Alan Cox	:	Moved a load of checks to the very
     *					top level.
     *		Alan Cox	:	Move address structures to/from user
     *					mode above the protocol layers.
     *		Rob Janssen	:	Allow 0 length sends.
     *		Alan Cox	:	Asynchronous I/O support (cribbed from the
     *					tty drivers).
     *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
     *		Jeff Uphoff	:	Made max number of sockets command-line
     *					configurable.
     *		Matti Aarnio	:	Made the number of sockets dynamic,
     *					to be allocated when needed, and mr.
     *					Uphoff's max is used as max to be
     *					allowed to allocate.
     *		Linus		:	Argh. removed all the socket allocation
     *					altogether: it's in the inode now.
     *		Alan Cox	:	Made sock_alloc()/sock_release() public
     *					for NetROM and future kernel nfsd type
     *					stuff.
     *		Alan Cox	:	sendmsg/recvmsg basics.
     *		Tom Dyas	:	Export net symbols.
     *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
     *		Alan Cox	:	Added thread locking to sys_* calls
     *					for sockets. May have errors at the
     *					moment.
     *		Kevin Buhr	:	Fixed the dumb errors in the above.
     *		Andi Kleen	:	Some small cleanups, optimizations,
     *					and fixed a copy_from_user() bug.
     *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
     *		Tigran Aivazian	:	Made listen(2) backlog sanity checks 
     *					protocol-independent
     *
     *
     *		This program is free software; you can redistribute it and/or
     *		modify it under the terms of the GNU General Public License
     *		as published by the Free Software Foundation; either version
     *		2 of the License, or (at your option) any later version.
     *
     *
     *	This module is effectively the top level interface to the BSD socket
     *	paradigm. 
     *
     *	Based upon Swansea University Computer Society NET3.039
     */
    
    #include <linux/config.h>
    #include <linux/mm.h>
    #include <linux/smp_lock.h>
    #include <linux/socket.h>
    #include <linux/file.h>
    #include <linux/net.h>
    #include <linux/interrupt.h>
    #include <linux/netdevice.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>
    #include <linux/wanrouter.h>
    #include <linux/if_bridge.h>
    
    #include <linux/if_frad.h>
    #include <linux/if_vlan.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/init.h>
    #include <linux/poll.h>
    #include <linux/cache.h>
    #include <linux/module.h>
    #include <linux/highmem.h>
    #include <linux/divert.h>
    #include <linux/mount.h>
    #include <linux/security.h>
    #include <linux/syscalls.h>
    #include <linux/compat.h>
    #include <linux/kmod.h>
    
    #include <linux/audit.h>
    
    #include <linux/wireless.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #include <asm/uaccess.h>
    #include <asm/unistd.h>
    
    #include <net/compat.h>
    
    #include <net/sock.h>
    #include <linux/netfilter.h>
    
    /*
     *	Forward declarations of the handlers that are wired into
     *	socket_file_ops below; their definitions appear later in this file.
     */
    static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
    static ssize_t sock_aio_read(struct kiocb *iocb, char __user *buf,
    			 size_t size, loff_t pos);
    static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *buf,
    			  size_t size, loff_t pos);
    static int sock_mmap(struct file *file, struct vm_area_struct * vma);
    
    static int sock_close(struct inode *inode, struct file *file);
    static unsigned int sock_poll(struct file *file,
    			      struct poll_table_struct *wait);
    static long sock_ioctl(struct file *file,
    		      unsigned int cmd, unsigned long arg);
    static int sock_fasync(int fd, struct file *filp, int on);
    static ssize_t sock_readv(struct file *file, const struct iovec *vector,
    			  unsigned long count, loff_t *ppos);
    static ssize_t sock_writev(struct file *file, const struct iovec *vector,
    			  unsigned long count, loff_t *ppos);
    static ssize_t sock_sendpage(struct file *file, struct page *page,
    			     int offset, size_t size, loff_t *ppos, int more);
    
    
    /*
     *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
     *	in the operation structures but are done directly via the socketcall() multiplexor.
     */
    
    /*
     * File operations attached to every socket file by sock_map_fd().
     * sockfd_lookup() also compares a file's f_op against this table to
     * recognise socket files.
     */
    static struct file_operations socket_file_ops = {
    	.owner =	THIS_MODULE,
    	.llseek =	no_llseek,
    	.aio_read =	sock_aio_read,
    	.aio_write =	sock_aio_write,
    	.poll =		sock_poll,
    	.unlocked_ioctl = sock_ioctl,
    	.mmap =		sock_mmap,
    	.open =		sock_no_open,	/* special open code to disallow open via /proc */
    	.release =	sock_close,
    	.fasync =	sock_fasync,
    	.readv =	sock_readv,
    	.writev =	sock_writev,
    	.sendpage =	sock_sendpage
    };
    
    /*
     *	The protocol list. Each protocol is registered in here.
     */
    
    /* NPROTO slots; static storage, so every slot starts out NULL. */
    static struct net_proto_family *net_families[NPROTO];
    
    #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
    /*
     * net_family_lockct is raised by net_family_read_lock() and dropped by
     * net_family_read_unlock(); net_family_lock is the spinlock taken by
     * net_family_write_lock().
     */
    static atomic_t net_family_lockct = ATOMIC_INIT(0);
    static DEFINE_SPINLOCK(net_family_lock);
    
    /* The strategy is: modifications of the net_family vector are short, do
       not sleep and are very rare, but read access should be free of any
       exclusive locks.
     */
    
    /*
     * Take the writer side of the net_family lock.
     *
     * Grabs net_family_lock and returns with it held once the reader count
     * (net_family_lockct) is observed to be zero. While readers are still
     * counted, the spinlock is dropped, the CPU is yielded, and the attempt
     * is repeated.
     */
    static void net_family_write_lock(void)
    {
    	for (;;) {
    		spin_lock(&net_family_lock);
    		if (atomic_read(&net_family_lockct) == 0)
    			return;		/* leave with the spinlock held */

    		/* Readers still counted: back off and retry. */
    		spin_unlock(&net_family_lock);
    		yield();
    	}
    }
    
    /* Release the spinlock left held by net_family_write_lock(). */
    static __inline__ void net_family_write_unlock(void)
    {
    	spin_unlock(&net_family_lock);
    }
    
    /*
     * Enter a read-side section: bump the reader count first, then call
     * spin_unlock_wait() on the writer spinlock.
     * NOTE(review): spin_unlock_wait() presumably spins until no writer holds
     * net_family_lock -- confirm against the spinlock API.
     */
    static __inline__ void net_family_read_lock(void)
    {
    	atomic_inc(&net_family_lockct);
    	spin_unlock_wait(&net_family_lock);
    }
    
    /* Leave a read-side section: drop the count taken by net_family_read_lock(). */
    static __inline__ void net_family_read_unlock(void)
    {
    	atomic_dec(&net_family_lockct);
    }
    
    #else
    /* Neither CONFIG_SMP nor CONFIG_PREEMPT: the lock helpers compile to nothing. */
    #define net_family_write_lock() do { } while(0)
    #define net_family_write_unlock() do { } while(0)
    #define net_family_read_lock() do { } while(0)
    #define net_family_read_unlock() do { } while(0)
    #endif
    
    
    /*
     *	Statistics counters of the socket lists
     */
    
    /* Per-CPU counter: incremented in sock_alloc(), decremented in sock_release(). */
    static DEFINE_PER_CPU(int, sockets_in_use) = 0;
    
    /*
     *	Support routines. Move socket addresses back and forth across the kernel/user
     *	divide and look after the messy bits.
     */
    
    #define MAX_SOCK_ADDR	128		/* 108 for Unix domain - 
    					   16 for IP, 16 for IPX,
    					   24 for IPv6,
    					   about 80 for AX.25 
    					   must be at least one bigger than
    					   the AF_UNIX size (see net/unix/af_unix.c
    					   :unix_mkname()).  
    					 */
    					 
    /**
     *	move_addr_to_kernel	-	copy a socket address into kernel space
     *	@uaddr: Address in user space
     *	@ulen: Length in user space
     *	@kaddr: Address in kernel space
     *
     *	The address is copied into kernel space. If the provided length is
     *	negative or larger than MAX_SOCK_ADDR, -EINVAL is returned. A zero
     *	length copies nothing and returns 0. If the copy from user space
     *	faults, -EFAULT is returned. Otherwise the result of
     *	audit_sockaddr() on the copied address is returned.
     */

    int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
    {
    	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
    		return -EINVAL;
    	if (ulen == 0)
    		return 0;
    	if (copy_from_user(kaddr, uaddr, ulen))
    		return -EFAULT;

    	return audit_sockaddr(ulen, kaddr);
    }
    
    /**
     *	move_addr_to_user	-	copy an address to user space
     *	@kaddr: kernel space address
     *	@klen: length of address in kernel
     *	@uaddr: user space address
     *	@ulen: pointer to user length field
     *
     *	On entry *@ulen holds the size of the user buffer. At most that many
     *	bytes (and never more than @klen) are copied to @uaddr, after which
     *	*@ulen is overwritten with @klen, i.e. the untruncated length.
     *
     *	Returns the get_user() error if the length field cannot be read,
     *	-EINVAL if the clamped length is negative or exceeds MAX_SOCK_ADDR,
     *	-EFAULT if the copy to @uaddr faults, and otherwise the result of
     *	writing @klen back to @ulen (zero on success).
     */

    int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen)
    {
    	int avail;
    	int rc;

    	rc = get_user(avail, ulen);
    	if (rc)
    		return rc;

    	/* Never copy more than the kernel actually has. */
    	if (avail > klen)
    		avail = klen;
    	if (avail < 0 || avail > MAX_SOCK_ADDR)
    		return -EINVAL;

    	if (avail && copy_to_user(uaddr, kaddr, avail))
    		return -EFAULT;

    	/*
    	 *	"fromlen shall refer to the value before truncation.."
    	 *			1003.1g
    	 */
    	return __put_user(klen, ulen);
    }
    
    #define SOCKFS_MAGIC 0x534F434B
    
    
    /* Slab cache of struct socket_alloc objects; created by init_inodecache(). */
    static kmem_cache_t * sock_inode_cachep __read_mostly;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /*
     * alloc_inode hook for sockfs.
     *
     * Allocates a struct socket_alloc from sock_inode_cachep, resets the
     * embedded socket to an unconnected, unattached state and returns the
     * embedded VFS inode. Returns NULL if the slab allocation fails.
     * @sb is not used.
     */
    static struct inode *sock_alloc_inode(struct super_block *sb)
    {
    	struct socket_alloc *ei;

    	ei = kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL);
    	if (!ei)
    		return NULL;
    	init_waitqueue_head(&ei->socket.wait);

    	ei->socket.fasync_list = NULL;
    	ei->socket.state = SS_UNCONNECTED;
    	ei->socket.flags = 0;
    	ei->socket.ops = NULL;
    	ei->socket.sk = NULL;
    	ei->socket.file = NULL;

    	return &ei->vfs_inode;
    }
    
    /*
     * destroy_inode hook for sockfs: recover the enclosing socket_alloc from
     * the VFS inode and hand it back to sock_inode_cachep.
     */
    static void sock_destroy_inode(struct inode *inode)
    {
    	struct socket_alloc *ei;

    	ei = container_of(inode, struct socket_alloc, vfs_inode);
    	kmem_cache_free(sock_inode_cachep, ei);
    }
    
    /*
     * Slab constructor for sock_inode_cachep.
     *
     * Runs inode_init_once() on the embedded VFS inode, but only when @flags
     * has SLAB_CTOR_CONSTRUCTOR set and SLAB_CTOR_VERIFY clear. @cachep is
     * not used.
     */
    static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
    {
    	struct socket_alloc *ei = foo;
    	unsigned long ctor_bits;

    	ctor_bits = flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR);
    	if (ctor_bits != SLAB_CTOR_CONSTRUCTOR)
    		return;

    	inode_init_once(&ei->vfs_inode);
    }
     
    /*
     * Create the "sock_inode_cache" slab cache used for socket inodes.
     * Returns 0 on success or -ENOMEM if the cache could not be created.
     */
    static int init_inodecache(void)
    {
    	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
    				sizeof(struct socket_alloc),
    				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
    				init_once, NULL);

    	return sock_inode_cachep ? 0 : -ENOMEM;
    }
    
    /* Superblock operations for sockfs: inodes come from and return to the socket slab cache. */
    static struct super_operations sockfs_ops = {
    	.alloc_inode =	sock_alloc_inode,
    	.destroy_inode =sock_destroy_inode,
    	.statfs =	simple_statfs,
    };
    
    /*
     * get_sb hook for sockfs: delegates to get_sb_pseudo() with the "socket:"
     * name, sockfs_ops and SOCKFS_MAGIC. @flags, @dev_name and @data are not
     * used.
     */
    static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
    	int flags, const char *dev_name, void *data)
    {
    	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
    }
    
    
    /* Mount whose superblock supplies socket inodes (sock_alloc) and the root dentry (sock_map_fd). */
    static struct vfsmount *sock_mnt __read_mostly;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* Filesystem type descriptor for "sockfs". */
    static struct file_system_type sock_fs_type = {
    	.name =		"sockfs",
    	.get_sb =	sockfs_get_sb,
    	.kill_sb =	kill_anon_super,
    };
    /*
     * d_delete hook for socket dentries: unconditionally returns 1.
     * NOTE(review): presumably this asks the dcache not to keep unused socket
     * dentries around -- confirm against the d_delete contract.
     */
    static int sockfs_delete_dentry(struct dentry *dentry)
    {
    	return 1;
    }
    /* Dentry operations installed on each socket dentry by sock_map_fd(). */
    static struct dentry_operations sockfs_dentry_operations = {
    	.d_delete =	sockfs_delete_dentry,
    };
    
    /*
     *	Obtains the first available file descriptor and sets it up for use.
     *
     *	This function creates file structure and maps it to fd space
     *	of current process. On success it returns file descriptor
     *	and file struct implicitly stored in sock->file.
     *	Note that another thread may close file descriptor before we return
     *	from this function. We use the fact that now we do not refer
     *	to socket after mapping. If one day we will need it, this
     *	function will increment ref. count on file by 1.
     *
     *	In any case returned fd MAY BE not valid!
     *	This race condition is unavoidable
     *	with shared fd spaces, we cannot solve it inside kernel,
     *	but we take care of internal coherence yet.
     */
    
    int sock_map_fd(struct socket *sock)
    {
    	int fd;
    	struct qstr this;
    	char name[32];
    	struct file *file;

    	/*
    	 *	Find a file descriptor suitable for return to the user.
    	 */
    	fd = get_unused_fd();
    	if (fd < 0)
    		return fd;

    	file = get_empty_filp();
    	if (!file) {
    		put_unused_fd(fd);
    		return -ENFILE;
    	}

    	/* The dentry is named "[<inode number>]" and hashed by inode number. */
    	this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
    	this.name = name;
    	this.hash = SOCK_INODE(sock)->i_ino;

    	file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
    	if (!file->f_dentry) {
    		/* Undo both reservations made above. */
    		put_filp(file);
    		put_unused_fd(fd);
    		return -ENOMEM;
    	}
    	file->f_dentry->d_op = &sockfs_dentry_operations;
    	d_add(file->f_dentry, SOCK_INODE(sock));
    	file->f_vfsmnt = mntget(sock_mnt);
    	file->f_mapping = file->f_dentry->d_inode->i_mapping;

    	sock->file = file;
    	file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
    	file->f_mode = FMODE_READ | FMODE_WRITE;
    	file->f_flags = O_RDWR;
    	file->f_pos = 0;
    	/* Lets the file operations reach the socket without going via the inode. */
    	file->private_data = sock;

    	/* Publish last: after this the descriptor is visible to the process. */
    	fd_install(fd, file);
    	return fd;
    }
    
    /**
     *	sockfd_lookup	- 	Go from a file number to its socket slot
     *	@fd: file handle
     *	@err: pointer to an error code return
     *
     *	A reference is taken on the file behind @fd with fget() and the
     *	socket it is bound to is returned. On failure NULL is returned and
     *	*@err is set to -EBADF (no such file) or -ENOTSOCK (the file is not
     *	a socket; the file reference is dropped again). *@err is left
     *	untouched on success.
     *
     *	On a success the socket object pointer is returned and the file
     *	reference obtained by fget() is still held.
     */

    struct socket *sockfd_lookup(int fd, int *err)
    {
    	struct file *file;
    	struct inode *inode;
    	struct socket *sock;

    	if (!(file = fget(fd)))
    	{
    		*err = -EBADF;
    		return NULL;
    	}

    	/* Fast path: a file created by sock_map_fd() carries the socket directly. */
    	if (file->f_op == &socket_file_ops)
    		return file->private_data;	/* set in sock_map_fd */

    	inode = file->f_dentry->d_inode;
    	if (!S_ISSOCK(inode->i_mode)) {
    		*err = -ENOTSOCK;
    		fput(file);
    		return NULL;
    	}

    	sock = SOCKET_I(inode);
    	if (sock->file != file) {
    		/* Log the mismatch and re-point the socket at this file. */
    		printk(KERN_ERR "socki_lookup: socket file changed!\n");
    		sock->file = file;
    	}
    	return sock;
    }
    
    /**
     *	sock_alloc	-	allocate a socket
     *
     *	Obtains a fresh inode from the sockfs superblock, marks it as a
     *	socket inode (S_IFSOCK, rwx for everyone) owned by the caller's
     *	fsuid/fsgid, bumps the per-CPU sockets_in_use counter and returns
     *	the socket embedded alongside that inode. Returns NULL if no inode
     *	could be allocated.
     */

    static struct socket *sock_alloc(void)
    {
    	struct inode *inode = new_inode(sock_mnt->mnt_sb);

    	if (!inode)
    		return NULL;

    	inode->i_mode = S_IFSOCK|S_IRWXUGO;
    	inode->i_uid = current->fsuid;
    	inode->i_gid = current->fsgid;

    	get_cpu_var(sockets_in_use)++;
    	put_cpu_var(sockets_in_use);

    	return SOCKET_I(inode);
    }
    
    /*
     *	In theory you can't get an open on this inode, but /proc provides
     *	a back door. Remember to keep it shut otherwise you'll let the
     *	creepy crawlies in.
     */
      
    /* Open handler for socket files: ignores both arguments and always fails with -ENXIO. */
    static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
    {
    	return -ENXIO;
    }
    
    /* File operations whose only handler is an open that always fails (sock_no_open). */
    struct file_operations bad_sock_fops = {
    	.owner = THIS_MODULE,
    	.open = sock_no_open,
    };
    
    /**
     *	sock_release	-	close a socket
     *	@sock: socket to close
     *
     *	If the socket has protocol operations, their release callback is
     *	invoked, sock->ops is cleared and the reference on the owning module
     *	is dropped. A leftover fasync list is reported with an error message.
     *	The per-CPU sockets_in_use counter is decremented. Finally, a socket
     *	that has no file attached gives up its inode with iput(); otherwise
     *	only the sock->file back-pointer is cleared.
     */

    void sock_release(struct socket *sock)
    {
    	if (sock->ops) {
    		/* Remember the module before ops is cleared. */
    		struct module *mod = sock->ops->owner;

    		sock->ops->release(sock);
    		sock->ops = NULL;
    		module_put(mod);
    	}

    	if (sock->fasync_list)
    		printk(KERN_ERR "sock_release: fasync list not empty!\n");

    	get_cpu_var(sockets_in_use)--;
    	put_cpu_var(sockets_in_use);

    	if (sock->file)
    		sock->file = NULL;
    	else
    		iput(SOCK_INODE(sock));
    }
    
    /*
     * Common send path.
     *
     * Records the request (socket, message, size; scm cleared) in the
     * sock_iocb attached to @iocb, asks the security layer for permission
     * and, if that returns 0, hands the message to the protocol's sendmsg
     * operation. Returns the security error, or whatever sendmsg returns.
     */
    static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, 
    				 struct msghdr *msg, size_t size)
    {
    	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
    	int rc;

    	siocb->sock = sock;
    	siocb->scm = NULL;
    	siocb->msg = msg;
    	siocb->size = size;

    	rc = security_socket_sendmsg(sock, msg, size);

    	return rc ? rc : sock->ops->sendmsg(iocb, sock, msg, size);
    }
    
    /*
     * Send @msg on @sock using an on-stack kiocb/sock_iocb pair.
     *
     * If the send path answers -EIOCBQUEUED, the result is instead taken
     * from wait_on_sync_kiocb(). Returns the result of the send.
     */
    int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
    {
    	struct sock_iocb siocb;
    	struct kiocb iocb;
    	int rc;

    	init_sync_kiocb(&iocb, NULL);
    	iocb.private = &siocb;

    	rc = __sock_sendmsg(&iocb, sock, msg, size);
    	if (rc == -EIOCBQUEUED)
    		rc = wait_on_sync_kiocb(&iocb);

    	return rc;
    }
    
    /*
     * Send from kernel buffers.
     *
     * Points @msg at the @num kvec entries in @vec, switches the address
     * limit to KERNEL_DS for the duration of sock_sendmsg() and restores the
     * previous limit afterwards. Returns the result of sock_sendmsg().
     */
    int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
    		   struct kvec *vec, size_t num, size_t size)
    {
    	mm_segment_t oldfs = get_fs();
    	int result;

    	set_fs(KERNEL_DS);
    	/*
    	 * the following is safe, since for compiler definitions of kvec and
    	 * iovec are identical, yielding the same in-core layout and alignment
    	 */
    	msg->msg_iov = (struct iovec *)vec;
    	msg->msg_iovlen = num;
    	result = sock_sendmsg(sock, msg, size);
    	set_fs(oldfs);
    	return result;
    }
    
    /*
     * Common receive path.
     *
     * Records the request (socket, message, size, flags; scm cleared) in the
     * sock_iocb attached to @iocb, asks the security layer for permission
     * and, if that returns 0, calls the protocol's recvmsg operation.
     * Returns the security error, or whatever recvmsg returns.
     */
    static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, 
    				 struct msghdr *msg, size_t size, int flags)
    {
    	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
    	int rc;

    	siocb->sock = sock;
    	siocb->scm = NULL;
    	siocb->msg = msg;
    	siocb->size = size;
    	siocb->flags = flags;

    	rc = security_socket_recvmsg(sock, msg, size, flags);

    	return rc ? rc : sock->ops->recvmsg(iocb, sock, msg, size, flags);
    }
    
    /*
     * Receive into @msg from @sock using an on-stack kiocb/sock_iocb pair.
     *
     * If the receive path answers -EIOCBQUEUED, the result is instead taken
     * from wait_on_sync_kiocb(). Returns the result of the receive.
     */
    int sock_recvmsg(struct socket *sock, struct msghdr *msg, 
    		 size_t size, int flags)
    {
    	struct sock_iocb siocb;
    	struct kiocb iocb;
    	int rc;

    	init_sync_kiocb(&iocb, NULL);
    	iocb.private = &siocb;

    	rc = __sock_recvmsg(&iocb, sock, msg, size, flags);
    	if (rc == -EIOCBQUEUED)
    		rc = wait_on_sync_kiocb(&iocb);

    	return rc;
    }
    
    /*
     * Receive into kernel buffers.
     *
     * Points @msg at the @num kvec entries in @vec, switches the address
     * limit to KERNEL_DS for the duration of sock_recvmsg() and restores the
     * previous limit afterwards. Returns the result of sock_recvmsg().
     */
    int kernel_recvmsg(struct socket *sock, struct msghdr *msg, 
    		   struct kvec *vec, size_t num,
    		   size_t size, int flags)
    {
    	mm_segment_t oldfs = get_fs();
    	int result;

    	set_fs(KERNEL_DS);
    	/*
    	 * the following is safe, since for compiler definitions of kvec and
    	 * iovec are identical, yielding the same in-core layout and alignment
    	 */
    	msg->msg_iov = (struct iovec *)vec;
    	msg->msg_iovlen = num;
    	result = sock_recvmsg(sock, msg, size, flags);
    	set_fs(oldfs);
    	return result;
    }
    
    /*
     * kiocb destructor installed by alloc_sock_iocb() for non-synchronous
     * requests: frees the sock_iocb that was kmalloc'ed into iocb->private.
     */
    static void sock_aio_dtor(struct kiocb *iocb)
    {
    	kfree(iocb->private);
    }
    
    
    /*
     * sendpage file operation for sockets.
     *
     * Translates the file's O_NONBLOCK flag into MSG_DONTWAIT and a non-zero
     * @more into MSG_MORE, then forwards the page to the protocol's sendpage
     * operation. @ppos is not used. Returns whatever the protocol returns.
     */
    static ssize_t sock_sendpage(struct file *file, struct page *page,
    			     int offset, size_t size, loff_t *ppos, int more)
    {
    	struct socket *sock = file->private_data;
    	int flags = 0;

    	if (file->f_flags & O_NONBLOCK)
    		flags = MSG_DONTWAIT;
    	if (more)
    		flags |= MSG_MORE;

    	return sock->ops->sendpage(sock, page, offset, size, flags);
    }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    /*
     * Attach a sock_iocb to @iocb for a single-buffer read or write.
     *
     * For a synchronous kiocb the caller-provided @siocb is used as is. For
     * any other kiocb a sock_iocb is kmalloc'ed instead and sock_aio_dtor()
     * is installed as the kiocb destructor so that it gets freed.
     *
     * The chosen sock_iocb is linked to @iocb, its async_iov is pointed at
     * the user buffer (@ubuf, @size), and it is stored in iocb->private.
     * Returns the sock_iocb, or NULL if the allocation failed.
     */
    static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
    		char __user *ubuf, size_t size, struct sock_iocb *siocb)
    {
    	if (!is_sync_kiocb(iocb)) {
    		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
    		if (!siocb)
    			return NULL;
    		iocb->ki_dtor = sock_aio_dtor;
    	}

    	siocb->kiocb = iocb;
    	siocb->async_iov.iov_base = ubuf;
    	siocb->async_iov.iov_len = size;

    	iocb->private = siocb;
    	return siocb;
    }
    
    /*
     * Shared read path for sock_readv() and sock_aio_read().
     *
     * Sums the lengths of the @nr_segs entries in @iov, fills in @msg with
     * no name and no control data, sets MSG_DONTWAIT when the file is
     * O_NONBLOCK, and calls __sock_recvmsg() on the socket stored in
     * file->private_data. Returns the result of __sock_recvmsg().
     */
    static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
    		struct file *file, struct iovec *iov, unsigned long nr_segs)
    {
    	struct socket *sock = file->private_data;
    	size_t size = 0;
    	unsigned long i;	/* same type as nr_segs: no signed/unsigned compare */

    	for (i = 0; i < nr_segs; i++)
    		size += iov[i].iov_len;

    	msg->msg_name = NULL;
    	msg->msg_namelen = 0;
    	msg->msg_control = NULL;
    	msg->msg_controllen = 0;
    	msg->msg_iov = (struct iovec *) iov;
    	msg->msg_iovlen = nr_segs;
    	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;

    	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
    }
    
    /*
     * readv file operation for sockets.
     *
     * Runs do_sock_read() with an on-stack kiocb, sock_iocb and msghdr; if
     * it answers -EIOCBQUEUED the result is taken from wait_on_sync_kiocb()
     * instead. @ppos is not used.
     */
    static ssize_t sock_readv(struct file *file, const struct iovec *iov,
    			  unsigned long nr_segs, loff_t *ppos)
    {
    	struct kiocb iocb;
    	struct sock_iocb siocb;
    	struct msghdr msg;
    	int ret;

    	init_sync_kiocb(&iocb, NULL);
    	iocb.private = &siocb;

    	ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
    	if (-EIOCBQUEUED == ret)
    		ret = wait_on_sync_kiocb(&iocb);
    	return ret;
    }
    
    /*
     * aio_read file operation for sockets.
     *
     * Returns -ESPIPE for a non-zero @pos and 0 for a zero-length read.
     * Otherwise a sock_iocb is attached to @iocb (see alloc_sock_iocb();
     * -ENOMEM if that fails) and the read is issued through do_sock_read()
     * using the sock_iocb's own msghdr and single-entry iovec.
     */
    static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
    			 size_t count, loff_t pos)
    {
    	struct sock_iocb siocb, *x;

    	if (pos != 0)
    		return -ESPIPE;

    	if (count == 0)		/* Match SYS5 behaviour */
    		return 0;

    	x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
    	if (!x)
    		return -ENOMEM;
    	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
    			&x->async_iov, 1);
    }
    
    static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
    		struct file *file, struct iovec *iov, unsigned long nr_segs)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct socket *sock = file->private_data;
    	size_t size = 0;
    	int i;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
            for (i = 0 ; i < nr_segs ; i++)
                    size += iov[i].iov_len;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	msg->msg_name = NULL;
    	msg->msg_namelen = 0;
    	msg->msg_control = NULL;
    	msg->msg_controllen = 0;
    	msg->msg_iov = (struct iovec *) iov;
    	msg->msg_iovlen = nr_segs;
    	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
    	if (sock->type == SOCK_SEQPACKET)
    		msg->msg_flags |= MSG_EOR;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	return __sock_sendmsg(iocb, sock, msg, size);
    
    /*
     * writev file operation for sockets.
     *
     * Runs do_sock_write() with an on-stack kiocb, sock_iocb and msghdr; if
     * it answers -EIOCBQUEUED the result is taken from wait_on_sync_kiocb()
     * instead. @ppos is not used.
     */
    static ssize_t sock_writev(struct file *file, const struct iovec *iov,
    			   unsigned long nr_segs, loff_t *ppos)
    {
    	struct msghdr msg;
    	struct kiocb iocb;
    	struct sock_iocb siocb;
    	int ret;

    	init_sync_kiocb(&iocb, NULL);
    	iocb.private = &siocb;

    	ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
    	if (-EIOCBQUEUED == ret)
    		ret = wait_on_sync_kiocb(&iocb);
    	return ret;
    }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    /*
     * aio_write file operation for sockets.
     *
     * Returns -ESPIPE for a non-zero @pos and 0 for a zero-length write.
     * Otherwise a sock_iocb is attached to @iocb (see alloc_sock_iocb();
     * -ENOMEM if that fails) and the write is issued through
     * do_sock_write() using the sock_iocb's own msghdr and single-entry
     * iovec.
     */
    static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
    			  size_t count, loff_t pos)
    {
    	struct sock_iocb siocb, *x;

    	if (pos != 0)
    		return -ESPIPE;
    	if (count == 0)		/* Match SYS5 behaviour */
    		return 0;

    	x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
    	if (!x)
    		return -ENOMEM;

    	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
    			&x->async_iov, 1);
    }
    
    
    /*
     * Atomic setting of ioctl hooks to avoid race
     * with module unload.
     */
    
    /* Bridge ioctl hook; sock_ioctl() calls it with br_ioctl_mutex held. */
    static DECLARE_MUTEX(br_ioctl_mutex);
    static int (*br_ioctl_hook)(unsigned int cmd, void __user *arg) = NULL;
    
    /* Install (or, with a NULL @hook, clear) the bridge ioctl hook under its mutex. */
    void brioctl_set(int (*hook)(unsigned int, void __user *))
    {
    	down(&br_ioctl_mutex);
    	br_ioctl_hook = hook;
    	up(&br_ioctl_mutex);
    }
    EXPORT_SYMBOL(brioctl_set);
    
    /* VLAN ioctl hook; sock_ioctl() calls it with vlan_ioctl_mutex held. */
    static DECLARE_MUTEX(vlan_ioctl_mutex);
    static int (*vlan_ioctl_hook)(void __user *arg);
    
    /* Install (or, with a NULL @hook, clear) the VLAN ioctl hook under its mutex. */
    void vlan_ioctl_set(int (*hook)(void __user *))
    {
    	down(&vlan_ioctl_mutex);
    	vlan_ioctl_hook = hook;
    	up(&vlan_ioctl_mutex);
    }
    EXPORT_SYMBOL(vlan_ioctl_set);
    
    /* DLCI ioctl hook; sock_ioctl() calls it with dlci_ioctl_mutex held. */
    static DECLARE_MUTEX(dlci_ioctl_mutex);
    static int (*dlci_ioctl_hook)(unsigned int, void __user *);
    
    /* Install (or, with a NULL @hook, clear) the DLCI ioctl hook under its mutex. */
    void dlci_ioctl_set(int (*hook)(unsigned int, void __user *))
    {
    	down(&dlci_ioctl_mutex);
    	dlci_ioctl_hook = hook;
    	up(&dlci_ioctl_mutex);
    }
    EXPORT_SYMBOL(dlci_ioctl_set);
    
    /*
     *	With an ioctl, arg may well be a user mode pointer, but we don't know
     *	what to do with it - that's up to the protocol still.
     */
    
    static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
    {
    	struct socket *sock;
    	void __user *argp = (void __user *)arg;
    	int pid, err;
    
    
    	sock = file->private_data;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
    		err = dev_ioctl(cmd, argp);
    	} else
    
    #ifdef CONFIG_WIRELESS_EXT
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
    		err = dev_ioctl(cmd, argp);
    	} else
    
    #endif	/* CONFIG_WIRELESS_EXT */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	switch (cmd) {
    		case FIOSETOWN:
    		case SIOCSPGRP:
    			err = -EFAULT;
    			if (get_user(pid, (int __user *)argp))
    				break;
    			err = f_setown(sock->file, pid, 1);
    			break;
    		case FIOGETOWN:
    		case SIOCGPGRP:
    			err = put_user(sock->file->f_owner.pid, (int __user *)argp);
    			break;
    		case SIOCGIFBR:
    		case SIOCSIFBR:
    		case SIOCBRADDBR:
    		case SIOCBRDELBR:
    			err = -ENOPKG;
    			if (!br_ioctl_hook)
    				request_module("bridge");
    
    			down(&br_ioctl_mutex);
    			if (br_ioctl_hook) 
    				err = br_ioctl_hook(cmd, argp);
    			up(&br_ioctl_mutex);
    			break;
    		case SIOCGIFVLAN:
    		case SIOCSIFVLAN:
    			err = -ENOPKG;
    			if (!vlan_ioctl_hook)
    				request_module("8021q");
    
    			down(&vlan_ioctl_mutex);
    			if (vlan_ioctl_hook)
    				err = vlan_ioctl_hook(argp);
    			up(&vlan_ioctl_mutex);
    			break;
    		case SIOCGIFDIVERT:
    		case SIOCSIFDIVERT:
    		/* Convert this to call through a hook */
    			err = divert_ioctl(cmd, argp);
    			break;
    		case SIOCADDDLCI:
    		case SIOCDELDLCI:
    			err = -ENOPKG;
    			if (!dlci_ioctl_hook)
    				request_module("dlci");
    
    			if (dlci_ioctl_hook) {
    				down(&dlci_ioctl_mutex);
    				err = dlci_ioctl_hook(cmd, argp);
    				up(&dlci_ioctl_mutex);
    			}
    			break;
    		default:
    			err = sock->ops->ioctl(sock, cmd, arg);
    
    
    			/*
    			 * If this ioctl is unknown try to hand it down
    			 * to the NIC driver.
    			 */
    			if (err == -ENOIOCTLCMD)
    				err = dev_ioctl(cmd, argp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    	}
    	return err;
    }
    
    /*
     * Allocate a bare socket without attaching any protocol.
     *
     * Asks the security layer whether the socket may be created, allocates
     * it with sock_alloc(), notifies the security layer of the creation and
     * records @type. *@res receives the new socket, or NULL on any failure.
     * Returns 0 on success, the security layer's error, or -ENOMEM if no
     * socket could be allocated.
     */
    int sock_create_lite(int family, int type, int protocol, struct socket **res)
    {
    	struct socket *sock;
    	int err;

    	err = security_socket_create(family, type, protocol, 1);
    	if (err) {
    		*res = NULL;
    		return err;
    	}

    	sock = sock_alloc();
    	if (!sock) {
    		*res = NULL;
    		return -ENOMEM;
    	}

    	security_socket_post_create(sock, family, type, protocol, 1);
    	sock->type = type;

    	*res = sock;
    	return 0;
    }
    
    /*
     * poll file operation for sockets: forwards to the protocol's poll
     * operation for the socket stored in file->private_data.
     *
     * No kernel lock held - perfect
     */
    static unsigned int sock_poll(struct file *file, poll_table * wait)
    {
    	struct socket *sock;

    	/*
    	 *	We can't return errors to poll, so it's either yes or no. 
    	 */
    	sock = file->private_data;
    	return sock->ops->poll(file, sock, wait);
    }
    
    static int sock_mmap(struct file * file, struct vm_area_struct * vma)
    {
    
    	struct socket *sock = file->private_data;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	return sock->ops->mmap(file, sock, vma);
    }
    
    
    static int sock_close(struct inode *inode, struct file *filp)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	/*
    	 *	It was possible the inode is NULL we were 
    	 *	closing an unfinished socket. 
    	 */
    
    	if (!inode)
    	{
    		printk(KERN_DEBUG "sock_close: NULL inode\n");
    		return 0;
    	}
    	sock_fasync(-1, filp, 0);
    	sock_release(SOCKET_I(inode));
    	return 0;
    }
    
    /*
     *	Update the socket async list
     *
     *	Fasync_list locking strategy.
     *
     *	1. fasync_list is modified only under process context socket lock
     *	   i.e. under semaphore.
     *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
     *	   or under socket lock.
     *	3. fasync_list can be used from softirq context, so that
     *	   modification under socket lock have to be enhanced with
     *	   write_lock_bh(&sk->sk_callback_lock).
     *							--ANK (990710)
     */
    
    static int sock_fasync(int fd, struct file *filp, int on)
    {
    	struct fasync_struct *fa, *fna=NULL, **prev;
    	struct socket *sock;
    	struct sock *sk;
    
    	if (on)
    	{
    
    		fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if(fna==NULL)
    			return -ENOMEM;
    	}
    
    
    	sock = filp->private_data;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if ((sk=sock->sk) == NULL) {