Skip to content
Snippets Groups Projects
socket.c 52.8 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * NET		An implementation of the SOCKET network access protocol.
     *
     * Version:	@(#)socket.c	1.1.93	18/02/95
     *
     * Authors:	Orest Zborowski, <obz@Kodak.COM>
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *
     * Fixes:
     *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
     *					shutdown()
     *		Alan Cox	:	verify_area() fixes
     *		Alan Cox	:	Removed DDI
     *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
     *		Alan Cox	:	Moved a load of checks to the very
     *					top level.
     *		Alan Cox	:	Move address structures to/from user
     *					mode above the protocol layers.
     *		Rob Janssen	:	Allow 0 length sends.
     *		Alan Cox	:	Asynchronous I/O support (cribbed from the
     *					tty drivers).
     *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
     *		Jeff Uphoff	:	Made max number of sockets command-line
     *					configurable.
     *		Matti Aarnio	:	Made the number of sockets dynamic,
     *					to be allocated when needed, and mr.
     *					Uphoff's max is used as max to be
     *					allowed to allocate.
     *		Linus		:	Argh. removed all the socket allocation
     *					altogether: it's in the inode now.
     *		Alan Cox	:	Made sock_alloc()/sock_release() public
     *					for NetROM and future kernel nfsd type
     *					stuff.
     *		Alan Cox	:	sendmsg/recvmsg basics.
     *		Tom Dyas	:	Export net symbols.
     *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
     *		Alan Cox	:	Added thread locking to sys_* calls
     *					for sockets. May have errors at the
     *					moment.
     *		Kevin Buhr	:	Fixed the dumb errors in the above.
     *		Andi Kleen	:	Some small cleanups, optimizations,
     *					and fixed a copy_from_user() bug.
     *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
     *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
     *					protocol-independent
     *
     *
     *		This program is free software; you can redistribute it and/or
     *		modify it under the terms of the GNU General Public License
     *		as published by the Free Software Foundation; either version
     *		2 of the License, or (at your option) any later version.
     *
     *
     *	This module is effectively the top level interface to the BSD socket
     *	paradigm.
     *
     *	Based upon Swansea University Computer Society NET3.039
     */
    
    #include <linux/mm.h>
    #include <linux/socket.h>
    #include <linux/file.h>
    #include <linux/net.h>
    #include <linux/interrupt.h>
    
    #include <linux/rcupdate.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/netdevice.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    #include <linux/mutex.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/wanrouter.h>
    #include <linux/if_bridge.h>
    
    #include <linux/if_frad.h>
    #include <linux/if_vlan.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/init.h>
    #include <linux/poll.h>
    #include <linux/cache.h>
    #include <linux/module.h>
    #include <linux/highmem.h>
    #include <linux/mount.h>
    #include <linux/security.h>
    #include <linux/syscalls.h>
    #include <linux/compat.h>
    #include <linux/kmod.h>
    
    #include <linux/audit.h>
    
    #include <linux/wireless.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #include <asm/uaccess.h>
    #include <asm/unistd.h>
    
    #include <net/compat.h>
    
    #include <net/sock.h>
    #include <linux/netfilter.h>
    
    static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
    
    static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
    			 unsigned long nr_segs, loff_t pos);
    static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
    			  unsigned long nr_segs, loff_t pos);
    
    static int sock_mmap(struct file *file, struct vm_area_struct *vma);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    static int sock_close(struct inode *inode, struct file *file);
    static unsigned int sock_poll(struct file *file,
    			      struct poll_table_struct *wait);
    
    static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
    
    #ifdef CONFIG_COMPAT
    static long compat_sock_ioctl(struct file *file,
    
    			      unsigned int cmd, unsigned long arg);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static int sock_fasync(int fd, struct file *filp, int on);
    static ssize_t sock_sendpage(struct file *file, struct page *page,
    			     int offset, size_t size, loff_t *ppos, int more);
    
    /*
     *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
     *	in the operation structures but are done directly via the socketcall() multiplexor.
     */
    
    static struct file_operations socket_file_ops = {
    	.owner =	THIS_MODULE,
    	.llseek =	no_llseek,
    	.aio_read =	sock_aio_read,
    	.aio_write =	sock_aio_write,
    	.poll =		sock_poll,
    	.unlocked_ioctl = sock_ioctl,
    
    #ifdef CONFIG_COMPAT
    	.compat_ioctl = compat_sock_ioctl,
    #endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	.mmap =		sock_mmap,
    	.open =		sock_no_open,	/* special open code to disallow open via /proc */
    	.release =	sock_close,
    	.fasync =	sock_fasync,
    
    	.sendpage =	sock_sendpage,
    	.splice_write = generic_splice_sendpage,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    };
    
    /*
     *	The protocol list. Each protocol is registered in here.
     */
    
    static DEFINE_SPINLOCK(net_family_lock);
    
    static const struct net_proto_family *net_families[NPROTO] __read_mostly;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /*
     *	Statistics counters of the socket lists
     */
    
    static DEFINE_PER_CPU(int, sockets_in_use) = 0;
    
    /*
     * Support routines.
     * Move socket addresses back and forth across the kernel/user
     * divide and look after the messy bits.
     */
    
    
    #define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
    					   16 for IP, 16 for IPX,
    					   24 for IPv6,
    					   about 80 for AX.25
    					   must be at least one bigger than
    					   the AF_UNIX size (see net/unix/af_unix.c
    					   :unix_mkname()).
    					 */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /**
     *	move_addr_to_kernel	-	copy a socket address into kernel space
     *	@uaddr: Address in user space
     *	@ulen: Length in user space
     *	@kaddr: Address in kernel space
     *
     *	The address is copied into kernel space. If the provided length is
     *	negative or larger than MAX_SOCK_ADDR, -EINVAL is returned. A zero
     *	length is accepted and returns 0 without touching @kaddr. If the
     *	copy from user space fails, -EFAULT is returned. Otherwise the
     *	result of audit_sockaddr() on the copied address is returned.
     */

    int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
    {
    	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
    		return -EINVAL;
    	if (ulen == 0)
    		return 0;
    	if (copy_from_user(kaddr, uaddr, ulen))
    		return -EFAULT;
    	return audit_sockaddr(ulen, kaddr);
    }
    
    /**
     *	move_addr_to_user	-	copy an address to user space
     *	@kaddr: kernel space address
     *	@klen: length of address in kernel
     *	@uaddr: user space address
     *	@ulen: pointer to user length field
     *
     *	The value pointed to by ulen on entry is the buffer length available.
     *	This is overwritten with the buffer space used. -EINVAL is returned
     *	if an overlong buffer is specified or a negative buffer size. -EFAULT
     *	is returned if either the buffer or the length field are not
     *	accessible.
     *	After copying the data up to the limit the user specifies, the true
     *	length of the data is written over the length limit the user
     *	specified. Zero is returned for a success.
     */
    
    
    int move_addr_to_user(void *kaddr, int klen, void __user *uaddr,
    		      int __user *ulen)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	int err;
    	int len;
    
    
    	err = get_user(len, ulen);
    	if (err)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return err;
    
    	if (len > klen)
    		len = klen;
    	if (len < 0 || len > MAX_SOCK_ADDR)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -EINVAL;
    
    Steve Grubb's avatar
    Steve Grubb committed
    		if (audit_sockaddr(klen, kaddr))
    			return -ENOMEM;
    
    		if (copy_to_user(uaddr, kaddr, len))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return -EFAULT;
    	}
    	/*
    
    	 *      "fromlen shall refer to the value before truncation.."
    	 *                      1003.1g
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 */
    	return __put_user(klen, ulen);
    }
    
    #define SOCKFS_MAGIC 0x534F434B
    
    
    static kmem_cache_t *sock_inode_cachep __read_mostly;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /*
     * Allocate a socket_alloc (socket + embedded VFS inode) from
     * sock_inode_cachep and put the socket half into its initial state:
     * empty wait queue, no fasync list, SS_UNCONNECTED, no flags, and no
     * ops/sk/file attached yet.
     *
     * Returns the embedded inode, or NULL if the cache allocation failed.
     */
    static struct inode *sock_alloc_inode(struct super_block *sb)
    {
    	struct socket_alloc *ei;

    	ei = kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL);
    	if (!ei)
    		return NULL;
    	init_waitqueue_head(&ei->socket.wait);

    	ei->socket.fasync_list = NULL;
    	ei->socket.state = SS_UNCONNECTED;
    	ei->socket.flags = 0;
    	ei->socket.ops = NULL;
    	ei->socket.sk = NULL;
    	ei->socket.file = NULL;

    	return &ei->vfs_inode;
    }
    
    /*
     * Counterpart of sock_alloc_inode(): recover the socket_alloc that
     * embeds @inode and hand it back to sock_inode_cachep.
     */
    static void sock_destroy_inode(struct inode *inode)
    {
    	struct socket_alloc *container;

    	container = container_of(inode, struct socket_alloc, vfs_inode);
    	kmem_cache_free(sock_inode_cachep, container);
    }
    
    
    /*
     * Slab constructor for sock_inode_cachep (registered in
     * init_inodecache()). The embedded VFS inode is initialised only when
     * the cache invokes us as a real constructor, i.e. SLAB_CTOR_CONSTRUCTOR
     * is set and SLAB_CTOR_VERIFY is not.
     */
    static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
    {
    	struct socket_alloc *ei = foo;

    	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR))
    	    == SLAB_CTOR_CONSTRUCTOR)
    		inode_init_once(&ei->vfs_inode);
    }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static int init_inodecache(void)
    {
    	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
    
    					      sizeof(struct socket_alloc),
    					      0,
    					      (SLAB_HWCACHE_ALIGN |
    					       SLAB_RECLAIM_ACCOUNT |
    					       SLAB_MEM_SPREAD),
    					      init_once,
    					      NULL);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (sock_inode_cachep == NULL)
    		return -ENOMEM;
    	return 0;
    }
    
    /*
     * Superblock operations for the socket pseudo-filesystem: inodes come
     * from and return to the socket inode cache; statfs uses the generic
     * simple_statfs helper.
     */
    static struct super_operations sockfs_ops = {
    	.alloc_inode	= sock_alloc_inode,
    	.destroy_inode	= sock_destroy_inode,
    	.statfs		= simple_statfs,
    };
    
    
    static int sockfs_get_sb(struct file_system_type *fs_type,
    
    			 int flags, const char *dev_name, void *data,
    			 struct vfsmount *mnt)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
    			     mnt);
    
    static struct vfsmount *sock_mnt __read_mostly;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /*
     * Filesystem type descriptor named "sockfs": its superblock is obtained
     * through sockfs_get_sb() and torn down with kill_anon_super().
     */
    static struct file_system_type sock_fs_type = {
    	.name =		"sockfs",
    	.get_sb =	sockfs_get_sb,
    	.kill_sb =	kill_anon_super,
    };
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * d_delete hook for socket dentries (see sockfs_dentry_operations).
     * Ignores @dentry and always returns 1.
     * NOTE(review): presumably a non-zero return tells the dcache not to
     * keep the dentry cached once unused - confirm against the VFS docs.
     */
    static int sockfs_delete_dentry(struct dentry *dentry)
    {
    	return 1;
    }
    /* Dentry operations attached to every socket dentry in sock_attach_fd(). */
    static struct dentry_operations sockfs_dentry_operations = {
    	.d_delete = sockfs_delete_dentry,
    };
    
    /*
     *	Obtains the first available file descriptor and sets it up for use.
     *
    
     *	These functions create file structures and map them to the fd space
     *	of the current process. On success the file descriptor is returned
     *	and the file struct is implicitly stored in sock->file.
     *	Note that another thread may close file descriptor before we return
     *	from this function. We use the fact that now we do not refer
     *	to socket after mapping. If one day we will need it, this
     *	function will increment ref. count on file by 1.
     *
     *	In any case returned fd MAY BE not valid!
     *	This race condition is unavoidable
     *	with shared fd spaces, we cannot solve it inside kernel,
     *	but we take care of internal coherence yet.
     */
    
    
    static int sock_alloc_fd(struct file **filep)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	int fd;
    
    	fd = get_unused_fd();
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		struct file *file = get_empty_filp();
    
    
    		*filep = file;
    		if (unlikely(!file)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			put_unused_fd(fd);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    /*
     * Bind @sock to @file.
     *
     * A dentry named "[<inode number>]" is allocated under the sockfs root
     * and attached to the socket's inode; @file is then pointed at that
     * dentry, at the sockfs mount, and at socket_file_ops, opened
     * read/write, with @sock stored in file->private_data and @file stored
     * in sock->file.
     *
     * Returns 0 on success or -ENOMEM if the dentry could not be allocated
     * (in which case @file and @sock are otherwise left untouched).
     */
    static int sock_attach_fd(struct socket *sock, struct file *file)
    {
    	struct qstr this;
    	char name[32];

    	this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
    	this.name = name;
    	this.hash = SOCK_INODE(sock)->i_ino;

    	file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
    	if (unlikely(!file->f_dentry))
    		return -ENOMEM;

    	file->f_dentry->d_op = &sockfs_dentry_operations;
    	d_add(file->f_dentry, SOCK_INODE(sock));
    	file->f_vfsmnt = mntget(sock_mnt);
    	file->f_mapping = file->f_dentry->d_inode->i_mapping;

    	sock->file = file;
    	file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
    	file->f_mode = FMODE_READ | FMODE_WRITE;
    	file->f_flags = O_RDWR;
    	file->f_pos = 0;
    	file->private_data = sock;

    	return 0;
    }
    
    int sock_map_fd(struct socket *sock)
    {
    	struct file *newfile;
    	int fd = sock_alloc_fd(&newfile);
    
    	if (likely(fd >= 0)) {
    		int err = sock_attach_fd(sock, newfile);
    
    		if (unlikely(err < 0)) {
    			put_filp(newfile);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			put_unused_fd(fd);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    	return fd;
    }
    
    
    /*
     * Map an open file to the socket behind it.
     *
     * Files whose f_op is socket_file_ops carry the socket in private_data
     * and are answered directly. Any other file must refer to a socket
     * inode; if it does not, *@err is set to -ENOTSOCK and NULL is
     * returned. When the socket's recorded file differs from @file, an
     * error is logged and the socket is re-pointed at @file.
     *
     * *@err is only written on the -ENOTSOCK path.
     */
    static struct socket *sock_from_file(struct file *file, int *err)
    {
    	struct inode *ino;
    	struct socket *found;

    	/* Fast path: private_data was set when the fd was mapped. */
    	if (file->f_op == &socket_file_ops)
    		return file->private_data;	/* set in sock_map_fd */

    	ino = file->f_dentry->d_inode;
    	if (S_ISSOCK(ino->i_mode)) {
    		found = SOCKET_I(ino);
    		if (found->file != file) {
    			printk(KERN_ERR "socki_lookup: socket file changed!\n");
    			found->file = file;
    		}
    		return found;
    	}

    	*err = -ENOTSOCK;
    	return NULL;
    }
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /**
     *	sockfd_lookup	- 	Go from a file number to its socket slot
     *	@fd: file handle
     *	@err: pointer to an error code return
     *
     *	The file handle passed in is locked and the socket it is bound
     *	to is returned. If an error occurs the err pointer is overwritten
     *	with a negative errno code and NULL is returned. The function checks
     *	for both invalid handles and passing a handle which is not a socket.
     *
     *	On a success the socket object pointer is returned.
     */
    
    struct socket *sockfd_lookup(int fd, int *err)
    {
    	struct file *file;
    	struct socket *sock;
    
    
    	file = fget(fd);
    	if (!file) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		*err = -EBADF;
    		return NULL;
    	}
    
    	sock = sock_from_file(file, err);
    	if (!sock)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		fput(file);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
    {
    	struct file *file;
    	struct socket *sock;
    
    
    	file = fget_light(fd, fput_needed);
    	if (file) {
    		sock = sock_from_file(file, err);
    		if (sock)
    			return sock;
    		fput_light(file, *fput_needed);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /**
     *	sock_alloc	-	allocate a socket
     *
     *	Allocate a new inode and socket object. The two are bound together
     *	and initialised. The socket is then returned. If we are out of inodes
     *	NULL is returned.
     */
    
    static struct socket *sock_alloc(void)
    {
    
    	struct inode *inode;
    	struct socket *sock;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	inode = new_inode(sock_mnt->mnt_sb);
    	if (!inode)
    		return NULL;
    
    	sock = SOCKET_I(inode);
    
    
    	inode->i_mode = S_IFSOCK | S_IRWXUGO;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	inode->i_uid = current->fsuid;
    	inode->i_gid = current->fsgid;
    
    	get_cpu_var(sockets_in_use)++;
    	put_cpu_var(sockets_in_use);
    	return sock;
    }
    
    /*
     *	In theory you can't get an open on this inode, but /proc provides
     *	a back door. Remember to keep it shut otherwise you'll let the
     *	creepy crawlies in.
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * open() handler used by socket_file_ops and bad_sock_fops: both
     * arguments are ignored and every attempt is refused with -ENXIO.
     */
    static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
    {
    	return -ENXIO;
    }
    
    
    const struct file_operations bad_sock_fops = {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	.owner = THIS_MODULE,
    	.open = sock_no_open,
    };
    
    /**
     *	sock_release	-	close a socket
     *	@sock: socket to close
     *
     *	The socket is released from the protocol stack if it has a release
     *	callback, and the inode is then released if the socket is bound to
    
     *	an inode not a file.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * Release @sock.
     *
     * If protocol ops are attached, the protocol's release callback is
     * invoked, sock->ops is cleared and the reference on the owning module
     * is dropped. A non-empty fasync list at this point is reported as an
     * error. The per-CPU sockets_in_use counter is decremented.
     *
     * When the socket has no file attached, its inode reference is dropped
     * here with iput(); otherwise only the sock->file back-pointer is
     * cleared.
     */
    void sock_release(struct socket *sock)
    {
    	if (sock->ops) {
    		/* Save the owner first: ->release() may invalidate ops. */
    		struct module *owner = sock->ops->owner;

    		sock->ops->release(sock);
    		sock->ops = NULL;
    		module_put(owner);
    	}

    	if (sock->fasync_list)
    		printk(KERN_ERR "sock_release: fasync list not empty!\n");

    	get_cpu_var(sockets_in_use)--;
    	put_cpu_var(sockets_in_use);
    	if (!sock->file) {
    		iput(SOCK_INODE(sock));
    		return;
    	}

    	sock->file = NULL;
    }

    /*
     * Common send path. Records the request (socket, message, size; no scm
     * data yet) in the sock_iocb attached to @iocb, asks the security layer
     * via security_socket_sendmsg(), and, if that returns 0, passes the
     * request to the protocol's sendmsg op.
     *
     * Returns the security hook's non-zero result, or else whatever the
     * protocol's sendmsg op returns.
     */
    static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
    				 struct msghdr *msg, size_t size)
    {
    	struct sock_iocb *si = kiocb_to_siocb(iocb);
    	int err;

    	si->sock = sock;
    	si->scm = NULL;
    	si->msg = msg;
    	si->size = size;

    	err = security_socket_sendmsg(sock, msg, size);
    	if (err)
    		return err;

    	return sock->ops->sendmsg(iocb, sock, msg, size);
    }
    
    /*
     * Synchronous send: build an on-stack sync kiocb with an on-stack
     * sock_iocb as its private data, issue the send through
     * __sock_sendmsg(), and if the request was queued (-EIOCBQUEUED) wait
     * for it to complete and return the completion result instead.
     */
    int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
    {
    	struct sock_iocb siocb;
    	struct kiocb iocb;
    	int rc;

    	init_sync_kiocb(&iocb, NULL);
    	iocb.private = &siocb;

    	rc = __sock_sendmsg(&iocb, sock, msg, size);
    	if (rc != -EIOCBQUEUED)
    		return rc;

    	return wait_on_sync_kiocb(&iocb);
    }
    
    /*
     * Send from kernel buffers: install @vec/@num as the message's iovec,
     * switch the address limit to KERNEL_DS for the duration of
     * sock_sendmsg(), then restore the previous limit.
     *
     * Returns the result of sock_sendmsg().
     */
    int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
    		   struct kvec *vec, size_t num, size_t size)
    {
    	mm_segment_t oldfs = get_fs();
    	int result;

    	set_fs(KERNEL_DS);
    	/*
    	 * the following is safe, since for compiler definitions of kvec and
    	 * iovec are identical, yielding the same in-core layout and alignment
    	 */
    	msg->msg_iov = (struct iovec *)vec;
    	msg->msg_iovlen = num;
    	result = sock_sendmsg(sock, msg, size);
    	set_fs(oldfs);
    	return result;
    }
    
    
    /*
     * Common receive path. Records the request (socket, message, size,
     * flags; no scm data yet) in the sock_iocb attached to @iocb, asks the
     * security layer via security_socket_recvmsg(), and, if that returns 0,
     * passes the request to the protocol's recvmsg op.
     *
     * Returns the security hook's non-zero result, or else whatever the
     * protocol's recvmsg op returns.
     */
    static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
    				 struct msghdr *msg, size_t size, int flags)
    {
    	int err;
    	struct sock_iocb *si = kiocb_to_siocb(iocb);

    	si->sock = sock;
    	si->scm = NULL;
    	si->msg = msg;
    	si->size = size;
    	si->flags = flags;

    	err = security_socket_recvmsg(sock, msg, size, flags);
    	if (err)
    		return err;

    	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
    }
    
    
    int sock_recvmsg(struct socket *sock, struct msghdr *msg,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		 size_t size, int flags)
    {
    	struct kiocb iocb;
    	struct sock_iocb siocb;
    	int ret;
    
    
    	init_sync_kiocb(&iocb, NULL);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	iocb.private = &siocb;
    	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
    	if (-EIOCBQUEUED == ret)
    		ret = wait_on_sync_kiocb(&iocb);
    	return ret;
    }
    
    
    /*
     * Receive into kernel buffers: install @vec/@num as the message's
     * iovec, switch the address limit to KERNEL_DS for the duration of
     * sock_recvmsg(), then restore the previous limit.
     *
     * Returns the result of sock_recvmsg().
     */
    int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
    		   struct kvec *vec, size_t num, size_t size, int flags)
    {
    	mm_segment_t oldfs = get_fs();
    	int result;

    	set_fs(KERNEL_DS);
    	/*
    	 * the following is safe, since for compiler definitions of kvec and
    	 * iovec are identical, yielding the same in-core layout and alignment
    	 */
    	msg->msg_iov = (struct iovec *)vec;
    	msg->msg_iovlen = num;
    	result = sock_recvmsg(sock, msg, size, flags);
    	set_fs(oldfs);
    	return result;
    }
    
    /*
     * kiocb destructor installed by alloc_sock_iocb() for non-sync iocbs:
     * frees the sock_iocb that was kmalloc'ed into iocb->private.
     */
    static void sock_aio_dtor(struct kiocb *iocb)
    {
    	kfree(iocb->private);
    }
    
    
    /*
     * sendpage file operation for sockets. Translates the file's
     * O_NONBLOCK flag into MSG_DONTWAIT and a non-zero @more into MSG_MORE,
     * then forwards the page to the protocol's sendpage op. @ppos is not
     * used.
     *
     * Returns whatever the protocol's sendpage op returns.
     */
    static ssize_t sock_sendpage(struct file *file, struct page *page,
    			     int offset, size_t size, loff_t *ppos, int more)
    {
    	struct socket *sock;
    	int flags;

    	sock = file->private_data;

    	flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
    	if (more)
    		flags |= MSG_MORE;

    	return sock->ops->sendpage(sock, page, offset, size, flags);
    }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
    
    					 struct sock_iocb *siocb)
    
    {
    	if (!is_sync_kiocb(iocb)) {
    		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
    		if (!siocb)
    			return NULL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		iocb->ki_dtor = sock_aio_dtor;
    	}
    
    
    	siocb->kiocb = iocb;
    	iocb->private = siocb;
    	return siocb;
    
    /*
     * Fill in @msg for a plain read on a socket file and issue it.
     *
     * The total request size is the sum of the iov_len of all @nr_segs
     * segments. The message carries no name and no control data, uses @iov
     * as its vector, and gets MSG_DONTWAIT when the file is O_NONBLOCK.
     *
     * Returns the result of __sock_recvmsg().
     */
    static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
    		struct file *file, const struct iovec *iov,
    		unsigned long nr_segs)
    {
    	struct socket *sock = file->private_data;
    	size_t size = 0;
    	unsigned long i;	/* same type as nr_segs: no mixed-sign compare */

    	for (i = 0; i < nr_segs; i++)
    		size += iov[i].iov_len;

    	msg->msg_name = NULL;
    	msg->msg_namelen = 0;
    	msg->msg_control = NULL;
    	msg->msg_controllen = 0;
    	/* msg_iov is not const-qualified, hence the cast. */
    	msg->msg_iov = (struct iovec *)iov;
    	msg->msg_iovlen = nr_segs;
    	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;

    	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
    }
    
    
    static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
    				unsigned long nr_segs, loff_t pos)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (pos != 0)
    		return -ESPIPE;
    
    
    	if (iocb->ki_left == 0)	/* Match SYS5 behaviour */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return 0;
    
    
    
    	x = alloc_sock_iocb(iocb, &siocb);
    
    	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
    
    static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
    
    			struct file *file, const struct iovec *iov,
    			unsigned long nr_segs)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct socket *sock = file->private_data;
    	size_t size = 0;
    	int i;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	for (i = 0; i < nr_segs; i++)
    		size += iov[i].iov_len;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	msg->msg_name = NULL;
    	msg->msg_namelen = 0;
    	msg->msg_control = NULL;
    	msg->msg_controllen = 0;
    
    	msg->msg_iov = (struct iovec *)iov;
    
    	msg->msg_iovlen = nr_segs;
    	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
    	if (sock->type == SOCK_SEQPACKET)
    		msg->msg_flags |= MSG_EOR;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	return __sock_sendmsg(iocb, sock, msg, size);
    
    static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
    			  unsigned long nr_segs, loff_t pos)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    
    	if (iocb->ki_left == 0)	/* Match SYS5 behaviour */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	x = alloc_sock_iocb(iocb, &siocb);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /*
     * Atomic setting of ioctl hooks to avoid race
     * with module unload.
     */
    
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    static DEFINE_MUTEX(br_ioctl_mutex);
    static int (*br_ioctl_hook) (unsigned int cmd, void __user *arg);

    /*
     * Install (or, with a NULL @hook, clear) the bridge ioctl handler.
     * The update is made under br_ioctl_mutex, the same mutex sock_ioctl()
     * holds while calling the hook.
     */
    void brioctl_set(int (*hook) (unsigned int, void __user *))
    {
    	mutex_lock(&br_ioctl_mutex);
    	br_ioctl_hook = hook;
    	mutex_unlock(&br_ioctl_mutex);
    }

    EXPORT_SYMBOL(brioctl_set);
    
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    static DEFINE_MUTEX(vlan_ioctl_mutex);
    static int (*vlan_ioctl_hook) (void __user *arg);

    /*
     * Install (or, with a NULL @hook, clear) the VLAN ioctl handler.
     * The update is made under vlan_ioctl_mutex, the same mutex
     * sock_ioctl() holds while calling the hook.
     */
    void vlan_ioctl_set(int (*hook) (void __user *))
    {
    	mutex_lock(&vlan_ioctl_mutex);
    	vlan_ioctl_hook = hook;
    	mutex_unlock(&vlan_ioctl_mutex);
    }

    EXPORT_SYMBOL(vlan_ioctl_set);
    
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    static DEFINE_MUTEX(dlci_ioctl_mutex);
    static int (*dlci_ioctl_hook) (unsigned int, void __user *);

    /*
     * Install (or, with a NULL @hook, clear) the DLCI ioctl handler.
     * The update is made under dlci_ioctl_mutex, the same mutex
     * sock_ioctl() takes around its call to the hook.
     */
    void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
    {
    	mutex_lock(&dlci_ioctl_mutex);
    	dlci_ioctl_hook = hook;
    	mutex_unlock(&dlci_ioctl_mutex);
    }

    EXPORT_SYMBOL(dlci_ioctl_set);
    
    /*
     *	With an ioctl, arg may well be a user mode pointer, but we don't know
     *	what to do with it - that's up to the protocol still.
     */
    
    static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
    {
    	struct socket *sock;
    	void __user *argp = (void __user *)arg;
    	int pid, err;
    
    
    	sock = file->private_data;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
    		err = dev_ioctl(cmd, argp);
    	} else
    
    #ifdef CONFIG_WIRELESS_EXT
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
    		err = dev_ioctl(cmd, argp);
    	} else
    
    #endif				/* CONFIG_WIRELESS_EXT */
    		switch (cmd) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		case FIOSETOWN:
    		case SIOCSPGRP:
    			err = -EFAULT;
    			if (get_user(pid, (int __user *)argp))
    				break;
    			err = f_setown(sock->file, pid, 1);
    			break;
    		case FIOGETOWN:
    		case SIOCGPGRP:
    
    			err = put_user(f_getown(sock->file),
    
    				       (int __user *)argp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    		case SIOCGIFBR:
    		case SIOCSIFBR:
    		case SIOCBRADDBR:
    		case SIOCBRDELBR:
    			err = -ENOPKG;
    			if (!br_ioctl_hook)
    				request_module("bridge");
    
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    			mutex_lock(&br_ioctl_mutex);
    
    			if (br_ioctl_hook)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				err = br_ioctl_hook(cmd, argp);
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    			mutex_unlock(&br_ioctl_mutex);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    		case SIOCGIFVLAN:
    		case SIOCSIFVLAN:
    			err = -ENOPKG;
    			if (!vlan_ioctl_hook)
    				request_module("8021q");
    
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    			mutex_lock(&vlan_ioctl_mutex);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			if (vlan_ioctl_hook)
    				err = vlan_ioctl_hook(argp);
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    			mutex_unlock(&vlan_ioctl_mutex);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    		case SIOCADDDLCI:
    		case SIOCDELDLCI:
    			err = -ENOPKG;
    			if (!dlci_ioctl_hook)
    				request_module("dlci");
    
    			if (dlci_ioctl_hook) {
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    				mutex_lock(&dlci_ioctl_mutex);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				err = dlci_ioctl_hook(cmd, argp);
    
    Arjan van de Ven's avatar
    Arjan van de Ven committed
    				mutex_unlock(&dlci_ioctl_mutex);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			}
    			break;
    		default:
    			err = sock->ops->ioctl(sock, cmd, arg);
    
    
    			/*
    			 * If this ioctl is unknown try to hand it down
    			 * to the NIC driver.
    			 */
    			if (err == -ENOIOCTLCMD)
    				err = dev_ioctl(cmd, argp);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return err;
    }
    
    /*
     *	Allocate a bare socket of the given type.
     *
     *	Both security hooks (security_socket_create() before allocation,
     *	security_socket_post_create() after) are consulted; only sock->type
     *	is filled in here, family and protocol are just passed to the hooks.
     *
     *	*res is always written: the new socket on success, NULL on any
     *	failure.  Returns 0 on success, -ENOMEM if sock_alloc() fails, or
     *	the error returned by a security hook.
     */
    int sock_create_lite(int family, int type, int protocol, struct socket **res)
    {
    	int err;
    	struct socket *sock = NULL;

    	err = security_socket_create(family, type, protocol, 1);
    	if (err)
    		goto out;

    	sock = sock_alloc();
    	if (!sock) {
    		err = -ENOMEM;
    		goto out;
    	}

    	sock->type = type;

    	err = security_socket_post_create(sock, family, type, protocol, 1);
    	if (err)
    		goto out_release;

    out:
    	*res = sock;
    	return err;

    out_release:
    	/* Drop the half-built socket and report NULL to the caller. */
    	sock_release(sock);
    	sock = NULL;
    	goto out;
    }
    
    /*
     *	Poll a socket file by forwarding to the protocol's poll op.
     *
     *	No kernel lock held - perfect
     */
    static unsigned int sock_poll(struct file *file, poll_table *wait)
    {
    	struct socket *sock;

    	/*
    	 *      We can't return errors to poll, so it's either yes or no.
    	 */
    	sock = file->private_data;
    	return sock->ops->poll(file, sock, wait);
    }
    
    
    static int sock_mmap(struct file *file, struct vm_area_struct *vma)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct socket *sock = file->private_data;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	return sock->ops->mmap(file, sock, vma);
    }
    
    
    static int sock_close(struct inode *inode, struct file *filp)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	/*
    
    	 *      It was possible the inode is NULL we were
    	 *      closing an unfinished socket.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 */
    
    
    	if (!inode) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		printk(KERN_DEBUG "sock_close: NULL inode\n");
    		return 0;
    	}
    	sock_fasync(-1, filp, 0);
    	sock_release(SOCKET_I(inode));
    	return 0;
    }
    
    /*
     *	Update the socket async list
     *
     *	Fasync_list locking strategy.
     *
     *	1. fasync_list is modified only under process context socket lock
     *	   i.e. under semaphore.
     *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
     *	   or under socket lock.
     *	3. fasync_list can be used from softirq context, so
     *	   modifications made under the socket lock must also take
     *	   write_lock_bh(&sk->sk_callback_lock).
     *							--ANK (990710)
     */
    
    static int sock_fasync(int fd, struct file *filp, int on)
    {
    
    	struct fasync_struct *fa, *fna = NULL, **prev;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct socket *sock;
    	struct sock *sk;
    
    
    		fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
    
    		if (fna == NULL)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return -ENOMEM;
    	}
    
    
    	sock = filp->private_data;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	sk = sock->sk;
    	if (sk == NULL) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		kfree(fna);
    		return -EINVAL;
    	}
    
    	lock_sock(sk);
    
    
    	prev = &(sock->fasync_list);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev)
    		if (fa->fa_file == filp)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    
    
    	if (on) {
    		if (fa != NULL) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			write_lock_bh(&sk->sk_callback_lock);
    
    			fa->fa_fd = fd;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			write_unlock_bh(&sk->sk_callback_lock);
    
    			kfree(fna);
    			goto out;
    		}
    
    		fna->fa_file = filp;
    		fna->fa_fd = fd;
    		fna->magic = FASYNC_MAGIC;
    		fna->fa_next = sock->fasync_list;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		write_lock_bh(&sk->sk_callback_lock);