    /*
     * INET		An implementation of the TCP/IP protocol suite for the LINUX
     *		operating system.  INET is implemented using the  BSD Socket
     *		interface as the means of communication with the user level.
     *
     *		Generic socket support routines. Memory allocators, socket lock/release
     *		handler for protocols to use and generic option handler.
     *
     *
    
     * Authors:	Ross Biro
     *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     *		Florian La Roche, <flla@stud.uni-sb.de>
     *		Alan Cox, <A.Cox@swansea.ac.uk>
     *
     * Fixes:
     *		Alan Cox	: 	Numerous verify_area() problems
     *		Alan Cox	:	Connecting on a connecting socket
     *					now returns an error for tcp.
     *		Alan Cox	:	sock->protocol is set correctly.
     *					and is not sometimes left as 0.
     *		Alan Cox	:	connect handles icmp errors on a
     *					connect properly. Unfortunately there
     *					is a restart syscall nasty there. I
     *					can't match BSD without hacking the C
     *					library. Ideas urgently sought!
     *		Alan Cox	:	Disallow bind() to addresses that are
     *					not ours - especially broadcast ones!!
     *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
     *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
     *					instead they leave that for the DESTROY timer.
     *		Alan Cox	:	Clean up error flag in accept
     *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
     *					was buggy. Put a remove_sock() in the handler
     *					for memory when we hit 0. Also altered the timer
    
     *					code. The ACK stuff can wait and needs major
     *					TCP layer surgery.
     *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
     *					and fixed timer/inet_bh race.
     *		Alan Cox	:	Added zapped flag for TCP
     *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
     *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
     *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
     *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
     *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
     *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
     *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
     *	Pauline Middelink	:	identd support
     *		Alan Cox	:	Fixed connect() taking signals I think.
     *		Alan Cox	:	SO_LINGER supported
     *		Alan Cox	:	Error reporting fixes
     *		Anonymous	:	inet_create tidied up (sk->reuse setting)
     *		Alan Cox	:	inet sockets don't set sk->type!
     *		Alan Cox	:	Split socket option code
     *		Alan Cox	:	Callbacks
     *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
     *		Alex		:	Removed restriction on inet fioctl
     *		Alan Cox	:	Splitting INET from NET core
     *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
     *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
     *		Alan Cox	:	Split IP from generic code
     *		Alan Cox	:	New kfree_skbmem()
     *		Alan Cox	:	Make SO_DEBUG superuser only.
     *		Alan Cox	:	Allow anyone to clear SO_DEBUG
     *					(compatibility fix)
     *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
     *		Alan Cox	:	Allocator for a socket is settable.
     *		Alan Cox	:	SO_ERROR includes soft errors.
     *		Alan Cox	:	Allow NULL arguments on some SO_ opts
     *		Alan Cox	: 	Generic socket allocation to make hooks
     *					easier (suggested by Craig Metz).
     *		Michael Pall	:	SO_ERROR returns positive errno again
     *              Steve Whitehouse:       Added default destructor to free
     *                                      protocol private data.
     *              Steve Whitehouse:       Added various other default routines
     *                                      common to several socket families.
     *              Chris Evans     :       Call suser() check last on F_SETOWN
     *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
     *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
     *		Andi Kleen	:	Fix write_space callback
     *		Chris Evans	:	Security fixes - signedness again
     *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
     *
     * To Fix:
     *
     *
     *		This program is free software; you can redistribute it and/or
     *		modify it under the terms of the GNU General Public License
     *		as published by the Free Software Foundation; either version
     *		2 of the License, or (at your option) any later version.
     */
    
    
    #include <linux/capability.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/errno.h>
    #include <linux/types.h>
    #include <linux/socket.h>
    #include <linux/in.h>
    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>
    #include <linux/sched.h>
    #include <linux/timer.h>
    #include <linux/string.h>
    #include <linux/sockios.h>
    #include <linux/net.h>
    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/interrupt.h>
    #include <linux/poll.h>
    #include <linux/tcp.h>
    #include <linux/init.h>
    
    #include <linux/highmem.h>
    
    #include <linux/user_namespace.h>
    
    #include <linux/jump_label.h>
    
    #include <linux/memcontrol.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #include <asm/uaccess.h>
    #include <asm/system.h>
    
    #include <linux/netdevice.h>
    #include <net/protocol.h>
    #include <linux/skbuff.h>
    
    #include <net/net_namespace.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <net/sock.h>
    
    #include <linux/net_tstamp.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <net/xfrm.h>
    #include <linux/ipsec.h>
    
    #include <net/cls_cgroup.h>
    
    #include <net/netprio_cgroup.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #include <linux/filter.h>
    
    
    #include <trace/events/sock.h>
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #ifdef CONFIG_INET
    #include <net/tcp.h>
    #endif
    
    
    static DEFINE_MUTEX(proto_list_mutex);
    
    static LIST_HEAD(proto_list);
    
    #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
    /*
     * Call the init_cgroup() hook of every protocol on proto_list for @cgrp.
     *
     * The list is walked with proto_list_mutex held.  The walk stops at the
     * first hook that returns non-zero; destroy_cgroup() is then called (for
     * protocols that provide it) while continuing backwards from the failing
     * entry, and the hook's error value is returned.  Returns 0 if every
     * hook succeeded.
     */
    int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
    {
    	struct proto *proto;
    	int ret = 0;

    	mutex_lock(&proto_list_mutex);

    	list_for_each_entry(proto, &proto_list, node) {
    		if (!proto->init_cgroup)
    			continue;

    		ret = proto->init_cgroup(cgrp, ss);
    		if (ret)
    			break;
    	}

    	if (ret) {
    		/* Undo the hooks that ran before the failing one. */
    		list_for_each_entry_continue_reverse(proto, &proto_list, node) {
    			if (proto->destroy_cgroup)
    				proto->destroy_cgroup(cgrp, ss);
    		}
    	}

    	mutex_unlock(&proto_list_mutex);
    	return ret;
    }
    
    void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
    {
    	struct proto *proto;
    
    
    	mutex_lock(&proto_list_mutex);
    
    	list_for_each_entry_reverse(proto, &proto_list, node)
    		if (proto->destroy_cgroup)
    			proto->destroy_cgroup(cgrp, ss);
    
    	mutex_unlock(&proto_list_mutex);
    
    /*
     * Each address family might have different locking rules, so we have
     * one slock key per address family:
     */
    
    static struct lock_class_key af_family_keys[AF_MAX];
    static struct lock_class_key af_family_slock_keys[AF_MAX];
    
    
    struct jump_label_key memcg_socket_limit_enabled;
    EXPORT_SYMBOL(memcg_socket_limit_enabled);
    
    
    /*
     * Make lock validator output more readable. (we pre-construct these
     * strings build-time, so that runtime initialization of socket
     * locks is fast):
     */
    
    /* One "sk_lock-<family>" name per address family, indexed by AF_* value. */
    static const char *const af_family_key_strings[AF_MAX+1] = {
      "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
      "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
      "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
      "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
      "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
      "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
      "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
      "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
      "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
      "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
      /* IUCV now carries the AF_ prefix like the slock-/clock- tables. */
      "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
      "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
      "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
      "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
    };
    
    /* One "slock-<family>" name per address family, indexed by AF_* value. */
    static const char *const af_family_slock_key_strings[AF_MAX+1] = {
      "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
      "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
      "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
      "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
      "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
      "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
      "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
      "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
      "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
      "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
      "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
      "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
      "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
      "slock-AF_NFC"   , "slock-AF_MAX"
    };
    
    /* One "clock-<family>" name per address family, indexed by AF_* value. */
    static const char *const af_family_clock_key_strings[AF_MAX+1] = {
      "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
      "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
      "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
      "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
      "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
      "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
      "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
      "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
      "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
      "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
      "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
      "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
      "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
      "clock-AF_NFC"   , "clock-AF_MAX"
    };
    
    
    /*
     * sk_callback_lock locking rules are per-address-family,
     * so split the lock classes by using a per-AF key:
     */
    static struct lock_class_key af_callback_keys[AF_MAX];
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* Take into consideration the size of the struct sk_buff overhead in the
     * determination of these values, since that is non-constant across
     * platforms.  This makes socket queueing behavior and performance
     * not depend upon such differences.
     */
    #define _SK_MEM_PACKETS		256
    
    #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
    #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
    
    /* Run time adjustable parameters. */
    
    __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
    __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
    __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
    __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    Lucas De Marchi's avatar
    Lucas De Marchi committed
    /* Maximal space eaten by iovec or ancillary data plus some space */
    
    int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
    
    EXPORT_SYMBOL(sysctl_optmem_max);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    #if defined(CONFIG_CGROUPS)
    #if !defined(CONFIG_NET_CLS_CGROUP)
    
    int net_cls_subsys_id = -1;
    EXPORT_SYMBOL_GPL(net_cls_subsys_id);
    #endif
    
    #if !defined(CONFIG_NETPRIO_CGROUP)
    int net_prio_subsys_id = -1;
    EXPORT_SYMBOL_GPL(net_prio_subsys_id);
    #endif
    #endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * Convert a user-supplied struct timeval into a timeout in jiffies.
     *
     * @timeo_p: where the resulting timeout is stored
     * @optval:  user pointer to a struct timeval
     * @optlen:  length of the user buffer
     *
     * A zero timeval, or a tv_sec too large to express in jiffies, stores
     * MAX_SCHEDULE_TIMEOUT.  A negative tv_sec stores 0 and is logged (at
     * most ten times, rate limited).  Microseconds are rounded up to whole
     * jiffies.
     *
     * Returns 0 on success, -EINVAL if @optlen is too short, -EFAULT if the
     * user copy fails, or -EDOM if tv_usec is outside [0, USEC_PER_SEC).
     */
    static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
    {
    	struct timeval tv;

    	if (optlen < sizeof(tv))
    		return -EINVAL;
    	if (copy_from_user(&tv, optval, sizeof(tv)))
    		return -EFAULT;
    	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
    		return -EDOM;

    	if (tv.tv_sec < 0) {
    		static int warned __read_mostly;

    		/* Never let a negative value reach the jiffies conversion. */
    		*timeo_p = 0;
    		if (warned < 10 && net_ratelimit()) {
    			warned++;
    			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
    			       "tries to set negative timeout\n",
    				current->comm, task_pid_nr(current));
    		}
    		return 0;
    	}

    	*timeo_p = MAX_SCHEDULE_TIMEOUT;
    	if (tv.tv_sec == 0 && tv.tv_usec == 0)
    		return 0;
    	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
    		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
    	return 0;
    }
    
    static void sock_warn_obsolete_bsdism(const char *name)
    {
    	static int warned;
    	static char warncomm[TASK_COMM_LEN];
    
    	if (strcmp(warncomm, current->comm) && warned < 5) {
    		strcpy(warncomm,  current->comm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		printk(KERN_WARNING "process `%s' is using obsolete "
    		       "%s SO_BSDCOMPAT\n", warncomm, name);
    		warned++;
    	}
    }
    
    
    #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
    
    static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
    
    	if (sk->sk_flags & flags) {
    		sk->sk_flags &= ~flags;
    		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
    
    			net_disable_timestamp();
    
    int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
    {
    
    	int err;
    
    	unsigned long flags;
    	struct sk_buff_head *list = &sk->sk_receive_queue;
    
    Eric Dumazet's avatar
    Eric Dumazet committed
    	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
    
    		atomic_inc(&sk->sk_drops);
    
    		trace_sock_rcvqueue_full(sk, skb);
    
    		return -ENOMEM;
    
    	err = sk_filter(sk, skb);
    
    		return err;
    
    	if (!sk_rmem_schedule(sk, skb->truesize)) {
    
    		atomic_inc(&sk->sk_drops);
    		return -ENOBUFS;
    
    	skb->dev = NULL;
    	skb_set_owner_r(skb, sk);
    
    	/* Cache the SKB length before we tack it onto the receive
    	 * queue.  Once it is added it no longer belongs to us and
    	 * may be freed by other threads of control pulling packets
    	 * from the queue.
    	 */
    	skb_len = skb->len;
    
    
    	/* we escape from rcu protected region, make sure we dont leak
    	 * a norefcounted dst
    	 */
    	skb_dst_force(skb);
    
    
    	spin_lock_irqsave(&list->lock, flags);
    	skb->dropcount = atomic_read(&sk->sk_drops);
    	__skb_queue_tail(list, skb);
    	spin_unlock_irqrestore(&list->lock, flags);
    
    
    	if (!sock_flag(sk, SOCK_DEAD))
    		sk->sk_data_ready(sk, skb_len);
    
    	return 0;
    
    }
    EXPORT_SYMBOL(sock_queue_rcv_skb);
    
    
    /*
     * Deliver @skb to @sk, either directly through sk_backlog_rcv() or, when
     * the socket is owned by a user context, via sk_add_backlog().
     *
     * @nested selects bh_lock_sock_nested() instead of bh_lock_sock().
     *
     * The skb is freed and sk_drops incremented when the receive queues are
     * full or the backlog rejects the buffer; the skb is also freed when
     * sk_filter() rejects it.  sock_put() is called on @sk on every path.
     *
     * Returns the result of sk_backlog_rcv() when that is called, otherwise
     * NET_RX_SUCCESS.
     */
    int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
    {
    	int rc = NET_RX_SUCCESS;

    	if (sk_filter(sk, skb))
    		goto discard_and_relse;

    	skb->dev = NULL;

    	if (sk_rcvqueues_full(sk, skb)) {
    		atomic_inc(&sk->sk_drops);
    		goto discard_and_relse;
    	}

    	if (nested)
    		bh_lock_sock_nested(sk);
    	else
    		bh_lock_sock(sk);

    	if (!sock_owned_by_user(sk)) {
    		/*
    		 * trylock + unlock semantics:
    		 */
    		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

    		rc = sk_backlog_rcv(sk, skb);

    		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
    	} else if (sk_add_backlog(sk, skb)) {
    		bh_unlock_sock(sk);
    		atomic_inc(&sk->sk_drops);
    		goto discard_and_relse;
    	}

    	bh_unlock_sock(sk);
    out:
    	sock_put(sk);
    	return rc;
    discard_and_relse:
    	kfree_skb(skb);
    	goto out;
    }
    EXPORT_SYMBOL(sk_receive_skb);
    
    
    /* Exported wrapper that calls sk_tx_queue_clear() on @sk. */
    void sk_reset_txq(struct sock *sk)
    {
    	sk_tx_queue_clear(sk);

    	return;
    }
    EXPORT_SYMBOL(sk_reset_txq);
    
    
    /*
     * Return the dst obtained from __sk_dst_get(), or NULL if there is none
     * or it failed validation.
     *
     * A dst marked obsolete is passed to its ops->check() with @cookie.  If
     * that returns NULL the tx queue mapping is cleared, sk->sk_dst_cache is
     * set to NULL, the dst is released and NULL is returned.
     */
    struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
    {
    	struct dst_entry *cached = __sk_dst_get(sk);

    	if (!cached || !cached->obsolete)
    		return cached;

    	if (cached->ops->check(cached, cookie) != NULL)
    		return cached;

    	sk_tx_queue_clear(sk);
    	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
    	dst_release(cached);

    	return NULL;
    }
    EXPORT_SYMBOL(__sk_dst_check);
    
    /*
     * Return the dst obtained from sk_dst_get(), or NULL if there is none or
     * it failed validation.
     *
     * A dst marked obsolete is passed to its ops->check() with @cookie.  If
     * that returns NULL, sk_dst_reset() is called, the dst is released and
     * NULL is returned.
     */
    struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
    {
    	struct dst_entry *cached = sk_dst_get(sk);

    	if (!cached || !cached->obsolete)
    		return cached;

    	if (cached->ops->check(cached, cookie) != NULL)
    		return cached;

    	sk_dst_reset(sk);
    	dst_release(cached);

    	return NULL;
    }
    EXPORT_SYMBOL(sk_dst_check);
    
    
    /*
     * Handle SO_BINDTODEVICE: bind @sk to the network device named in the
     * user buffer, or remove the binding when the name is empty or @optlen
     * is zero.
     *
     * Returns 0 on success, -EPERM without CAP_NET_RAW, -EINVAL for a
     * negative @optlen, -EFAULT if the user copy fails, -ENODEV if no device
     * has that name, and -ENOPROTOOPT when CONFIG_NETDEVICES is not set.
     */
    static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
    {
    #ifdef CONFIG_NETDEVICES
    	struct net *net = sock_net(sk);
    	char devname[IFNAMSIZ];
    	int index = 0;

    	/* Sorry... */
    	if (!capable(CAP_NET_RAW))
    		return -EPERM;

    	if (optlen < 0)
    		return -EINVAL;

    	/* Copy at most IFNAMSIZ - 1 bytes into a zeroed buffer so the
    	 * device name is always NUL terminated.
    	 */
    	if (optlen > IFNAMSIZ - 1)
    		optlen = IFNAMSIZ - 1;
    	memset(devname, 0, sizeof(devname));

    	if (copy_from_user(devname, optval, optlen))
    		return -EFAULT;

    	if (devname[0] != '\0') {
    		struct net_device *dev;

    		/* Only the ifindex is read, inside the RCU section. */
    		rcu_read_lock();
    		dev = dev_get_by_name_rcu(net, devname);
    		if (dev)
    			index = dev->ifindex;
    		rcu_read_unlock();

    		if (!dev)
    			return -ENODEV;
    	}

    	lock_sock(sk);
    	sk->sk_bound_dev_if = index;
    	sk_dst_reset(sk);
    	release_sock(sk);

    	return 0;
    #else
    	return -ENOPROTOOPT;
    #endif
    }
    
    
    /* Set flag @bit on @sk when @valbool is non-zero, otherwise clear it. */
    static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
    {
    	if (!valbool) {
    		sock_reset_flag(sk, bit);
    		return;
    	}

    	sock_set_flag(sk, bit);
    }
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     *	This is meant for all protocols to use and covers goings on
     *	at the socket level. Everything here is generic.
     */
    
    /*
     * Generic SOL_SOCKET setsockopt() handler.
     *
     * @sock:    socket being configured
     * @level:   option level (not examined here)
     * @optname: SO_* option to set
     * @optval:  user pointer to the option value
     * @optlen:  length of the user buffer
     *
     * SO_BINDTODEVICE is handed to sock_bindtodevice() before anything is
     * read.  Every other option needs at least sizeof(int) bytes; the leading
     * int is fetched into val, and the switch runs with the socket locked.
     *
     * Returns 0 on success or a negative errno: -EINVAL (buffer too short or
     * bad value), -EFAULT (user copy failed), -EPERM / -EACCES (missing
     * capability), -ENOPROTOOPT (unknown or read-only option), -EOPNOTSUPP
     * (SO_PEEK_OFF without a set_peek_off handler), or whatever the helper
     * called for the option returns.
     */
    int sock_setsockopt(struct socket *sock, int level, int optname,
    		    char __user *optval, unsigned int optlen)
    {
    	struct sock *sk = sock->sk;
    	int val;
    	int valbool;
    	struct linger ling;
    	int ret = 0;

    	/*
    	 *	Options without arguments
    	 */

    	if (optname == SO_BINDTODEVICE)
    		return sock_bindtodevice(sk, optval, optlen);

    	if (optlen < sizeof(int))
    		return -EINVAL;

    	if (get_user(val, (int __user *)optval))
    		return -EFAULT;

    	valbool = val ? 1 : 0;

    	lock_sock(sk);

    	switch (optname) {
    	case SO_DEBUG:
    		/* Setting needs CAP_NET_ADMIN; anyone may clear it. */
    		if (val && !capable(CAP_NET_ADMIN))
    			ret = -EACCES;
    		else
    			sock_valbool_flag(sk, SOCK_DBG, valbool);
    		break;
    	case SO_REUSEADDR:
    		sk->sk_reuse = valbool;
    		break;
    	case SO_TYPE:
    	case SO_PROTOCOL:
    	case SO_DOMAIN:
    	case SO_ERROR:
    		ret = -ENOPROTOOPT;
    		break;
    	case SO_DONTROUTE:
    		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
    		break;
    	case SO_BROADCAST:
    		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
    		break;
    	case SO_SNDBUF:
    		/* Don't error on this BSD doesn't and if you think
    		   about it this is right. Otherwise apps have to
    		   play 'guess the biggest size' games. RCVBUF/SNDBUF
    		   are treated in BSD as hints */

    		if (val > sysctl_wmem_max)
    			val = sysctl_wmem_max;
    set_sndbuf:
    		/* Keep val * 2 representable in an int: the FORCE path
    		 * arrives here without the sysctl clamp above.
    		 */
    		if (val < 0)
    			val = 0;
    		else if (val > INT_MAX / 2)
    			val = INT_MAX / 2;

    		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
    		if ((val * 2) < SOCK_MIN_SNDBUF)
    			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
    		else
    			sk->sk_sndbuf = val * 2;

    		/*
    		 *	Wake up sending tasks if we
    		 *	upped the value.
    		 */
    		sk->sk_write_space(sk);
    		break;

    	case SO_SNDBUFFORCE:
    		if (!capable(CAP_NET_ADMIN)) {
    			ret = -EPERM;
    			break;
    		}
    		goto set_sndbuf;

    	case SO_RCVBUF:
    		/* Don't error on this BSD doesn't and if you think
    		   about it this is right. Otherwise apps have to
    		   play 'guess the biggest size' games. RCVBUF/SNDBUF
    		   are treated in BSD as hints */

    		if (val > sysctl_rmem_max)
    			val = sysctl_rmem_max;
    set_rcvbuf:
    		/* Keep val * 2 representable in an int: the FORCE path
    		 * arrives here without the sysctl clamp above.
    		 */
    		if (val < 0)
    			val = 0;
    		else if (val > INT_MAX / 2)
    			val = INT_MAX / 2;

    		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
    		/*
    		 * We double it on the way in to account for
    		 * "struct sk_buff" etc. overhead.   Applications
    		 * assume that the SO_RCVBUF setting they make will
    		 * allow that much actual data to be received on that
    		 * socket.
    		 *
    		 * Applications are unaware that "struct sk_buff" and
    		 * other overheads allocate from the receive buffer
    		 * during socket buffer allocation.
    		 *
    		 * And after considering the possible alternatives,
    		 * returning the value we actually used in getsockopt
    		 * is the most desirable behavior.
    		 */
    		if ((val * 2) < SOCK_MIN_RCVBUF)
    			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
    		else
    			sk->sk_rcvbuf = val * 2;
    		break;

    	case SO_RCVBUFFORCE:
    		if (!capable(CAP_NET_ADMIN)) {
    			ret = -EPERM;
    			break;
    		}
    		goto set_rcvbuf;

    	case SO_KEEPALIVE:
    #ifdef CONFIG_INET
    		if (sk->sk_protocol == IPPROTO_TCP)
    			tcp_set_keepalive(sk, valbool);
    #endif
    		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
    		break;

    	case SO_OOBINLINE:
    		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
    		break;

    	case SO_NO_CHECK:
    		sk->sk_no_check = valbool;
    		break;

    	case SO_PRIORITY:
    		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
    			sk->sk_priority = val;
    		else
    			ret = -EPERM;
    		break;

    	case SO_LINGER:
    		if (optlen < sizeof(ling)) {
    			ret = -EINVAL;	/* 1003.1g */
    			break;
    		}
    		if (copy_from_user(&ling, optval, sizeof(ling))) {
    			ret = -EFAULT;
    			break;
    		}
    		if (!ling.l_onoff)
    			sock_reset_flag(sk, SOCK_LINGER);
    		else {
    #if (BITS_PER_LONG == 32)
    			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
    				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
    			else
    #endif
    				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
    			sock_set_flag(sk, SOCK_LINGER);
    		}
    		break;

    	case SO_BSDCOMPAT:
    		sock_warn_obsolete_bsdism("setsockopt");
    		break;

    	case SO_PASSCRED:
    		if (valbool)
    			set_bit(SOCK_PASSCRED, &sock->flags);
    		else
    			clear_bit(SOCK_PASSCRED, &sock->flags);
    		break;

    	case SO_TIMESTAMP:
    	case SO_TIMESTAMPNS:
    		if (valbool)  {
    			if (optname == SO_TIMESTAMP)
    				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
    			else
    				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
    			sock_set_flag(sk, SOCK_RCVTSTAMP);
    			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
    		} else {
    			sock_reset_flag(sk, SOCK_RCVTSTAMP);
    			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
    		}
    		break;

    	case SO_TIMESTAMPING:
    		if (val & ~SOF_TIMESTAMPING_MASK) {
    			ret = -EINVAL;
    			break;
    		}
    		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
    				  val & SOF_TIMESTAMPING_TX_HARDWARE);
    		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
    				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
    		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
    				  val & SOF_TIMESTAMPING_RX_HARDWARE);
    		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
    			sock_enable_timestamp(sk,
    					      SOCK_TIMESTAMPING_RX_SOFTWARE);
    		else
    			sock_disable_timestamp(sk,
    					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
    		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
    				  val & SOF_TIMESTAMPING_SOFTWARE);
    		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
    				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
    		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
    				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
    		break;

    	case SO_RCVLOWAT:
    		if (val < 0)
    			val = INT_MAX;
    		sk->sk_rcvlowat = val ? : 1;
    		break;

    	case SO_RCVTIMEO:
    		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
    		break;

    	case SO_SNDTIMEO:
    		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
    		break;

    	case SO_ATTACH_FILTER:
    		ret = -EINVAL;
    		if (optlen == sizeof(struct sock_fprog)) {
    			struct sock_fprog fprog;

    			ret = -EFAULT;
    			if (copy_from_user(&fprog, optval, sizeof(fprog)))
    				break;

    			ret = sk_attach_filter(&fprog, sk);
    		}
    		break;

    	case SO_DETACH_FILTER:
    		ret = sk_detach_filter(sk);
    		break;

    	case SO_PASSSEC:
    		if (valbool)
    			set_bit(SOCK_PASSSEC, &sock->flags);
    		else
    			clear_bit(SOCK_PASSSEC, &sock->flags);
    		break;

    	case SO_MARK:
    		if (!capable(CAP_NET_ADMIN))
    			ret = -EPERM;
    		else
    			sk->sk_mark = val;
    		break;

    		/* We implement the SO_SNDLOWAT etc to
    		   not be settable (1003.1g 5.3) */
    	case SO_RXQ_OVFL:
    		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
    		break;

    	case SO_WIFI_STATUS:
    		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
    		break;

    	case SO_PEEK_OFF:
    		if (sock->ops->set_peek_off)
    			sock->ops->set_peek_off(sk, val);
    		else
    			ret = -EOPNOTSUPP;
    		break;

    	default:
    		ret = -ENOPROTOOPT;
    		break;
    	}

    	release_sock(sk);
    	return ret;
    }
    EXPORT_SYMBOL(sock_setsockopt);
    
    /*
     * Fill @ucred from @pid and @cred.
     *
     * ucred->pid is set from pid_vnr(@pid).  With a NULL @cred, uid and gid
     * are both set to -1; otherwise they are the effective uid/gid of @cred
     * mapped through the current user namespace.
     */
    void cred_to_ucred(struct pid *pid, const struct cred *cred,
    		   struct ucred *ucred)
    {
    	struct user_namespace *ns;

    	ucred->pid = pid_vnr(pid);

    	if (!cred) {
    		ucred->gid = -1;
    		ucred->uid = ucred->gid;
    		return;
    	}

    	ns = current_user_ns();
    	ucred->uid = user_ns_map_uid(ns, cred, cred->euid);
    	ucred->gid = user_ns_map_gid(ns, cred, cred->egid);
    }
    EXPORT_SYMBOL_GPL(cred_to_ucred);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    int sock_getsockopt(struct socket *sock, int level, int optname,
    		    char __user *optval, int __user *optlen)
    {
    	struct sock *sk = sock->sk;
    
    		int val;
    		struct linger ling;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		struct timeval tm;
    	} v;
    
    	int lv = sizeof(int);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	int len;
    
    	if (get_user(len, optlen))
    
    		return -EFAULT;
    
    	if (len < 0)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -EINVAL;
    
    	memset(&v, 0, sizeof(v));
    
    	switch (optname) {
    
    	case SO_DEBUG:
    		v.val = sock_flag(sk, SOCK_DBG);
    		break;
    
    	case SO_DONTROUTE:
    		v.val = sock_flag(sk, SOCK_LOCALROUTE);
    		break;
    
    	case SO_BROADCAST:
    		v.val = !!sock_flag(sk, SOCK_BROADCAST);
    		break;
    
    	case SO_SNDBUF:
    		v.val = sk->sk_sndbuf;
    		break;
    
    	case SO_RCVBUF:
    		v.val = sk->sk_rcvbuf;
    		break;
    
    	case SO_REUSEADDR:
    		v.val = sk->sk_reuse;
    		break;
    
    	case SO_KEEPALIVE:
    		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
    		break;
    
    	case SO_TYPE:
    		v.val = sk->sk_type;
    		break;
    
    
    	case SO_PROTOCOL:
    		v.val = sk->sk_protocol;
    		break;
    
    
    	case SO_DOMAIN:
    		v.val = sk->sk_family;
    		break;
    
    
    	case SO_ERROR:
    		v.val = -sock_error(sk);
    
    		if (v.val == 0)
    
    			v.val = xchg(&sk->sk_err_soft, 0);
    		break;
    
    	case SO_OOBINLINE:
    		v.val = !!sock_flag(sk, SOCK_URGINLINE);
    		break;
    
    	case SO_NO_CHECK:
    		v.val = sk->sk_no_check;
    		break;
    
    	case SO_PRIORITY:
    		v.val = sk->sk_priority;
    		break;
    
    	case SO_LINGER:
    		lv		= sizeof(v.ling);
    		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
    		v.ling.l_linger	= sk->sk_lingertime / HZ;
    		break;
    
    	case SO_BSDCOMPAT:
    		sock_warn_obsolete_bsdism("getsockopt");
    		break;
    
    	case SO_TIMESTAMP:
    
    		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
    				!sock_flag(sk, SOCK_RCVTSTAMPNS);
    		break;
    
    	case SO_TIMESTAMPNS:
    		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
    
    	case SO_TIMESTAMPING:
    		v.val = 0;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
    			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
    			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
    			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
    			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
    			v.val |= SOF_TIMESTAMPING_SOFTWARE;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
    			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
    		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
    			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
    		break;
    
    
    	case SO_RCVTIMEO:
    
    		lv = sizeof(struct timeval);
    
    		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
    			v.tm.tv_sec = 0;
    			v.tm.tv_usec = 0;
    		} else {
    			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
    			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
    		}
    		break;
    
    	case SO_SNDTIMEO:
    
    		lv = sizeof(struct timeval);
    
    		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
    			v.tm.tv_sec = 0;
    			v.tm.tv_usec = 0;
    		} else {
    			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
    			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
    		}
    		break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case SO_RCVLOWAT:
    		v.val = sk->sk_rcvlowat;
    		break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case SO_SNDLOWAT:
    
    		v.val = 1;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case SO_PASSCRED:
    		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
    		break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case SO_PEERCRED:
    
    	{
    		struct ucred peercred;
    		if (len > sizeof(peercred))
    			len = sizeof(peercred);
    		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
    		if (copy_to_user(optval, &peercred, len))
    
    			return -EFAULT;
    		goto lenout;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	case SO_PEERNAME:
    	{
    		char address[128];
    
    		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
    			return -ENOTCONN;
    		if (lv < len)
    			return -EINVAL;
    		if (copy_to_user(optval, address, len))
    			return -EFAULT;
    		goto lenout;
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed