Skip to content
Snippets Groups Projects
ipmr.c 45 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     *	IP multicast routing support for mrouted 3.6/3.8
     *
    
     *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *	  Linux Consultancy and Custom Driver Development
     *
     *	This program is free software; you can redistribute it and/or
     *	modify it under the terms of the GNU General Public License
     *	as published by the Free Software Foundation; either version
     *	2 of the License, or (at your option) any later version.
     *
     *	Fixes:
     *	Michael Chastain	:	Incorrect size of copying.
     *	Alan Cox		:	Added the cache manager code
     *	Alan Cox		:	Fixed the clone/copy bug and device race.
     *	Mike McLagan		:	Routing by source
     *	Malcolm Beattie		:	Buffer handling fixes.
     *	Alexey Kuznetsov	:	Double buffer free and other fixes.
     *	SVR Anand		:	Fixed several multicast bugs and problems.
     *	Alexey Kuznetsov	:	Status, optimisations and more.
     *	Brad Parker		:	Better behaviour on mrouted upcall
     *					overflow.
     *      Carlos Picoto           :       PIMv1 Support
     *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
     *					Relax this requirement to work with older peers.
     *
     */
    
    #include <asm/system.h>
    #include <asm/uaccess.h>
    #include <linux/types.h>
    
    #include <linux/capability.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/errno.h>
    #include <linux/timer.h>
    #include <linux/mm.h>
    #include <linux/kernel.h>
    #include <linux/fcntl.h>
    #include <linux/stat.h>
    #include <linux/socket.h>
    #include <linux/in.h>
    #include <linux/inet.h>
    #include <linux/netdevice.h>
    #include <linux/inetdevice.h>
    #include <linux/igmp.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>
    #include <linux/mroute.h>
    #include <linux/init.h>
    
    #include <linux/if_ether.h>
    
    #include <net/net_namespace.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <net/ip.h>
    #include <net/protocol.h>
    #include <linux/skbuff.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <net/sock.h>
    #include <net/icmp.h>
    #include <net/udp.h>
    #include <net/raw.h>
    #include <linux/notifier.h>
    #include <linux/if_arp.h>
    #include <linux/netfilter_ipv4.h>
    #include <net/ipip.h>
    #include <net/checksum.h>
    
    #include <net/netlink.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
    #define CONFIG_IP_PIMSM	1
    #endif
    
    /* Big lock, protecting vif table, mrt cache and mroute socket state.
       Note that the changes are semaphored via rtnl_lock.
     */
    
    static DEFINE_RWLOCK(mrt_lock);
    
    /*
     *	Multicast router control variables
     */
    
    
    #define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
    
    /* Special spinlock for queue of unresolved entries */
    static DEFINE_SPINLOCK(mfc_unres_lock);
    
    /* We return to Alan's original scheme. The hash table of resolved
       entries is changed only in process context and is protected
       by the weak lock mrt_lock. The queue of unresolved entries is
       protected by the strong spinlock mfc_unres_lock.

       This way the data path takes no exclusive locks at all.
     */
    
    
    static struct kmem_cache *mrt_cachep __read_mostly;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
    static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
    static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
    
    #ifdef CONFIG_IP_PIMSM_V2
    static struct net_protocol pim_protocol;
    #endif
    
    static struct timer_list ipmr_expire_timer;
    
    /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
    
    
    /*
     *	Tear down the DVMRP tunnel device that backs a tunnel vif.
     *
     *	@dev: the tunnel device; it is closed first.
     *	@v:   the vif control block the tunnel was built from; its local and
     *	      remote addresses and vif index identify the tunnel.
     *
     *	After closing @dev, the "tunl0" device is looked up in init_net and,
     *	if it exists and provides ndo_do_ioctl, asked to delete the tunnel
     *	named "dvmrp<vifi>" via SIOCDELTUNNEL. The ioctl result is ignored.
     */
    static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
    {
    	dev_close(dev);

    	dev = __dev_get_by_name(&init_net, "tunl0");
    	if (dev) {
    		const struct net_device_ops *ops = dev->netdev_ops;
    		struct ifreq ifr;
    		struct ip_tunnel_parm p;

    		memset(&p, 0, sizeof(p));
    		p.iph.daddr = v->vifc_rmt_addr.s_addr;
    		p.iph.saddr = v->vifc_lcl_addr.s_addr;
    		p.iph.version = 4;
    		p.iph.ihl = 5;
    		p.iph.protocol = IPPROTO_IPIP;
    		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
    		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

    		if (ops->ndo_do_ioctl) {
    			/*
    			 * p is a kernel object passed through a __user
    			 * pointer, so switch to KERNEL_DS around the call.
    			 */
    			mm_segment_t oldfs = get_fs();

    			set_fs(KERNEL_DS);
    			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
    			set_fs(oldfs);
    		}
    	}
    }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static
    struct net_device *ipmr_new_tunnel(struct vifctl *v)
    {
    	struct net_device  *dev;
    
    
    	dev = __dev_get_by_name(&init_net, "tunl0");
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (dev) {
    
    		const struct net_device_ops *ops = dev->netdev_ops;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		int err;
    		struct ifreq ifr;
    		struct ip_tunnel_parm p;
    		struct in_device  *in_dev;
    
    		memset(&p, 0, sizeof(p));
    		p.iph.daddr = v->vifc_rmt_addr.s_addr;
    		p.iph.saddr = v->vifc_lcl_addr.s_addr;
    		p.iph.version = 4;
    		p.iph.ihl = 5;
    		p.iph.protocol = IPPROTO_IPIP;
    		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
    
    		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    		if (ops->ndo_do_ioctl) {
    			mm_segment_t oldfs = get_fs();
    
    			set_fs(KERNEL_DS);
    			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
    			set_fs(oldfs);
    		} else
    			err = -EOPNOTSUPP;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		dev = NULL;
    
    
    		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			dev->flags |= IFF_MULTICAST;
    
    
    			in_dev = __in_dev_get_rtnl(dev);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				goto failure;
    
    
    			ipv4_devconf_setall(in_dev);
    			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    			if (dev_open(dev))
    				goto failure;
    
    			dev_hold(dev);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    	}
    	return dev;
    
    failure:
    	/* allow the register to be completed before unregistering. */
    	rtnl_unlock();
    	rtnl_lock();
    
    	unregister_netdevice(dev);
    	return NULL;
    }
    
    #ifdef CONFIG_IP_PIMSM
    
    static int reg_vif_num = -1;
    
    static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
    {
    	read_lock(&mrt_lock);
    
    	dev->stats.tx_bytes += skb->len;
    	dev->stats.tx_packets++;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
    	read_unlock(&mrt_lock);
    	kfree_skb(skb);
    	return 0;
    }
    
    
    /* Device operations of the PIM register vif: only a transmit hook is set. */
    static const struct net_device_ops reg_vif_netdev_ops = {
    	.ndo_start_xmit	= reg_vif_xmit,
    };
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static void reg_vif_setup(struct net_device *dev)
    {
    	dev->type		= ARPHRD_PIMREG;
    
    	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	dev->flags		= IFF_NOARP;
    
    	dev->netdev_ops		= &reg_vif_netdev_ops,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	dev->destructor		= free_netdev;
    }
    
    /*
     *	Allocate, register and open the "pimreg" register device.
     *
     *	On success the device is returned with a reference taken by
     *	dev_hold(). Returns NULL if allocation or registration fails, if the
     *	device has no inet device, or if it cannot be opened; in the last two
     *	cases the already registered device is unregistered again.
     */
    static struct net_device *ipmr_reg_vif(void)
    {
    	struct net_device *dev;
    	struct in_device *in_dev;

    	dev = alloc_netdev(0, "pimreg", reg_vif_setup);

    	if (dev == NULL)
    		return NULL;

    	if (register_netdevice(dev)) {
    		free_netdev(dev);
    		return NULL;
    	}
    	dev->iflink = 0;

    	rcu_read_lock();
    	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
    		rcu_read_unlock();
    		goto failure;
    	}

    	ipv4_devconf_setall(in_dev);
    	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
    	rcu_read_unlock();

    	if (dev_open(dev))
    		goto failure;

    	dev_hold(dev);

    	return dev;

    failure:
    	/* allow the register to be completed before unregistering. */
    	rtnl_unlock();
    	rtnl_lock();

    	unregister_netdevice(dev);
    	return NULL;
    }
    #endif
    
    /*
     *	Delete a VIF entry
    
     *	@notify: Set to 1, if the caller is a notifier_call
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     */
    
    static int vif_delete(int vifi, int notify)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct vif_device *v;
    	struct net_device *dev;
    	struct in_device *in_dev;
    
    
    	if (vifi < 0 || vifi >= init_net.ipv4.maxvif)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -EADDRNOTAVAIL;
    
    
    	v = &init_net.ipv4.vif_table[vifi];
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	write_lock_bh(&mrt_lock);
    	dev = v->dev;
    	v->dev = NULL;
    
    	if (!dev) {
    		write_unlock_bh(&mrt_lock);
    		return -EADDRNOTAVAIL;
    	}
    
    #ifdef CONFIG_IP_PIMSM
    	if (vifi == reg_vif_num)
    		reg_vif_num = -1;
    #endif
    
    
    	if (vifi+1 == init_net.ipv4.maxvif) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		int tmp;
    		for (tmp=vifi-1; tmp>=0; tmp--) {
    
    			if (VIF_EXISTS(&init_net, tmp))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				break;
    		}
    
    		init_net.ipv4.maxvif = tmp+1;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	write_unlock_bh(&mrt_lock);
    
    	dev_set_allmulti(dev, -1);
    
    
    	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
    
    		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		ip_rt_multicast_event(in_dev);
    	}
    
    
    	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		unregister_netdevice(dev);
    
    	dev_put(dev);
    	return 0;
    }
    
    
    /*
     *	Release a multicast forwarding cache entry: drop the namespace
     *	reference recorded in it, then return it to the mrt_cachep slab.
     */
    static inline void ipmr_cache_free(struct mfc_cache *c)
    {
    	struct net *owner = mfc_net(c);

    	release_net(owner);
    	kmem_cache_free(mrt_cachep, c);
    }
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* Destroy an unresolved cache entry, killing queued skbs
       and reporting error to netlink readers.
     */
    
    static void ipmr_destroy_unres(struct mfc_cache *c)
    {
    	struct sk_buff *skb;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	atomic_dec(&init_net.ipv4.cache_resolve_queue_len);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
    
    		if (ip_hdr(skb)->version == 0) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
    			nlh->nlmsg_type = NLMSG_ERROR;
    			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
    			skb_trim(skb, nlh->nlmsg_len);
    
    			e = NLMSG_DATA(nlh);
    			e->error = -ETIMEDOUT;
    			memset(&e->msg, 0, sizeof(e->msg));
    
    			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		} else
    			kfree_skb(skb);
    	}
    
    
    	ipmr_cache_free(c);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    
    /* Single timer process for all the unresolved queue. */
    
    /*
     *	Timer handler of ipmr_expire_timer: expire unresolved cache entries.
     *
     *	@dummy: timer argument, unused.
     *
     *	If mfc_unres_lock is contended the run is postponed by HZ/10.
     *	Otherwise every queued entry whose expiry time has passed is unlinked
     *	and destroyed, and the timer is re-armed for the nearest remaining
     *	expiry (at most 10*HZ ahead) while entries are still queued.
     */
    static void ipmr_expire_process(unsigned long dummy)
    {
    	unsigned long now;
    	unsigned long expires;
    	struct mfc_cache *c, **cp;

    	if (!spin_trylock(&mfc_unres_lock)) {
    		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
    		return;
    	}

    	/* Nothing queued: leave the timer stopped. */
    	if (mfc_unres_queue == NULL)
    		goto out;

    	now = jiffies;
    	expires = 10*HZ;
    	cp = &mfc_unres_queue;

    	while ((c=*cp) != NULL) {
    		if (time_after(c->mfc_un.unres.expires, now)) {
    			/* Still alive: remember the shortest remaining lifetime. */
    			unsigned long interval = c->mfc_un.unres.expires - now;
    			if (interval < expires)
    				expires = interval;
    			cp = &c->next;
    			continue;
    		}

    		*cp = c->next;

    		ipmr_destroy_unres(c);
    	}

    	/* Re-arm only while there is still something left to expire. */
    	if (mfc_unres_queue != NULL)
    		mod_timer(&ipmr_expire_timer, jiffies + expires);

    out:
    	spin_unlock(&mfc_unres_lock);
    }
    
    /* Fill oifs list. It is called under write locked mrt_lock. */
    
    
    static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	int vifi;
    
    	cache->mfc_un.res.minvif = MAXVIFS;
    	cache->mfc_un.res.maxvif = 0;
    	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
    
    
    	for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) {
    		if (VIF_EXISTS(&init_net, vifi) &&
    		    ttls[vifi] && ttls[vifi] < 255) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
    			if (cache->mfc_un.res.minvif > vifi)
    				cache->mfc_un.res.minvif = vifi;
    			if (cache->mfc_un.res.maxvif <= vifi)
    				cache->mfc_un.res.maxvif = vifi + 1;
    		}
    	}
    }
    
    static int vif_add(struct vifctl *vifc, int mrtsock)
    {
    	int vifi = vifc->vifc_vifi;
    
    	struct vif_device *v = &init_net.ipv4.vif_table[vifi];
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct net_device *dev;
    	struct in_device *in_dev;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* Is vif busy ? */
    
    	if (VIF_EXISTS(&init_net, vifi))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -EADDRINUSE;
    
    	switch (vifc->vifc_flags) {
    #ifdef CONFIG_IP_PIMSM
    	case VIFF_REGISTER:
    		/*
    		 * Special Purpose VIF in PIM
    		 * All the packets will be sent to the daemon
    		 */
    		if (reg_vif_num >= 0)
    			return -EADDRINUSE;
    		dev = ipmr_reg_vif();
    		if (!dev)
    			return -ENOBUFS;
    
    		err = dev_set_allmulti(dev, 1);
    		if (err) {
    			unregister_netdevice(dev);
    
    			dev_put(dev);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		break;
    #endif
    
    	case VIFF_TUNNEL:
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		dev = ipmr_new_tunnel(vifc);
    		if (!dev)
    			return -ENOBUFS;
    
    		err = dev_set_allmulti(dev, 1);
    		if (err) {
    			ipmr_del_tunnel(dev, vifc);
    
    			dev_put(dev);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		break;
    	case 0:
    
    		dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!dev)
    			return -EADDRNOTAVAIL;
    
    		err = dev_set_allmulti(dev, 1);
    
    		if (err) {
    			dev_put(dev);
    
    			return err;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		break;
    	default:
    		return -EINVAL;
    	}
    
    
    	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -EADDRNOTAVAIL;
    
    	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	ip_rt_multicast_event(in_dev);
    
    	/*
    	 *	Fill in the VIF structures
    	 */
    
    	v->rate_limit = vifc->vifc_rate_limit;
    	v->local = vifc->vifc_lcl_addr.s_addr;
    	v->remote = vifc->vifc_rmt_addr.s_addr;
    	v->flags = vifc->vifc_flags;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!mrtsock)
    		v->flags |= VIFF_STATIC;
    
    	v->threshold = vifc->vifc_threshold;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	v->bytes_in = 0;
    	v->bytes_out = 0;
    	v->pkt_in = 0;
    	v->pkt_out = 0;
    	v->link = dev->ifindex;
    	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
    		v->link = dev->iflink;
    
    	/* And finish update writing critical data */
    	write_lock_bh(&mrt_lock);
    
    	v->dev = dev;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #ifdef CONFIG_IP_PIMSM
    	if (v->flags&VIFF_REGISTER)
    		reg_vif_num = vifi;
    #endif
    
    	if (vifi+1 > init_net.ipv4.maxvif)
    		init_net.ipv4.maxvif = vifi+1;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	write_unlock_bh(&mrt_lock);
    	return 0;
    }
    
    
    Al Viro's avatar
    Al Viro committed
    /*
     *	Look up the resolved cache entry for an (origin, group) pair in
     *	init_net's hash table. Returns the entry, or NULL if none matches.
     */
    static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
    {
    	struct mfc_cache *entry;

    	entry = init_net.ipv4.mfc_cache_array[MFC_HASH(mcastgrp, origin)];
    	while (entry != NULL) {
    		if (entry->mfc_origin == origin &&
    		    entry->mfc_mcastgrp == mcastgrp)
    			return entry;
    		entry = entry->next;
    	}
    	return NULL;
    }
    
    /*
     *	Allocate a multicast cache entry
     */
    
    static struct mfc_cache *ipmr_cache_alloc(struct net *net)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
    	if (c == NULL)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return NULL;
    	c->mfc_un.res.minvif = MAXVIFS;
    
    	mfc_net_set(c, net);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return c;
    }
    
    
    static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
    	if (c == NULL)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return NULL;
    	skb_queue_head_init(&c->mfc_un.unres.unresolved);
    	c->mfc_un.unres.expires = jiffies + 10*HZ;
    
    	mfc_net_set(c, net);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return c;
    }
    
    /*
     *	A cache entry has gone into a resolved state from queued
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
    {
    	struct sk_buff *skb;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/*
    	 *	Play the pending entries through our router
    	 */
    
    
    	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
    
    		if (ip_hdr(skb)->version == 0) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
    
    			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
    
    				nlh->nlmsg_len = (skb_tail_pointer(skb) -
    						  (u8 *)nlh);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			} else {
    				nlh->nlmsg_type = NLMSG_ERROR;
    				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
    				skb_trim(skb, nlh->nlmsg_len);
    
    				e = NLMSG_DATA(nlh);
    				e->error = -EMSGSIZE;
    				memset(&e->msg, 0, sizeof(e->msg));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			}
    
    			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		} else
    			ip_mr_forward(skb, c, 0);
    	}
    }
    
    /*
     *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
     *	expects the following bizarre scheme.
     *
     *	Called under mrt_lock.
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     *	Build an upcall message for the multicast routing daemon and queue it
     *	on init_net's mroute socket.
     *
     *	@pkt:    the packet that triggered the report; it is not consumed.
     *	@vifi:   vif index stored in the message (the IGMPMSG_WHOLEPKT path
     *		 stores reg_vif_num instead).
     *	@assert: message type (e.g. IGMPMSG_NOCACHE, IGMPMSG_WHOLEPKT).
     *
     *	For IGMPMSG_WHOLEPKT (CONFIG_IP_PIMSM only) the whole packet is
     *	copied with room for an extra IP header, which is filled with a copy
     *	of the original header rewritten as an igmpmsg. For all other types
     *	a fresh 128-byte skb receives a copy of the IP header followed by an
     *	IGMP header carrying the message type.
     *
     *	Returns -ENOBUFS if no skb could be allocated, -EINVAL if there is no
     *	mroute socket, otherwise the result of sock_queue_rcv_skb(). The new
     *	skb is freed on every failure path.
     */
    static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
    {
    	struct sk_buff *skb;
    	const int ihl = ip_hdrlen(pkt);
    	struct igmphdr *igmp;
    	struct igmpmsg *msg;
    	int ret;

    #ifdef CONFIG_IP_PIMSM
    	if (assert == IGMPMSG_WHOLEPKT)
    		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
    	else
    #endif
    		skb = alloc_skb(128, GFP_ATOMIC);

    	if (!skb)
    		return -ENOBUFS;

    #ifdef CONFIG_IP_PIMSM
    	if (assert == IGMPMSG_WHOLEPKT) {
    		/* Ugly, but we have no choice with this interface.
    		   Duplicate old header, fix ihl, length etc.
    		   And all this only to mangle msg->im_msgtype and
    		   to set msg->im_mbz to "mbz" :-)
    		 */
    		skb_push(skb, sizeof(struct iphdr));
    		skb_reset_network_header(skb);
    		skb_reset_transport_header(skb);
    		msg = (struct igmpmsg *)skb_network_header(skb);
    		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
    		msg->im_msgtype = IGMPMSG_WHOLEPKT;
    		msg->im_mbz = 0;
    		msg->im_vif = reg_vif_num;
    		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
    		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
    					     sizeof(struct iphdr));
    	} else
    #endif
    	{

    	/*
    	 *	Copy the IP header
    	 */

    	skb->network_header = skb->tail;
    	/* Reserve the header bytes so they count towards skb->len. */
    	skb_put(skb, ihl);
    	skb_copy_to_linear_data(skb, pkt->data, ihl);
    	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
    	msg = (struct igmpmsg *)skb_network_header(skb);
    	msg->im_vif = vifi;
    	skb->dst = dst_clone(pkt->dst);

    	/*
    	 *	Add our header
    	 */

    	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
    	igmp->type	=
    	msg->im_msgtype = assert;
    	igmp->code 	=	0;
    	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
    	skb->transport_header = skb->network_header;
    	}

    	if (init_net.ipv4.mroute_sk == NULL) {
    		kfree_skb(skb);
    		return -EINVAL;
    	}

    	/*
    	 *	Deliver to mrouted
    	 */
    	ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
    	if (ret < 0) {
    		if (net_ratelimit())
    			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
    		kfree_skb(skb);
    	}

    	return ret;
    }
    
    /*
     *	Queue a packet for resolution. It gets locked cache entry!
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     *	Park a packet on the unresolved queue until the daemon installs a
     *	route for its (source, group) pair.
     *
     *	@vifi: vif the packet arrived on, passed on in the upcall.
     *	@skb:  the packet; ownership is taken on every path (it is either
     *	       queued or freed).
     *
     *	If no unresolved entry exists for the flow, one is created (at most
     *	10 may be outstanding), the daemon is notified with IGMPMSG_NOCACHE
     *	and the expiry timer is armed. An entry holds a limited number of
     *	packets; further ones are dropped.
     *
     *	Returns 0 if the packet was queued, -ENOBUFS if it was dropped for
     *	lack of room or memory, or the error from ipmr_cache_report().
     */
    static int
    ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
    {
    	const struct iphdr *iph = ip_hdr(skb);
    	struct mfc_cache *c;
    	int err;

    	spin_lock_bh(&mfc_unres_lock);

    	/* Is this flow already waiting for resolution? */
    	c = mfc_unres_queue;
    	while (c != NULL) {
    		if (net_eq(mfc_net(c), &init_net) &&
    		    c->mfc_mcastgrp == iph->daddr &&
    		    c->mfc_origin == iph->saddr)
    			break;
    		c = c->next;
    	}

    	if (c == NULL) {
    		/*
    		 *	Create a new entry if allowable
    		 */
    		if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) < 10)
    			c = ipmr_cache_alloc_unres(&init_net);
    		if (c == NULL) {
    			spin_unlock_bh(&mfc_unres_lock);
    			kfree_skb(skb);
    			return -ENOBUFS;
    		}

    		/*
    		 *	Fill in the new cache entry
    		 */
    		c->mfc_parent	= -1;
    		c->mfc_origin	= iph->saddr;
    		c->mfc_mcastgrp	= iph->daddr;

    		/*
    		 *	Reflect first query at mrouted.
    		 */
    		err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE);
    		if (err < 0) {
    			/* If the report failed throw the cache entry
    			   out - Brad Parker
    			 */
    			spin_unlock_bh(&mfc_unres_lock);
    			ipmr_cache_free(c);
    			kfree_skb(skb);
    			return err;
    		}

    		atomic_inc(&init_net.ipv4.cache_resolve_queue_len);
    		c->next = mfc_unres_queue;
    		mfc_unres_queue = c;

    		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
    	}

    	/*
    	 *	See if we can append the packet
    	 */
    	if (c->mfc_un.unres.unresolved.qlen <= 3) {
    		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
    		err = 0;
    	} else {
    		kfree_skb(skb);
    		err = -ENOBUFS;
    	}

    	spin_unlock_bh(&mfc_unres_lock);
    	return err;
    }
    
    /*
     *	MFC cache manipulation by user space mroute daemon
     */
    
    static int ipmr_mfc_delete(struct mfcctl *mfc)
    {
    	int line;
    	struct mfc_cache *c, **cp;
    
    
    	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	for (cp = &init_net.ipv4.mfc_cache_array[line];
    	     (c = *cp) != NULL; cp = &c->next) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
    		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
    			write_lock_bh(&mrt_lock);
    			*cp = c->next;
    			write_unlock_bh(&mrt_lock);
    
    
    			ipmr_cache_free(c);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return 0;
    		}
    	}
    	return -ENOENT;
    }
    
    static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
    {
    	int line;
    	struct mfc_cache *uc, *c, **cp;
    
    
    	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	for (cp = &init_net.ipv4.mfc_cache_array[line];
    	     (c = *cp) != NULL; cp = &c->next) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
    		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
    			break;
    	}
    
    	if (c != NULL) {
    		write_lock_bh(&mrt_lock);
    		c->mfc_parent = mfc->mfcc_parent;
    
    		ipmr_update_thresholds(c, mfc->mfcc_ttls);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!mrtsock)
    			c->mfc_flags |= MFC_STATIC;
    		write_unlock_bh(&mrt_lock);
    		return 0;
    	}
    
    
    	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -EINVAL;
    
    
    	c = ipmr_cache_alloc(&init_net);
    
    	if (c == NULL)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return -ENOMEM;
    
    
    	c->mfc_origin = mfc->mfcc_origin.s_addr;
    	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
    	c->mfc_parent = mfc->mfcc_parent;
    
    	ipmr_update_thresholds(c, mfc->mfcc_ttls);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!mrtsock)
    		c->mfc_flags |= MFC_STATIC;
    
    	write_lock_bh(&mrt_lock);
    
    	c->next = init_net.ipv4.mfc_cache_array[line];
    	init_net.ipv4.mfc_cache_array[line] = c;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	write_unlock_bh(&mrt_lock);
    
    	/*
    	 *	Check to see if we resolved a queued list. If so we
    	 *	need to send on the frames and tidy up.
    	 */
    	spin_lock_bh(&mfc_unres_lock);
    	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
    	     cp = &uc->next) {
    
    		if (net_eq(mfc_net(uc), &init_net) &&
    		    uc->mfc_origin == c->mfc_origin &&
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
    			*cp = uc->next;
    
    			atomic_dec(&init_net.ipv4.cache_resolve_queue_len);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			break;
    		}
    	}
    
    	if (mfc_unres_queue == NULL)
    		del_timer(&ipmr_expire_timer);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	spin_unlock_bh(&mfc_unres_lock);
    
    	if (uc) {
    		ipmr_cache_resolve(uc, c);
    
    		ipmr_cache_free(uc);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    	return 0;
    }
    
    /*
     *	Close the multicast socket, and clear the vif tables etc
     */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static void mroute_clean_tables(struct sock *sk)
    {
    	int i;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    	 *	Shut down all active vif entries
    	 */
    
    	for (i = 0; i < init_net.ipv4.maxvif; i++) {
    		if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC))
    
    			vif_delete(i, 0);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	/*
    	 *	Wipe the cache
    	 */
    
    	for (i=0; i<MFC_LINES; i++) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		struct mfc_cache *c, **cp;
    
    
    		cp = &init_net.ipv4.mfc_cache_array[i];
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		while ((c = *cp) != NULL) {
    			if (c->mfc_flags&MFC_STATIC) {
    				cp = &c->next;
    				continue;
    			}
    			write_lock_bh(&mrt_lock);
    			*cp = c->next;
    			write_unlock_bh(&mrt_lock);
    
    
    			ipmr_cache_free(c);
    
    	if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) != 0) {
    		struct mfc_cache *c, **cp;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		spin_lock_bh(&mfc_unres_lock);
    
    		cp = &mfc_unres_queue;
    		while ((c = *cp) != NULL) {
    			if (!net_eq(mfc_net(c), &init_net)) {
    				cp = &c->next;
    				continue;
    			}
    			*cp = c->next;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    			ipmr_destroy_unres(c);
    		}
    		spin_unlock_bh(&mfc_unres_lock);
    	}
    }
    
    static void mrtsock_destruct(struct sock *sk)
    {
    	rtnl_lock();
    
    	if (sk == init_net.ipv4.mroute_sk) {
    
    		IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		write_lock_bh(&mrt_lock);
    
    		init_net.ipv4.mroute_sk = NULL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		write_unlock_bh(&mrt_lock);
    
    		mroute_clean_tables(sk);
    	}
    	rtnl_unlock();
    }
    
    /*
     *	Socket options and virtual interface manipulation. The whole
     *	virtual interface system is a complete heap, but unfortunately
     *	that's how BSD mrouted happens to think. Maybe one day with a proper
     *	MOSPF/PIM router set up we can clean this up.
     */
    
    int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	int ret;
    	struct vifctl vif;
    	struct mfcctl mfc;
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	if (optname != MRT_INIT) {
    
    		if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return -EACCES;
    	}
    
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	switch (optname) {
    	case MRT_INIT:
    		if (sk->sk_type != SOCK_RAW ||
    		    inet_sk(sk)->num != IPPROTO_IGMP)
    			return -EOPNOTSUPP;
    
    		if (optlen != sizeof(int))
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -ENOPROTOOPT;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		rtnl_lock();
    
    		if (init_net.ipv4.mroute_sk) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			rtnl_unlock();
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -EADDRINUSE;
    		}
    
    		ret = ip_ra_control(sk, 1, mrtsock_destruct);
    		if (ret == 0) {
    			write_lock_bh(&mrt_lock);
    
    			init_net.ipv4.mroute_sk = sk;
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			write_unlock_bh(&mrt_lock);
    
    
    			IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		}
    		rtnl_unlock();
    		return ret;
    	case MRT_DONE:
    
    		if (sk != init_net.ipv4.mroute_sk)
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -EACCES;
    		return ip_ra_control(sk, 0, NULL);
    	case MRT_ADD_VIF:
    	case MRT_DEL_VIF:
    
    		if (optlen != sizeof(vif))
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -EINVAL;
    
    		if (copy_from_user(&vif, optval, sizeof(vif)))
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -EFAULT;
    		if (vif.vifc_vifi >= MAXVIFS)
    			return -ENFILE;
    		rtnl_lock();
    
    		if (optname == MRT_ADD_VIF) {
    
    			ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		} else {
    
    			ret = vif_delete(vif.vifc_vifi, 0);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		}
    		rtnl_unlock();
    		return ret;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		/*
    		 *	Manipulate the forwarding caches. These live
    		 *	in a sort of kernel/user symbiosis.
    		 */
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	case MRT_ADD_MFC:
    	case MRT_DEL_MFC:
    
    		if (optlen != sizeof(mfc))
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -EINVAL;
    
    		if (copy_from_user(&mfc, optval, sizeof(mfc)))
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			return -EFAULT;
    		rtnl_lock();
    
    		if (optname == MRT_DEL_MFC)
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    			ret = ipmr_mfc_delete(&mfc);
    		else
    
    			ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    		rtnl_unlock();
    		return ret;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/*
    		 *	Control PIM assert.
    		 */
    
    Stephen Hemminger's avatar
    Stephen Hemminger committed
    	case MRT_ASSERT:
    	{
    		int v;