Skip to content
Snippets Groups Projects
ip_gre.c 45.7 KiB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
/*
 *	Linux NET3:	GRE over IP protocol decoder.
Linus Torvalds's avatar
Linus Torvalds committed
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
Linus Torvalds's avatar
Linus Torvalds committed

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
Herbert Xu's avatar
Herbert Xu committed
#include <net/rtnetlink.h>
Linus Torvalds's avatar
Linus Torvalds committed

#if IS_ENABLED(CONFIG_IPV6)
Linus Torvalds's avatar
Linus Torvalds committed
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation,
   it is infeasible task. The most general solutions would be
   to keep skb->encapsulation counter (sort of local ttl),
Eric Dumazet's avatar
Eric Dumazet committed
   and silently drop packet when it expires. It is a good
   solution, but it supposes maintaining new variable in ALL
Linus Torvalds's avatar
Linus Torvalds committed
   skb, even if no tunneling is used.

Eric Dumazet's avatar
Eric Dumazet committed
   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
Linus Torvalds's avatar
Linus Torvalds committed

   2. Networking dead loops would not kill routers, but would really
   kill network. IP hop limit plays role of "t->recursion" in this case,
   if we copy it from packet being encapsulated to upper header.
   It is very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies to rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in neighbourhood of mine)
     return only 8 bytes of payload. It is the end.

   Hence, if we want that OSPF worked or traceroute said something reasonable,
   we should search for another solution.

   One of them is to parse packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
Linus Torvalds's avatar
Linus Torvalds committed

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force DF flag on tunnels with preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches, that exceed pmtu are pruned) and tunnel mtu
   rapidly degrades to value <68, where looping stops.
Linus Torvalds's avatar
Linus Torvalds committed
   Yes, it is not good if there exists a router in the loop,
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected
   fatal route to network, even if it were you who configured
   fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident, how to make them modular.
   sit is integral part of IPv6, ipip and gre are naturally modular.
   We could extract common parts (hash table, ioctl etc)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

Herbert Xu's avatar
Herbert Xu committed
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);
Linus Torvalds's avatar
Linus Torvalds committed

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
Eric Dumazet's avatar
Eric Dumazet committed
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
	struct net_device *fb_tunnel_dev;
Linus Torvalds's avatar
Linus Torvalds committed
/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
 */

/*
 * Fold a 32-bit address or key into a bucket index in [0, HASH_SIZE).
 * The argument is parenthesised so that compound expressions are cast and
 * shifted as a whole; note that it is still evaluated twice.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
Linus Torvalds's avatar
Linus Torvalds committed

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
Linus Torvalds's avatar
Linus Torvalds committed

static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
/*
 * Decide whether the GRE key state of a packet is acceptable for a tunnel.
 * A keyed tunnel accepts only packets that carry the same key; a keyless
 * tunnel accepts only packets that carry no key at all.
 */
static bool ipgre_key_match(const struct ip_tunnel_parm *p,
			    __be16 flags, __be32 key)
{
	const bool pkt_has_key = (flags & GRE_KEY) != 0;

	/* Keyless tunnel: only keyless packets match. */
	if (!(p->i_flags & GRE_KEY))
		return !pkt_has_key;

	/* Keyed tunnel: the packet must carry a key, and it must be ours. */
	return pkt_has_key && key == p->i_key;
}

Linus Torvalds's avatar
Linus Torvalds committed
/*
 * ipgre_tunnel_lookup - find the tunnel that should receive a packet.
 * @dev:       device the packet was seen on; supplies the netns and ifindex
 * @remote:    address compared with each tunnel's configured daddr
 * @local:     address compared with each tunnel's configured saddr
 * @flags:     GRE flags of the packet (used for the key check)
 * @key:       GRE key of the packet
 * @gre_proto: GRE protocol field; ETH_P_TEB prefers ARPHRD_ETHER devices,
 *             anything else prefers ARPHRD_IPGRE
 *
 * The four hash tables are walked from the most to the least specific:
 * (remote,local), (remote,*), (*,local), (*,*).  Only devices that are up are
 * considered.  Each surviving tunnel gets a score: bit 0 is set when its
 * configured link differs from @dev's ifindex, bit 1 when its device type is
 * not the preferred one.  A score of 0 is returned immediately; otherwise the
 * lowest-scoring tunnel seen is remembered.
 *
 * Returns the exact match, else the best candidate, else the fallback tunnel
 * if its device is up, else NULL.
 *
 * NOTE(review): the tables are walked with for_each_ip_tunnel_rcu(), so the
 * caller presumably holds the RCU read lock - confirm at the call sites.
 */
static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be16 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* Scores range over 0..3, so 4 means "no candidate yet". */
	int score, cand_score = 4;

	/* Table 3: both endpoints configured. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 2: only the remote endpoint configured. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/*
	 * Table 1: only the local endpoint configured.  @local matches either
	 * the tunnel's saddr, or its daddr when @local is a multicast address.
	 */
	for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 0: fully wildcarded tunnels, matched on the key value alone. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing configured matched: fall back to the fallback device. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

Eric Dumazet's avatar
Eric Dumazet committed
/*
 * __ipgre_bucket - pick the hash bucket for a set of tunnel parameters.
 * @ign:   per-netns GRE state holding the tables
 * @parms: tunnel parameters (saddr, daddr and i_key are used)
 *
 * The table index gets bit 0 when a local address is configured and bit 1
 * when a non-multicast remote address is configured.  The slot is HASH(key),
 * additionally XORed with HASH(remote) when the remote address contributes.
 *
 * Returns the address of the bucket head pointer.
 */
static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

Eric Dumazet's avatar
Eric Dumazet committed
/* Hash bucket for an existing tunnel, derived from the tunnel's own parms. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
Linus Torvalds's avatar
Linus Torvalds committed
{
Eric Dumazet's avatar
Eric Dumazet committed
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
Linus Torvalds's avatar
Linus Torvalds committed

Eric Dumazet's avatar
Eric Dumazet committed
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
Linus Torvalds's avatar
Linus Torvalds committed
{
Eric Dumazet's avatar
Eric Dumazet committed
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
Linus Torvalds's avatar
Linus Torvalds committed
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
Eric Dumazet's avatar
Eric Dumazet committed
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

Eric Dumazet's avatar
Eric Dumazet committed
	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

Eric Dumazet's avatar
Eric Dumazet committed
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
Linus Torvalds's avatar
Linus Torvalds committed
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
Linus Torvalds's avatar
Linus Torvalds committed

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;
Linus Torvalds's avatar
Linus Torvalds committed

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
stephen hemminger's avatar
stephen hemminger committed
		strcpy(name, "gre%d");
Linus Torvalds's avatar
Linus Torvalds committed

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
stephen hemminger's avatar
stephen hemminger committed
		return NULL;
Linus Torvalds's avatar
Linus Torvalds committed

	nt = netdev_priv(dev);
Linus Torvalds's avatar
Linus Torvalds committed
	nt->parms = *parms;
Herbert Xu's avatar
Herbert Xu committed
	dev->rtnl_link_ops = &ipgre_link_ops;
Linus Torvalds's avatar
Linus Torvalds committed

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;
Linus Torvalds's avatar
Linus Torvalds committed

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

Linus Torvalds's avatar
Linus Torvalds committed
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
Linus Torvalds's avatar
Linus Torvalds committed
	return nt;

failed_free:
	free_netdev(dev);
Linus Torvalds's avatar
Linus Torvalds committed
	return NULL;
}

/*
 * Device teardown hook: remove the tunnel from the hash tables of the
 * namespace the device belongs to, then drop one device reference
 * (presumably the one taken with dev_hold() when the tunnel was linked).
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
Linus Torvalds's avatar
Linus Torvalds committed
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
Linus Torvalds's avatar
Linus Torvalds committed
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
Linus Torvalds's avatar
Linus Torvalds committed
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
Linus Torvalds's avatar
Linus Torvalds committed
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;
Linus Torvalds's avatar
Linus Torvalds committed

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

Linus Torvalds's avatar
Linus Torvalds committed
	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);


	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
Linus Torvalds's avatar
Linus Torvalds committed

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
Linus Torvalds's avatar
Linus Torvalds committed

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
Linus Torvalds's avatar
Linus Torvalds committed
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

/*
 * ipgre_ecn_encapsulate - compute the outer TOS byte for an encapsulated packet.
 * @tos:     TOS value chosen for the outer header
 * @old_iph: inner network header (read as IPv4 or IPv6 per skb->protocol)
 * @skb:     packet being encapsulated; only skb->protocol is examined
 *
 * The inner DS field is taken from the IPv4 tos or the IPv6 dsfield and is 0
 * for any other protocol.  Returns INET_ECN_encapsulate(@tos, inner).
 */
static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;

	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);

	return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
Linus Torvalds's avatar
Linus Torvalds committed
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
Linus Torvalds's avatar
Linus Torvalds committed
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;
Linus Torvalds's avatar
Linus Torvalds committed

	if (!pskb_may_pull(skb, 16))
Linus Torvalds's avatar
Linus Torvalds committed

Linus Torvalds's avatar
Linus Torvalds committed
	h = skb->data;
	flags = *(__be16 *)h;
Linus Torvalds's avatar
Linus Torvalds committed

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
Linus Torvalds's avatar
Linus Torvalds committed

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
Linus Torvalds's avatar
Linus Torvalds committed
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
Linus Torvalds's avatar
Linus Torvalds committed
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
Linus Torvalds's avatar
Linus Torvalds committed
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;
Linus Torvalds's avatar
Linus Torvalds committed
		secpath_reset(skb);

		skb->protocol = gre_proto;
Linus Torvalds's avatar
Linus Torvalds committed
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
Linus Torvalds's avatar
Linus Torvalds committed
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
Linus Torvalds's avatar
Linus Torvalds committed
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
Linus Torvalds's avatar
Linus Torvalds committed
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
Linus Torvalds's avatar
Linus Torvalds committed
				goto drop;
			tunnel->dev->stats.multicast++;
Linus Torvalds's avatar
Linus Torvalds committed
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
Linus Torvalds's avatar
Linus Torvalds committed
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
Linus Torvalds's avatar
Linus Torvalds committed
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);
		gro_cells_receive(&tunnel->gro_cells, skb);
Eric Dumazet's avatar
Eric Dumazet committed
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
Linus Torvalds's avatar
Linus Torvalds committed

drop:
	kfree_skb(skb);
	return 0;
/*
 * handle_offloads - prepare an skb's offload state before encapsulation.
 * @skb: packet about to be transmitted through the tunnel
 *
 * Non-GSO packets: ip_summed is reset to CHECKSUM_NONE unless it is
 * CHECKSUM_PARTIAL.  GSO packets: the skb is uncloned (GFP_ATOMIC) and
 * SKB_GSO_GRE is added to its gso_type.
 *
 * Returns @skb on success.  If uncloning fails the skb is freed and an
 * ERR_PTR() carrying the error is returned, so the caller must not touch it.
 */
static struct sk_buff *handle_offloads(struct sk_buff *skb)
{
	int rc;

	if (!skb_is_gso(skb)) {
		if (skb->ip_summed != CHECKSUM_PARTIAL)
			skb->ip_summed = CHECKSUM_NONE;
		return skb;
	}

	rc = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(rc)) {
		kfree_skb(skb);
		return ERR_PTR(rc);
	}

	skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
	return skb;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr  *old_iph;
	const struct iphdr  *tiph;
Linus Torvalds's avatar
Linus Torvalds committed
	u8     tos;
	__be16 df;
Linus Torvalds's avatar
Linus Torvalds committed
	struct rtable *rt;     			/* Route to the other host */
Eric Dumazet's avatar
Eric Dumazet committed
	struct net_device *tdev;		/* Device to other host */
Linus Torvalds's avatar
Linus Torvalds committed
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
Linus Torvalds's avatar
Linus Torvalds committed
	int    gre_hlen;
	__be32 dst;
Linus Torvalds's avatar
Linus Torvalds committed
	int    mtu;
Linus Torvalds's avatar
Linus Torvalds committed

	skb = handle_offloads(skb);
	if (IS_ERR(skb)) {
		dev->stats.tx_dropped++;
		return NETDEV_TX_OK;
	}

	if (!skb->encapsulation) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}
Eric Dumazet's avatar
Eric Dumazet committed

	old_iph = ip_hdr(skb);

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
Linus Torvalds's avatar
Linus Torvalds committed
		gre_hlen = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tiph = (const struct iphdr *)skb->data;
		else
			tiph = &tunnel->parms.iph;
Linus Torvalds's avatar
Linus Torvalds committed
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

Eric Dumazet's avatar
Eric Dumazet committed
		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
Linus Torvalds's avatar
Linus Torvalds committed
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
Eric Dumazet's avatar
Eric Dumazet committed
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
#if IS_ENABLED(CONFIG_IPV6)
Linus Torvalds's avatar
Linus Torvalds committed
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
Linus Torvalds's avatar
Linus Torvalds committed
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
Linus Torvalds's avatar
Linus Torvalds committed
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
Linus Torvalds's avatar
Linus Torvalds committed
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
Linus Torvalds's avatar
Linus Torvalds committed
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
Linus Torvalds's avatar
Linus Torvalds committed
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	ttl = tiph->ttl;
Linus Torvalds's avatar
Linus Torvalds committed
	tos = tiph->tos;
Linus Torvalds's avatar
Linus Torvalds committed
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
Linus Torvalds's avatar
Linus Torvalds committed
	}
	tdev = rt->dst.dev;
Linus Torvalds's avatar
Linus Torvalds committed

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
Linus Torvalds's avatar
Linus Torvalds committed
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
Linus Torvalds's avatar
Linus Torvalds committed
	else
Eric Dumazet's avatar
Eric Dumazet committed
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
Linus Torvalds's avatar
Linus Torvalds committed

Eric Dumazet's avatar
Eric Dumazet committed
	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
Linus Torvalds's avatar
Linus Torvalds committed

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if (!skb_is_gso(skb) &&
		    (old_iph->frag_off&htons(IP_DF)) &&
Linus Torvalds's avatar
Linus Torvalds committed
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
Linus Torvalds's avatar
Linus Torvalds committed
	else if (skb->protocol == htons(ETH_P_IPV6)) {
Eric Dumazet's avatar
Eric Dumazet committed
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
Linus Torvalds's avatar
Linus Torvalds committed

Eric Dumazet's avatar
Eric Dumazet committed
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
Linus Torvalds's avatar
Linus Torvalds committed
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
		if (!skb_is_gso(skb) &&
		    mtu >= IPV6_MIN_MTU &&
		    mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
Linus Torvalds's avatar
Linus Torvalds committed
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
Linus Torvalds's avatar
Linus Torvalds committed
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
Linus Torvalds's avatar
Linus Torvalds committed

	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
Linus Torvalds's avatar
Linus Torvalds committed
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
Linus Torvalds's avatar
Linus Torvalds committed
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
Linus Torvalds's avatar
Linus Torvalds committed
			dev_kfree_skb(skb);
Linus Torvalds's avatar
Linus Torvalds committed
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		/* Warning : tiph value might point to freed memory */
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	skb_set_transport_header(skb, sizeof(*iph));
Linus Torvalds's avatar
Linus Torvalds committed
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
Eric Dumazet's avatar
Eric Dumazet committed
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
Linus Torvalds's avatar
Linus Torvalds committed

	/*
	 *	Push down and install the IPIP header.
	 */

Linus Torvalds's avatar
Linus Torvalds committed
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;
	iph->ttl		=	ttl;
Linus Torvalds's avatar
Linus Torvalds committed

	if (ttl == 0) {
Linus Torvalds's avatar
Linus Torvalds committed
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
Linus Torvalds's avatar
Linus Torvalds committed
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
Linus Torvalds's avatar
Linus Torvalds committed
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;
Linus Torvalds's avatar
Linus Torvalds committed

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
Linus Torvalds's avatar
Linus Torvalds committed

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;