Newer
Older
* Copyright (c) 2007-2012 Nicira, Inc.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <linux/workqueue.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "datapath.h"
#include "flow.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
static void rehash_flow_table(struct work_struct *work);
static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
int ovs_net_id __read_mostly;
static void ovs_notify(struct sk_buff *skb, struct genl_info *info,
struct genl_multicast_group *grp)
{
genl_notify(skb, genl_info_net(info), info->snd_portid,
grp->id, info->nlhdr, GFP_KERNEL);
}
* All writes e.g. Writes to device state (add/remove datapath, port, set
* operations on vports, etc.), Writes to other state (flow table
* modifications, set miscellaneous datapath parameters, etc.) are protected
* by ovs_lock.
*
* Reads are protected by RCU.
*
* There are a few special cases (mostly stats) that have their own
* synchronization but they nest under all of above and don't interact with
* each other.
*
* The RTNL lock nests inside ovs_mutex.
static DEFINE_MUTEX(ovs_mutex);
void ovs_lock(void)
{
mutex_lock(&ovs_mutex);
}
void ovs_unlock(void)
{
mutex_unlock(&ovs_mutex);
}
#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
if (debug_locks)
return lockdep_is_held(&ovs_mutex);
else
return 1;
}
#endif
static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *,
static int queue_userspace_packet(struct net *, int dp_ifindex,
struct sk_buff *,
const struct dp_upcall_info *);
/* Must be called with rcu_read_lock or ovs_mutex. */
static struct datapath *get_dp(struct net *net, int dp_ifindex)
{
struct datapath *dp = NULL;
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_index_rcu(net, dp_ifindex);
if (dev) {
struct vport *vport = ovs_internal_dev_get_vport(dev);
if (vport)
dp = vport->dp;
}
rcu_read_unlock();
return dp;
}
/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
return vport->ops->get_name(vport);
}
static int get_dpifindex(struct datapath *dp)
{
struct vport *local;
int ifindex;
rcu_read_lock();
local = ovs_vport_rcu(dp, OVSP_LOCAL);
ifindex = netdev_vport_priv(local)->dev->ifindex;
else
ifindex = 0;
rcu_read_unlock();
return ifindex;
}
static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
u16 port_no)
{
return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
struct vport *vport;
struct hlist_head *head;
head = vport_hash_bucket(dp, port_no);
hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
if (vport->port_no == port_no)
return vport;
}
return NULL;
}
static struct vport *new_vport(const struct vport_parms *parms)
{
struct vport *vport;
vport = ovs_vport_add(parms);
if (!IS_ERR(vport)) {
struct datapath *dp = parms->dp;
struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
hlist_add_head_rcu(&vport->dp_hash_node, head);
}
return vport;
}
void ovs_dp_detach_port(struct vport *p)
{
/* First drop references to device. */
hlist_del_rcu(&p->dp_hash_node);
/* Then destroy it. */
ovs_vport_del(p);
}
/* Must be called with rcu_read_lock. */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
struct datapath *dp = p->dp;
struct sw_flow *flow;
struct dp_stats_percpu *stats;
struct sw_flow_key key;
u64 *stats_counter;
int error;
int key_len;
stats = this_cpu_ptr(dp->stats_percpu);
/* Extract flow from 'skb' into 'key'. */
error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
if (unlikely(error)) {
kfree_skb(skb);
return;
}
/* Look up flow. */
flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.key = &key;
upcall.userdata = NULL;
upcall.portid = p->upcall_portid;
ovs_dp_upcall(dp, skb, &upcall);
consume_skb(skb);
stats_counter = &stats->n_missed;
goto out;
}
OVS_CB(skb)->flow = flow;
stats_counter = &stats->n_hit;
ovs_flow_used(OVS_CB(skb)->flow, skb);
ovs_execute_actions(dp, skb);
out:
/* Update datapath statistics. */
u64_stats_update_begin(&stats->sync);
(*stats_counter)++;
u64_stats_update_end(&stats->sync);
}
static struct genl_family dp_packet_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_PACKET_FAMILY,
.version = OVS_PACKET_VERSION,
.maxattr = OVS_PACKET_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
};
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
struct dp_stats_percpu *stats;
int dp_ifindex;
int err;
if (upcall_info->portid == 0) {
err = -ENOTCONN;
goto err;
}
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex) {
err = -ENODEV;
goto err;
}
if (!skb_is_gso(skb))
err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
if (err)
goto err;
return 0;
err:
stats = this_cpu_ptr(dp->stats_percpu);
u64_stats_update_begin(&stats->sync);
stats->n_lost++;
u64_stats_update_end(&stats->sync);
return err;
}
static int queue_gso_packets(struct net *net, int dp_ifindex,
struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
unsigned short gso_type = skb_shinfo(skb)->gso_type;
struct dp_upcall_info later_info;
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
int err;
segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false);
if (IS_ERR(segs))
return PTR_ERR(segs);
/* Queue all of the segments. */
skb = segs;
do {
err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info);
if (skb == segs && gso_type & SKB_GSO_UDP) {
/* The initial flow key extracted by ovs_flow_extract()
* in this case is for a first fragment, so we need to
* properly mark later fragments.
*/
later_key = *upcall_info->key;
later_key.ip.frag = OVS_FRAG_TYPE_LATER;
later_info = *upcall_info;
later_info.key = &later_key;
upcall_info = &later_info;
}
} while ((skb = skb->next));
/* Free all of the segments. */
skb = segs;
do {
nskb = skb->next;
if (err)
kfree_skb(skb);
else
consume_skb(skb);
} while ((skb = nskb));
return err;
}
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
static size_t key_attr_size(void)
{
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */
+ nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(4) /* OVS_KEY_ATTR_8021Q */
+ nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */
+ nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */
+ nla_total_size(28); /* OVS_KEY_ATTR_ND */
}
static size_t upcall_msg_size(const struct sk_buff *skb,
const struct nlattr *userdata)
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
/* OVS_PACKET_ATTR_USERDATA */
if (userdata)
size += NLA_ALIGN(userdata->nla_len);
return size;
}
static int queue_userspace_packet(struct net *net, int dp_ifindex,
struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb; /* to be queued to userspace */
struct nlattr *nla;
int err;
if (vlan_tx_tag_present(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb));
if (!nskb)
return -ENOMEM;
nskb->vlan_tci = 0;
skb = nskb;
}
if (nla_attr_size(skb->len) > USHRT_MAX) {
err = -EFBIG;
goto out;
}
user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
}
upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
0, upcall_info->cmd);
upcall->dp_ifindex = dp_ifindex;
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
ovs_flow_to_nlattrs(upcall_info->key, user_skb);
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
skb_copy_and_csum_dev(skb, nla_data(nla));
genlmsg_end(user_skb, upcall);
err = genlmsg_unicast(net, user_skb, upcall_info->portid);
out:
kfree_skb(nskb);
return err;
}
static int flush_flows(struct datapath *dp)
{
struct flow_table *old_table;
struct flow_table *new_table;
old_table = ovsl_dereference(dp->table);
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
if (!new_table)
return -ENOMEM;
rcu_assign_pointer(dp->table, new_table);
ovs_flow_tbl_deferred_destroy(old_table);
return 0;
}
static int validate_actions(const struct nlattr *attr,
const struct sw_flow_key *key, int depth);
static int validate_sample(const struct nlattr *attr,
const struct sw_flow_key *key, int depth)
{
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
const struct nlattr *probability, *actions;
const struct nlattr *a;
int rem;
memset(attrs, 0, sizeof(attrs));
nla_for_each_nested(a, attr, rem) {
int type = nla_type(a);
if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
return -EINVAL;
attrs[type] = a;
}
if (rem)
return -EINVAL;
probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
if (!probability || nla_len(probability) != sizeof(u32))
return -EINVAL;
actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
return -EINVAL;
return validate_actions(actions, key, depth + 1);
}
static int validate_tp_port(const struct sw_flow_key *flow_key)
{
if (flow_key->eth.type == htons(ETH_P_IP)) {
if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
return 0;
} else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
return 0;
}
return -EINVAL;
}
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
/* There can be only one key in a action */
if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
return -EINVAL;
if (key_type > OVS_KEY_ATTR_MAX ||
nla_len(ovs_key) != ovs_key_lens[key_type])
return -EINVAL;
switch (key_type) {
const struct ovs_key_ipv4 *ipv4_key;
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_ETHERNET:
break;
case OVS_KEY_ATTR_IPV4:
if (flow_key->eth.type != htons(ETH_P_IP))
return -EINVAL;
return -EINVAL;
ipv4_key = nla_data(ovs_key);
if (ipv4_key->ipv4_proto != flow_key->ip.proto)
return -EINVAL;
if (ipv4_key->ipv4_frag != flow_key->ip.frag)
return -EINVAL;
break;
case OVS_KEY_ATTR_IPV6:
if (flow_key->eth.type != htons(ETH_P_IPV6))
return -EINVAL;
if (!flow_key->ip.proto)
return -EINVAL;
ipv6_key = nla_data(ovs_key);
if (ipv6_key->ipv6_proto != flow_key->ip.proto)
return -EINVAL;
if (ipv6_key->ipv6_frag != flow_key->ip.frag)
return -EINVAL;
if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
return -EINVAL;
break;
case OVS_KEY_ATTR_TCP:
if (flow_key->ip.proto != IPPROTO_TCP)
return -EINVAL;
return validate_tp_port(flow_key);
case OVS_KEY_ATTR_UDP:
if (flow_key->ip.proto != IPPROTO_UDP)
return -EINVAL;
return validate_tp_port(flow_key);
default:
return -EINVAL;
}
return 0;
}
static int validate_userspace(const struct nlattr *attr)
{
static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
};
struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
int error;
error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
attr, userspace_policy);
if (error)
return error;
if (!a[OVS_USERSPACE_ATTR_PID] ||
!nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
return -EINVAL;
return 0;
}
static int validate_actions(const struct nlattr *attr,
const struct sw_flow_key *key, int depth)
{
const struct nlattr *a;
int rem, err;
if (depth >= SAMPLE_ACTION_DEPTH)
return -EOVERFLOW;
nla_for_each_nested(a, attr, rem) {
/* Expected argument lengths, (u32)-1 for variable length. */
static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
[OVS_ACTION_ATTR_POP_VLAN] = 0,
[OVS_ACTION_ATTR_SET] = (u32)-1,
[OVS_ACTION_ATTR_SAMPLE] = (u32)-1
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
if (type > OVS_ACTION_ATTR_MAX ||
(action_lens[type] != nla_len(a) &&
action_lens[type] != (u32)-1))
return -EINVAL;
switch (type) {
case OVS_ACTION_ATTR_UNSPEC:
return -EINVAL;
case OVS_ACTION_ATTR_USERSPACE:
err = validate_userspace(a);
if (err)
return err;
break;
case OVS_ACTION_ATTR_OUTPUT:
if (nla_get_u32(a) >= DP_MAX_PORTS)
return -EINVAL;
break;
case OVS_ACTION_ATTR_POP_VLAN:
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
vlan = nla_data(a);
if (vlan->vlan_tpid != htons(ETH_P_8021Q))
return -EINVAL;
if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
return -EINVAL;
break;
case OVS_ACTION_ATTR_SET:
err = validate_set(a, key);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SAMPLE:
err = validate_sample(a, key, depth);
if (err)
return err;
break;
default:
return -EINVAL;
}
}
if (rem > 0)
return -EINVAL;
return 0;
}
static void clear_stats(struct sw_flow *flow)
{
flow->used = 0;
flow->tcp_flags = 0;
flow->packet_count = 0;
flow->byte_count = 0;
}
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
struct ovs_header *ovs_header = info->userhdr;
struct nlattr **a = info->attrs;
struct sw_flow_actions *acts;
struct sk_buff *packet;
struct sw_flow *flow;
struct datapath *dp;
struct ethhdr *eth;
int len;
int err;
int key_len;
err = -EINVAL;
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||

Thomas Graf
committed
!a[OVS_PACKET_ATTR_ACTIONS])
goto err;
len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
err = -ENOMEM;
if (!packet)
goto err;
skb_reserve(packet, NET_IP_ALIGN);
nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
skb_reset_mac_header(packet);
eth = eth_hdr(packet);
/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have. */
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
if (IS_ERR(flow))
goto err_kfree_skb;
err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
if (err)
goto err_flow_free;
err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
if (err)
goto err_flow_free;
flow->hash = ovs_flow_hash(&flow->key, key_len);
acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
err = PTR_ERR(acts);
if (IS_ERR(acts))
goto err_flow_free;
rcu_assign_pointer(flow->sf_acts, acts);
OVS_CB(packet)->flow = flow;
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
err = -ENODEV;
if (!dp)
goto err_unlock;
local_bh_disable();
err = ovs_execute_actions(dp, packet);
local_bh_enable();
rcu_read_unlock();
ovs_flow_free(flow);
return err;
err_unlock:
rcu_read_unlock();
err_flow_free:
ovs_flow_free(flow);
err_kfree_skb:
kfree_skb(packet);
err:
return err;
}
static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {

Thomas Graf
committed
[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
};
static struct genl_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = packet_policy,
.doit = ovs_packet_cmd_execute
}
};
static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
{
int i;
struct flow_table *table = ovsl_dereference(dp->table);
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
stats->n_flows = ovs_flow_tbl_count(table);
stats->n_hit = stats->n_missed = stats->n_lost = 0;
for_each_possible_cpu(i) {
const struct dp_stats_percpu *percpu_stats;
struct dp_stats_percpu local_stats;
unsigned int start;
percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
do {
start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
local_stats = *percpu_stats;
} while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
stats->n_hit += local_stats.n_hit;
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
}
}
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
};
static struct genl_family dp_flow_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
.maxattr = OVS_FLOW_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
};
static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.name = OVS_FLOW_MCGROUP
};
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
return NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
+ nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */
}
static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
struct sk_buff *skb, u32 portid,
u32 seq, u32 flags, u8 cmd)
{
const int skb_orig_len = skb->len;
const struct sw_flow_actions *sf_acts;
struct ovs_flow_stats stats;
struct ovs_header *ovs_header;
struct nlattr *nla;
unsigned long used;
u8 tcp_flags;
int err;
sf_acts = ovsl_dereference(flow->sf_acts);
ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
if (!ovs_header)
return -EMSGSIZE;
ovs_header->dp_ifindex = get_dpifindex(dp);
nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
if (!nla)
goto nla_put_failure;
err = ovs_flow_to_nlattrs(&flow->key, skb);
if (err)
goto error;
nla_nest_end(skb, nla);
spin_lock_bh(&flow->lock);
used = flow->used;
stats.n_packets = flow->packet_count;
stats.n_bytes = flow->byte_count;
tcp_flags = flow->tcp_flags;
spin_unlock_bh(&flow->lock);
if (used &&
nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
goto nla_put_failure;
if (stats.n_packets &&
nla_put(skb, OVS_FLOW_ATTR_STATS,
sizeof(struct ovs_flow_stats), &stats))
goto nla_put_failure;
if (tcp_flags &&
nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags))
goto nla_put_failure;
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
* this is the first flow to be dumped into 'skb'. This is unusual for
* Netlink but individual action lists can be longer than
* NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
* The userspace caller can always fetch the actions separately if it
* really wants them. (Most userspace callers in fact don't care.)
*
* This can only fail for dump operations because the skb is always
* properly sized for single flows.
*/
err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
sf_acts->actions);
if (err < 0 && skb_orig_len)
goto error;
return genlmsg_end(skb, ovs_header);
nla_put_failure:
err = -EMSGSIZE;
error:
genlmsg_cancel(skb, ovs_header);
return err;
}
static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
{
const struct sw_flow_actions *sf_acts;
sf_acts = ovsl_dereference(flow->sf_acts);
return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL);
}
static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
struct datapath *dp,
u32 portid, u32 seq, u8 cmd)
{
struct sk_buff *skb;
int retval;
skb = ovs_flow_cmd_alloc_info(flow);
if (!skb)
return ERR_PTR(-ENOMEM);
retval = ovs_flow_cmd_fill_info(flow, dp, skb, portid, seq, 0, cmd);
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
BUG_ON(retval < 0);
return skb;
}
static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
struct sk_buff *reply;
struct datapath *dp;
struct flow_table *table;
int error;
int key_len;
/* Extract key. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY])
goto error;
error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
if (error)
goto error;
} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
error = -EINVAL;
goto error;
}
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
table = ovsl_dereference(dp->table);
flow = ovs_flow_tbl_lookup(table, &key, key_len);
if (!flow) {
struct sw_flow_actions *acts;
/* Bail out if we're not allowed to create a new flow. */
error = -ENOENT;
if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)