Newer
Older
result = qdisc_enqueue_root(skb, q);
spin_unlock(qdisc_lock(q));
}
return result;
}
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
if (!rxq || rxq->qdisc == &noop_qdisc)
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
case TC_ACT_SHOT:
case TC_ACT_STOLEN:
kfree_skb(skb);
return NULL;
}
out:
skb->tc_verd = 0;
return skb;
/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. Returns 0 on success, or -EBUSY
 *	when the device already has a receive handler installed.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	int err = -EBUSY;

	ASSERT_RTNL();

	if (!dev->rx_handler) {
		/*
		 * Publish the data pointer first and the handler second;
		 * presumably so that a reader which observes the handler
		 * can also observe the data that goes with it.
		 */
		rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
		rcu_assign_pointer(dev->rx_handler, rx_handler);
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device. The handler pointer is
 *	cleared first; the data pointer is cleared only after
 *	synchronize_net() has returned, so this function may sleep.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();

	rcu_assign_pointer(dev->rx_handler, NULL);

	/*
	 * A reader that fetched a non-NULL rx_handler inside its
	 * rcu_read_lock() section may still be about to use
	 * rx_handler_data. Wait for such readers to finish before the
	 * data pointer is cleared, so they never pair a live handler
	 * with NULL data.
	 */
	synchronize_net();

	rcu_assign_pointer(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
static int __netif_receive_skb(struct sk_buff *skb)
rx_handler_func_t *rx_handler;
struct net_device *null_or_dev;
bool deliver_exact = false;
if (!netdev_tstamp_prequeue)
net_timestamp_check(skb);
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
if (!skb->skb_iif)
skb->skb_iif = skb->dev->ifindex;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);

Arnaldo Carvalho de Melo
committed
skb->mac_len = skb->network_header - skb->mac_header;
another_round:
__this_cpu_inc(softnet_data.processed);
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
skb = vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
if (vlan_tx_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb)) {
ret = __netif_receive_skb(skb);
goto out;
} else if (unlikely(!skb))
goto out;
}
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
if (netdev_tstamp_prequeue)
net_timestamp_check(skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
{
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu, ret;
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
} else {
rcu_read_unlock();
ret = __netif_receive_skb(skb);
}
#else
return __netif_receive_skb(skb);
#endif
/* Network device is going away, flush any packets still pending
* Called with irqs disabled.
*/
static void flush_backlog(void *arg)
struct net_device *dev = arg;
struct sk_buff *skb, *tmp;
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
input_queue_head_incr(sd);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev == dev) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
static int napi_gro_complete(struct sk_buff *skb)
{
struct packet_type *ptype;
__be16 type = skb->protocol;
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
int err = -ENOENT;
if (NAPI_GRO_CB(skb)->count == 1) {
skb_shinfo(skb)->gso_size = 0;
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_complete)
continue;
err = ptype->gro_complete(skb);
break;
}
rcu_read_unlock();
if (err) {
WARN_ON(&ptype->list == head);
kfree_skb(skb);
return NET_RX_SUCCESS;
}
out:
return netif_receive_skb(skb);
}
/*
 * napi_gro_flush - complete every packet currently held on the GRO list.
 *
 * Each held skb is unlinked from the chain (its ->next is cleared) and
 * handed to napi_gro_complete(); afterwards the list is left empty and
 * the held-packet counter is reset to zero.
 */
inline void napi_gro_flush(struct napi_struct *napi)
{
	struct sk_buff *pending = napi->gro_list;

	while (pending) {
		struct sk_buff *held = pending;

		/* Remember the successor before the link is cleared. */
		pending = held->next;
		held->next = NULL;
		napi_gro_complete(held);
	}

	napi->gro_count = 0;
	napi->gro_list = NULL;
}
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
struct packet_type *ptype;
__be16 type = skb->protocol;
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
if (skb_is_gso(skb) || skb_has_frag_list(skb))
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
continue;
skb_set_network_header(skb, skb_gro_offset(skb));
mac_len = skb->network_header - skb->mac_header;
skb->mac_len = mac_len;
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
pp = ptype->gro_receive(&napi->gro_list, skb);
break;
}
rcu_read_unlock();
if (&ptype->list == head)
goto normal;
same_flow = NAPI_GRO_CB(skb)->same_flow;
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
struct sk_buff *nskb = *pp;
*pp = nskb->next;
nskb->next = NULL;
napi_gro_complete(nskb);
napi->gro_count--;
if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
napi->gro_count++;
NAPI_GRO_CB(skb)->count = 1;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
if (skb_headlen(skb) < skb_gro_offset(skb)) {
int grow = skb_gro_offset(skb) - skb_headlen(skb);
BUG_ON(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
skb->tail += grow;
skb->data_len -= grow;
skb_shinfo(skb)->frags[0].page_offset += grow;
skb_shinfo(skb)->frags[0].size -= grow;
if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
put_page(skb_shinfo(skb)->frags[0].page);
memmove(skb_shinfo(skb)->frags,
skb_shinfo(skb)->frags + 1,
--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
ret = GRO_NORMAL;
goto pull;
EXPORT_SYMBOL(dev_gro_receive);
__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff *p;
for (p = napi->gro_list; p; p = p->next) {
unsigned long diffs;
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
diffs |= compare_ether_header(skb_mac_header(p),
NAPI_GRO_CB(p)->same_flow = !diffs;
NAPI_GRO_CB(p)->flush = 0;
}
return dev_gro_receive(napi, skb);
}
gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
switch (ret) {
case GRO_NORMAL:
if (netif_receive_skb(skb))
ret = GRO_DROP;
break;
case GRO_DROP:
case GRO_MERGED_FREE:
case GRO_HELD:
case GRO_MERGED:
break;
return ret;
}
EXPORT_SYMBOL(napi_skb_finish);
/*
 * skb_gro_reset_offset - reset the GRO control block of @skb.
 *
 * Clears data_offset, frag0 and frag0_len. When the MAC header sits
 * exactly at the tail of the linear area and the first page fragment is
 * not in high memory, frag0/frag0_len are pointed at that fragment.
 */
void skb_gro_reset_offset(struct sk_buff *skb)
{
	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	/* MAC header is not at the tail: leave frag0 unset. */
	if (skb->mac_header != skb->tail)
		return;

	/* A highmem page is not usable through page_address() here. */
	if (PageHighMem(skb_shinfo(skb)->frags[0].page))
		return;

	NAPI_GRO_CB(skb)->frag0 =
		page_address(skb_shinfo(skb)->frags[0].page) +
		skb_shinfo(skb)->frags[0].page_offset;
	NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
}
EXPORT_SYMBOL(skb_gro_reset_offset);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
skb_gro_reset_offset(skb);
return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
__skb_pull(skb, skb_headlen(skb));
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
if (!skb) {
skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
if (skb)
napi->skb = skb;
EXPORT_SYMBOL(napi_get_frags);
gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
gro_result_t ret)
switch (ret) {
case GRO_NORMAL:
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_HELD)
skb_gro_pull(skb, -ETH_HLEN);
else if (netif_receive_skb(skb))
ret = GRO_DROP;
case GRO_DROP:
case GRO_MERGED_FREE:
napi_reuse_skb(napi, skb);
break;
case GRO_MERGED:
break;
return ret;
EXPORT_SYMBOL(napi_frags_finish);
/*
 * napi_frags_skb - detach the skb cached in @napi and parse its
 * Ethernet header for GRO.
 *
 * Takes napi->skb (leaving NULL behind), resets its MAC header and GRO
 * offsets, locates the Ethernet header through the GRO header helpers,
 * pulls it, and records h_proto in skb->protocol.
 *
 * Returns the skb, or NULL when the header could not be obtained; in
 * that case the skb has been handed to napi_reuse_skb().
 */
struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	struct ethhdr *eth;
	unsigned int eth_off;
	unsigned int eth_end;

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth_off = skb_gro_offset(skb);
	eth_end = eth_off + sizeof(*eth);

	/* Try the fast accessor first, fall back to the slow one. */
	eth = skb_gro_header_fast(skb, eth_off);
	if (skb_gro_header_hard(skb, eth_end)) {
		eth = skb_gro_header_slow(skb, eth_end, eth_off);
		if (unlikely(!eth)) {
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	}

	skb_gro_pull(skb, sizeof(*eth));

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.  We'll fix it up properly at the end.
	 */
	skb->protocol = eth->h_proto;

	return skb;
}
EXPORT_SYMBOL(napi_frags_skb);
gro_result_t napi_gro_frags(struct napi_struct *napi)
struct sk_buff *skb = napi_frags_skb(napi);
return GRO_DROP;
return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
}
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
/*
 * net_rps_action_and_irq_enable - send any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Detach the whole list before interrupts are re-enabled. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		do {
			/* Read the link before the entry is handed off. */
			struct softnet_data *following = pending->rps_ipi_next;

			if (cpu_online(pending->cpu))
				__smp_call_function_single(pending->cpu,
							   &pending->csd, 0);
			pending = following;
		} while (pending);

		return;
	}
#endif
	local_irq_enable();
}
static int process_backlog(struct napi_struct *napi, int quota)
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
#ifdef CONFIG_RPS
/* Check if we have pending IPIs; it's better to send them now
* rather than waiting for net_rx_action() to end.
*/
if (sd->rps_ipi_list) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
#endif
napi->weight = weight_p;
local_irq_disable();
while (work < quota) {
unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
__netif_receive_skb(skb);
local_irq_disable();
input_queue_head_incr(sd);
if (++work >= quota) {
local_irq_enable();
return work;
}
qlen = skb_queue_len(&sd->input_pkt_queue);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
if (qlen < quota - work) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set on backlog.
* we can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&napi->poll_list);
napi->state = 0;
quota = work + qlen;
}
}
local_irq_enable();
return work;
}
/**
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(&__get_cpu_var(softnet_data), n);
local_irq_restore(flags);
EXPORT_SYMBOL(__napi_schedule);
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
/*
* __napi_complete - remove a NAPI instance from its poll list and clear
* NAPI_STATE_SCHED.
*
* This function does not change the interrupt state itself; the
* napi_complete() wrapper in this file calls it between
* local_irq_save() and local_irq_restore().
*/
void __napi_complete(struct napi_struct *n)
{
/* Completing an instance that is not scheduled is a bug. */
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
/* So is completing while packets are still held on the GRO list. */
BUG_ON(n->gro_list);
list_del(&n->poll_list);
/* Explicit barrier so the list removal above is ordered before the bit clear. */
smp_mb__before_clear_bit();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
/*
 * napi_complete - flush GRO and mark a NAPI instance as done polling.
 *
 * Does nothing when NAPI_STATE_NPSVC is set on the instance. Otherwise
 * the held GRO packets are flushed and __napi_complete() is run with
 * local interrupts saved and disabled.
 */
void napi_complete(struct napi_struct *n)
{
	unsigned long irqflags;

	/*
	 * NAPI_STATE_NPSVC is set: leave the instance on the cpu poll
	 * list, since it may be running on a different cpu.
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	/* __napi_complete() treats a non-empty gro_list as a bug. */
	napi_gro_flush(n);

	local_irq_save(irqflags);
	__napi_complete(n);
	local_irq_restore(irqflags);
}
EXPORT_SYMBOL(napi_complete);
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
napi->gro_count = 0;
napi->poll = poll;
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
}
EXPORT_SYMBOL(netif_napi_add);
/*
 * netif_napi_del - tear down a NAPI instance.
 *
 * Unlinks @napi from its device's NAPI list, then releases every skb
 * the instance still owns: the skb cached in napi->skb for the frags
 * interface and all packets held on the GRO list. The GRO list and
 * counter are left empty.
 */
void netif_napi_del(struct napi_struct *napi)
{
	struct sk_buff *skb, *next;

	list_del_init(&napi->dev_list);

	/*
	 * napi_get_frags() stores an allocated skb in napi->skb; free it
	 * here so it is not leaked with the instance. kfree_skb() accepts
	 * NULL, so no guard is needed. Clear the pointer so a later free
	 * attempt cannot hit the same skb twice.
	 */
	kfree_skb(napi->skb);
	napi->skb = NULL;

	/* Drop every packet still held for GRO. */
	for (skb = napi->gro_list; skb; skb = next) {
		next = skb->next;
		skb->next = NULL;
		kfree_skb(skb);
	}

	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
while (!list_empty(&sd->poll_list)) {
struct napi_struct *n;
int work, weight;
/* If the softirq window is exhausted then punt.
* Allow this to run for 2 jiffies, which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n);
}
WARN_ON_ONCE(work > weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else
list_move_tail(&n->poll_list, &sd->poll_list);
netpoll_poll_unlock(have);
net_rps_action_and_irq_enable(sd);
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
dma_issue_pending_all();
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
static gifconf_func_t *gifconf_list[NPROTO];
/**
 *	register_gifconf - register a SIOCGIF handler
 *	@family: Address family
 *	@gifconf: Function handler
 *
 *	Register protocol dependent address dumping routines. The handler
 *	that is passed must not be freed or reused until it has been replaced
 *	by another handler.
 *
 *	Returns 0 on success, or -EINVAL when @family is not below NPROTO.
 */
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
{
	if (family < NPROTO) {
		/* One slot per address family; a new handler replaces the old. */
		gifconf_list[family] = gifconf;
		return 0;
	}

	return -EINVAL;
}
/*
* Map an interface index to its name (SIOCGIFNAME)
*/
/*
* We need this ioctl for efficient implementation of the
* if_indextoname() function required by the IPv6 API. Without
* it, we would have to search all the interfaces to find a
* match. --pb
*/
static int dev_ifname(struct net *net, struct ifreq __user *arg)
{
struct net_device *dev;
struct ifreq ifr;
/*
* Fetch the caller's info block.
*/
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
return -EFAULT;
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
return -ENODEV;
}
strcpy(ifr.ifr_name, dev->name);
if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
return -EFAULT;
return 0;
}
/*
* Perform a SIOCGIFCONF call. This structure will change
* size eventually, and there is nothing I can do about it.
* Thus we will need a 'compatibility mode'.
*/
static int dev_ifconf(struct net *net, char __user *arg)
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
{
struct ifconf ifc;
struct net_device *dev;
char __user *pos;
int len;
int total;
int i;
/*
* Fetch the caller's info block.
*/
if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
return -EFAULT;
pos = ifc.ifc_buf;
len = ifc.ifc_len;
/*
* Loop over the interfaces, and write an info block for each.
*/
total = 0;
for_each_netdev(net, dev) {
for (i = 0; i < NPROTO; i++) {
if (gifconf_list[i]) {
int done;
if (!pos)
done = gifconf_list[i](dev, NULL, 0);
else
done = gifconf_list[i](dev, pos + total,
len - total);
if (done < 0)
return -EFAULT;
total += done;
}
}
/*
* All done. Write the updated control block back to the caller.
*/
ifc.ifc_len = total;
/*
* Both BSD and Solaris return 0 here, so we do too.
*/
return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
}
#ifdef CONFIG_PROC_FS
/*
* This is invoked by the /proc filesystem handler to display a device
* in detail.
*/
void *dev_seq_start(struct seq_file *seq, loff_t *pos)
struct net *net = seq_file_net(seq);
if (!*pos)
return SEQ_START_TOKEN;
for_each_netdev_rcu(net, dev)
if (off++ == *pos)
return dev;
}
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct net_device *dev = v;
if (v == SEQ_START_TOKEN)
dev = first_net_device_rcu(seq_file_net(seq));
else
dev = next_net_device_rcu(dev);
}
void dev_seq_stop(struct seq_file *seq, void *v)
}
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
{