Newer
Older
{
#ifdef CONFIG_RPS
struct softnet_data *mysd = &__get_cpu_var(softnet_data);
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
return 1;
}
#endif /* CONFIG_RPS */
return 0;
}
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
if (skb_queue_len(&sd->input_pkt_queue)) {
input_queue_tail_incr_save(sd, qtail);
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
preempt_disable();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu < 0)
cpu = smp_processor_id();
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
int netif_rx_ni(struct sk_buff *skb)
{
int err;
preempt_disable();
err = netif_rx(skb);
if (local_softirq_pending())
do_softirq();
preempt_enable();
return err;
}
EXPORT_SYMBOL(netif_rx_ni);
static void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
local_irq_disable();
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
__kfree_skb(skb);
}
}
if (sd->output_queue) {
struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
struct Qdisc *q = head;
spinlock_t *root_lock;
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
qdisc_run(q);
spin_unlock(root_lock);
if (!test_bit(__QDISC_STATE_DEACTIVATED,
__netif_reschedule(q);
} else {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
}
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
* when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
* a compare and 2 stores extra right now if we dont have it on
* but have CONFIG_NET_CLS_ACT
* NOTE: This doesn't stop any functionality; if you dont have
* the ingress scheduler, you just can't add policies on ingress.
static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
u32 ttl = G_TC_RTTL(skb->tc_verd);
int result = TC_ACT_OK;
struct Qdisc *q;
if (unlikely(MAX_RED_LOOP < ttl++)) {
net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
skb->skb_iif, dev->ifindex);
return TC_ACT_SHOT;
}
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
result = qdisc_enqueue_root(skb, q);
spin_unlock(qdisc_lock(q));
}
return result;
}
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
if (!rxq || rxq->qdisc == &noop_qdisc)
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
case TC_ACT_SHOT:
case TC_ACT_STOLEN:
kfree_skb(skb);
return NULL;
}
out:
skb->tc_verd = 0;
return skb;
/**
* netdev_rx_handler_register - register receive handler
* @dev: device to register a handler for
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
* Register a receive hander for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
* The caller must hold the rtnl_mutex.
*
* For a general description of rx_handler, see enum rx_handler_result.
*/
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
ASSERT_RTNL();
if (dev->rx_handler)
return -EBUSY;
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
/**
* netdev_rx_handler_unregister - unregister receive handler
* @dev: device to unregister a handler from
*
* Unregister a receive hander from a device.
*
* The caller must hold the rtnl_mutex.
*/
void netdev_rx_handler_unregister(struct net_device *dev)
{
ASSERT_RTNL();
RCU_INIT_POINTER(dev->rx_handler, NULL);
RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
/*
* Limit the use of PFMEMALLOC reserves to those protocols that implement
* the special handling of PFMEMALLOC skbs.
*/
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
switch (skb->protocol) {
case __constant_htons(ETH_P_ARP):
case __constant_htons(ETH_P_IP):
case __constant_htons(ETH_P_IPV6):
case __constant_htons(ETH_P_8021Q):
return true;
default:
return false;
}
}
static int __netif_receive_skb(struct sk_buff *skb)
rx_handler_func_t *rx_handler;
struct net_device *null_or_dev;
bool deliver_exact = false;
unsigned long pflags = current->flags;
net_timestamp_check(!netdev_tstamp_prequeue, skb);
/*
* PFMEMALLOC skbs are special, they should
* - be delivered to SOCK_MEMALLOC sockets only
* - stay away from userspace
* - have bounded memory usage
*
* Use PF_MEMALLOC as this saves us from propagating the allocation
* context down to all allocation sites.
*/
if (sk_memalloc_socks() && skb_pfmemalloc(skb))
current->flags |= PF_MEMALLOC;
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
skb = vlan_untag(skb);
if (unlikely(!skb))
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
if (sk_memalloc_socks() && skb_pfmemalloc(skb))
goto skip_taps;
if (!ptype->dev || ptype->dev == skb->dev) {
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
if (sk_memalloc_socks() && skb_pfmemalloc(skb)
&& !skb_pfmemalloc_protocol(skb))
goto drop;
if (vlan_tx_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
if (vlan_tx_nonzero_tag_present(skb))
skb->pkt_type = PACKET_OTHERHOST;
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
pt_prev = ptype;
}
}
if (pt_prev) {
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
tsk_restore_flags(current, pflags, PF_MEMALLOC);
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu, ret;
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
return __netif_receive_skb(skb);
/* Network device is going away, flush any packets still pending
* Called with irqs disabled.
*/
static void flush_backlog(void *arg)
struct net_device *dev = arg;
struct sk_buff *skb, *tmp;
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
input_queue_head_incr(sd);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev == dev) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
static int napi_gro_complete(struct sk_buff *skb)
{
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct list_head *head = &offload_base;
if (NAPI_GRO_CB(skb)->count == 1) {
skb_shinfo(skb)->gso_size = 0;
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || !ptype->callbacks.gro_complete)
err = ptype->callbacks.gro_complete(skb);
break;
}
rcu_read_unlock();
if (err) {
WARN_ON(&ptype->list == head);
kfree_skb(skb);
return NET_RX_SUCCESS;
}
out:
return netif_receive_skb(skb);
}
/* napi->gro_list contains packets ordered by age.
* youngest packets at the head of it.
* Complete skbs in reverse order to reduce latencies.
*/
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
/* scan list and build reverse chain */
for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
skb->prev = prev;
prev = skb;
}
for (skb = prev; skb; skb = prev) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
prev = skb->prev;
}
napi->gro_list = NULL;
}
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct list_head *head = &offload_base;
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
if (skb_is_gso(skb) || skb_has_frag_list(skb))
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || !ptype->callbacks.gro_receive)
skb_set_network_header(skb, skb_gro_offset(skb));
mac_len = skb->network_header - skb->mac_header;
skb->mac_len = mac_len;
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
}
rcu_read_unlock();
if (&ptype->list == head)
goto normal;
same_flow = NAPI_GRO_CB(skb)->same_flow;
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
struct sk_buff *nskb = *pp;
*pp = nskb->next;
nskb->next = NULL;
napi_gro_complete(nskb);
napi->gro_count--;
if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
napi->gro_count++;
NAPI_GRO_CB(skb)->count = 1;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
if (skb_headlen(skb) < skb_gro_offset(skb)) {
int grow = skb_gro_offset(skb) - skb_headlen(skb);
BUG_ON(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
skb->tail += grow;
skb->data_len -= grow;
skb_shinfo(skb)->frags[0].page_offset += grow;
skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
memmove(skb_shinfo(skb)->frags,
skb_shinfo(skb)->frags + 1,
--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
ret = GRO_NORMAL;
goto pull;
__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
unsigned int maclen = skb->dev->hard_header_len;
for (p = napi->gro_list; p; p = p->next) {
unsigned long diffs;
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
skb_gro_mac_header(skb));
else if (!diffs)
diffs = memcmp(skb_mac_header(p),
skb_gro_mac_header(skb),
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
NAPI_GRO_CB(p)->flush = 0;
}
return dev_gro_receive(napi, skb);
}
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
switch (ret) {
case GRO_NORMAL:
if (netif_receive_skb(skb))
ret = GRO_DROP;
break;
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
kmem_cache_free(skbuff_head_cache, skb);
else
__kfree_skb(skb);
case GRO_HELD:
case GRO_MERGED:
break;
return ret;
static void skb_gro_reset_offset(struct sk_buff *skb)
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
NAPI_GRO_CB(skb)->data_offset = 0;
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
if (skb->mac_header == skb->tail &&
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
skb_gro_reset_offset(skb);
return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
if (!skb) {
skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
if (skb)
napi->skb = skb;
EXPORT_SYMBOL(napi_get_frags);
static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
gro_result_t ret)
switch (ret) {
case GRO_NORMAL:
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_HELD)
skb_gro_pull(skb, -ETH_HLEN);
else if (netif_receive_skb(skb))
ret = GRO_DROP;
case GRO_DROP:
case GRO_MERGED_FREE:
napi_reuse_skb(napi, skb);
break;
case GRO_MERGED:
break;
return ret;
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
struct ethhdr *eth;
unsigned int hlen;
unsigned int off;
napi->skb = NULL;
skb_reset_mac_header(skb);
skb_gro_reset_offset(skb);
off = skb_gro_offset(skb);
hlen = off + sizeof(*eth);
eth = skb_gro_header_fast(skb, off);
if (skb_gro_header_hard(skb, hlen)) {
eth = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!eth)) {
napi_reuse_skb(napi, skb);
skb = NULL;
goto out;
}
}
skb_gro_pull(skb, sizeof(*eth));
/*
* This works because the only protocols we care about don't require
* special handling. We'll fix it up properly at the end.
*/
skb->protocol = eth->h_proto;
out:
return skb;
}
gro_result_t napi_gro_frags(struct napi_struct *napi)
struct sk_buff *skb = napi_frags_skb(napi);
return GRO_DROP;
return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
}
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
/*
* net_rps_action sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
*/
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
if (remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
while (remsd) {
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
__smp_call_function_single(remsd->cpu,
&remsd->csd, 0);
remsd = next;
}
} else
#endif
local_irq_enable();
}
static int process_backlog(struct napi_struct *napi, int quota)
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
#ifdef CONFIG_RPS
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
if (sd->rps_ipi_list) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
#endif
napi->weight = weight_p;
local_irq_disable();
while (work < quota) {
unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
__netif_receive_skb(skb);
local_irq_disable();
input_queue_head_incr(sd);
if (++work >= quota) {
local_irq_enable();
return work;
}
qlen = skb_queue_len(&sd->input_pkt_queue);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
if (qlen < quota - work) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set on backlog.
* we can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&napi->poll_list);
napi->state = 0;
quota = work + qlen;
}
}
local_irq_enable();
return work;
}
/**
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(&__get_cpu_var(softnet_data), n);
local_irq_restore(flags);
EXPORT_SYMBOL(__napi_schedule);
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
void __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
BUG_ON(n->gro_list);
list_del(&n->poll_list);
smp_mb__before_clear_bit();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
void napi_complete(struct napi_struct *n)
{
unsigned long flags;
/*
* don't let napi dequeue from the cpu poll list
* just in case its running on a different cpu
*/
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
return;
local_irq_save(flags);
__napi_complete(n);
local_irq_restore(flags);
}