Newer
Older
skb_reset_mac_header(skb);

Arnaldo Carvalho de Melo
committed
skb->mac_len = skb->network_header - skb->mac_header;
__skb_pull(skb, skb->mac_len);
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
skb_warn_bad_offload(skb);
if (skb_header_cloned(skb) &&
(err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
return ERR_PTR(err);
}
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
err = ptype->gso_send_check(skb);
segs = ERR_PTR(err);
if (err || skb_gso_ok(skb, features))
break;
__skb_push(skb, (skb->data -
skb_network_header(skb)));
segs = ptype->gso_segment(skb, features);
break;
}
}
rcu_read_unlock();
__skb_push(skb, skb->data - skb_mac_header(skb));
return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
dump_stack();
}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* Actually, we should eliminate this check as soon as we know, that:
* 1. IOMMU is present and allows to map all the memory.
* 2. No high memory really exists on this machine.
*/
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (PageHighMem(skb_frag_page(frag)))
if (PCI_DMA_BUS_IS_PHYS) {
struct device *pdev = dev->dev.parent;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
dma_addr_t addr = page_to_phys(skb_frag_page(frag));
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
return 1;
}
}
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
struct dev_gso_cb {
void (*destructor)(struct sk_buff *skb);
};
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
struct dev_gso_cb *cb;
do {
struct sk_buff *nskb = skb->next;
skb->next = nskb->next;
nskb->next = NULL;
kfree_skb(nskb);
} while (skb->next);
cb = DEV_GSO_CB(skb);
if (cb->destructor)
cb->destructor(skb);
}
/**
* dev_gso_segment - Perform emulated hardware segmentation on skb.
* @skb: buffer to segment
* @features: device features as applicable to this skb
*
* This function segments the given skb and stores the list of segments
* in skb->next.
*/
static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
segs = skb_gso_segment(skb, features);
/* Verifying header integrity only. */
if (!segs)
return 0;
return PTR_ERR(segs);
skb->next = segs;
DEV_GSO_CB(skb)->destructor = skb->destructor;
skb->destructor = dev_gso_skb_destructor;
return 0;
}
static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
{
return ((features & NETIF_F_GEN_CSUM) ||
((features & NETIF_F_V4_CSUM) &&
protocol == htons(ETH_P_IP)) ||
((features & NETIF_F_V6_CSUM) &&
protocol == htons(ETH_P_IPV6)) ||
((features & NETIF_F_FCOE_CRC) &&
protocol == htons(ETH_P_FCOE)));
}
static netdev_features_t harmonize_features(struct sk_buff *skb,
__be16 protocol, netdev_features_t features)
if (!can_checksum_protocol(features, protocol)) {
features &= ~NETIF_F_ALL_CSUM;
features &= ~NETIF_F_SG;
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
return features;
}
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
netdev_features_t features = skb->dev->features;
if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
features &= ~NETIF_F_GSO_MASK;
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
} else if (!vlan_tx_tag_present(skb)) {
return harmonize_features(skb, protocol, features);
}
features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
if (protocol != htons(ETH_P_8021Q)) {
return harmonize_features(skb, protocol, features);
} else {
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
return harmonize_features(skb, protocol, features);
}
EXPORT_SYMBOL(netif_skb_features);
/*
* Returns true if either:
* 1. skb has frag_list and the device doesn't support FRAGLIST, or
* 2. skb is fragmented and the device does not support SG, or if
* at least one of fragments is in highmem and device does not
* support DMA from it.
*/
static inline int skb_needs_linearize(struct sk_buff *skb,
int features)
return skb_is_nonlinear(skb) &&
((skb_has_frag_list(skb) &&
!(features & NETIF_F_FRAGLIST)) ||
(skb_shinfo(skb)->nr_frags &&
!(features & NETIF_F_SG)));
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
const struct net_device_ops *ops = dev->netdev_ops;
int rc = NETDEV_TX_OK;
unsigned int skb_len;
netdev_features_t features;
* If device doesn't need skb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
!(features & NETIF_F_HW_VLAN_TX)) {
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
if (unlikely(!skb))
goto out;
skb->vlan_tci = 0;
}
if (netif_needs_gso(skb, features)) {
if (unlikely(dev_gso_segment(skb, features)))
goto out_kfree_skb;
if (skb->next)
goto gso;
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (!(features & NETIF_F_ALL_CSUM) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
skb_len = skb->len;
rc = ops->ndo_start_xmit(skb, dev);
trace_net_dev_xmit(skb, rc, dev, skb_len);

Patrick McHardy
committed
if (rc == NETDEV_TX_OK)
do {
struct sk_buff *nskb = skb->next;
skb->next = nskb->next;
nskb->next = NULL;
* If device doesn't need nskb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(nskb);
skb_len = nskb->len;
rc = ops->ndo_start_xmit(nskb, dev);
trace_net_dev_xmit(nskb, rc, dev, skb_len);

Patrick McHardy
committed
if (unlikely(rc != NETDEV_TX_OK)) {
if (rc & ~NETDEV_TX_MASK)
goto out_kfree_gso_skb;
nskb->next = skb->next;
skb->next = nskb;
return rc;
}
if (unlikely(netif_xmit_stopped(txq) && skb->next))
out_kfree_gso_skb:
if (likely(skb->next == NULL))
skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
kfree_skb(skb);
return rc;
/*
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
unsigned int num_tx_queues)
u16 qoffset = 0;
u16 qcount = num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
while (unlikely(hash >= num_tx_queues))
hash -= num_tx_queues;
return hash;
}
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
qoffset = dev->tc_to_txq[tc].offset;
qcount = dev->tc_to_txq[tc].count;
}
if (skb->sk && skb->sk->sk_hash)
return (u16) (((u64) hash * qcount) >> 32) + qoffset;
EXPORT_SYMBOL(__skb_tx_hash);
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
if (unlikely(queue_index >= dev->real_num_tx_queues)) {
net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
dev->name, queue_index,
dev->real_num_tx_queues);
return 0;
}
return queue_index;
}
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
struct xps_dev_maps *dev_maps;
struct xps_map *map;
int queue_index = -1;
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
map = rcu_dereference(
dev_maps->cpu_map[raw_smp_processor_id()]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
else {
u32 hash;
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
else
hash = (__force u16) skb->protocol ^
skb->rxhash;
hash = jhash_1word(hash, hashrnd);
queue_index = map->queues[
((u64)hash * map->len) >> 32];
}
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
}
rcu_read_unlock();
return queue_index;
#else
return -1;
#endif
}
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (dev->real_num_tx_queues == 1)
queue_index = 0;
else if (ops->ndo_select_queue) {
queue_index = ops->ndo_select_queue(dev, skb);
queue_index = dev_cap_txqueue(dev, queue_index);
} else {
struct sock *sk = skb->sk;
queue_index = sk_tx_queue_get(sk);
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
int old_index = queue_index;
queue_index = get_xps_queue(dev, skb);
if (queue_index < 0)
queue_index = skb_tx_hash(dev, skb);
if (queue_index != old_index && sk) {
struct dst_entry *dst =
rcu_dereference_check(sk->sk_dst_cache, 1);
if (dst && skb_dst(skb) == dst)
sk_tx_queue_set(sk, queue_index);
}
skb_set_queue_mapping(skb, queue_index);
return netdev_get_tx_queue(dev, queue_index);
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
qdisc_skb_cb(skb)->pkt_len = skb->len;
qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
* and dequeue packets faster.
*/
if (unlikely(contended))
spin_lock(&q->busylock);
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
kfree_skb(skb);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
}
}
spin_unlock(root_lock);
if (unlikely(contended))
spin_unlock(&q->busylock);
#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
static void skb_update_prio(struct sk_buff *skb)
{
struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
if (!skb->priority && skb->sk && map) {
unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
if (prioidx < map->priomap_len)
skb->priority = map->priomap[prioidx];
}
}
#else
#define skb_update_prio(skb)
#endif
static DEFINE_PER_CPU(int, xmit_recursion);
/**
* dev_loopback_xmit - loop back @skb
* @skb: buffer to transmit
*/
int dev_loopback_xmit(struct sk_buff *skb)
{
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
skb->ip_summed = CHECKSUM_UNNECESSARY;
WARN_ON(!skb_dst(skb));
skb_dst_force(skb);
netif_rx_ni(skb);
return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
skb_update_prio(skb);
txq = dev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq->qdisc);
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
rc = __dev_xmit_skb(skb, q, dev, txq);
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that netif_tx_lock protection is necessary
here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq->xmit_lock_owner != cpu) {
if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
goto recursion_alert;
HARD_TX_LOCK(dev, txq, cpu);
rc = dev_hard_start_xmit(skb, dev, txq);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
HARD_TX_UNLOCK(dev, txq);
net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
dev->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately
*/
recursion_alert:
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
dev->name);
/*=======================================================================
Receiver routines
=======================================================================*/
int netdev_max_backlog __read_mostly = 1000;
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
* __skb_get_rxhash: calculate a flow hash based on src/dst addresses
* and src/dst port numbers. Sets rxhash in skb to non-zero hash value
* on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
* if hash is a canonical 4-tuple hash over transport ports.
void __skb_get_rxhash(struct sk_buff *skb)
struct flow_keys keys;
u32 hash;
if (!skb_flow_dissect(skb, &keys))
return;
if (keys.ports) {
if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
swap(keys.port16[0], keys.port16[1]);
skb->l4_rxhash = 1;
/* get a consistent hash (same value on both flow directions) */
if ((__force u32)keys.dst < (__force u32)keys.src)
swap(keys.dst, keys.src);
hash = jhash_3words((__force u32)keys.dst,
(__force u32)keys.src,
(__force u32)keys.ports, hashrnd);
if (!hash)
hash = 1;
skb->rxhash = hash;
}
EXPORT_SYMBOL(__skb_get_rxhash);
#ifdef CONFIG_RPS
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
struct static_key rps_needed __read_mostly;
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
u32 flow_id;
u16 rxq_index;
int rc;
/* Should we steer this flow to a different hardware queue? */
if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
!(dev->features & NETIF_F_NTUPLE))
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
goto out;
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
if (rxq_index == skb_get_rx_queue(skb))
goto out;
rxqueue = dev->_rx + rxq_index;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
flow_id = skb->rxhash & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];
rflow->filter = rc;
if (old_rflow->filter == rflow->filter)
old_rflow->filter = RPS_NO_FILTER;
out:
#endif
rflow->last_qtail =
per_cpu(softnet_data, next_cpu).input_queue_head;
rflow->cpu = next_cpu;
return rflow;
}
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
* rcu_read_lock must be held on entry.
*/
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u16 tcpu;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
"of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues);
goto done;
}
rxqueue = dev->_rx + index;
} else
rxqueue = dev->_rx;
map = rcu_dereference(rxqueue->rps_map);
if (map) {
!rcu_access_pointer(rxqueue->rps_flow_table)) {
tcpu = map->cpus[0];
if (cpu_online(tcpu))
cpu = tcpu;
goto done;
}
} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
skb_reset_network_header(skb);
if (!skb_get_rxhash(skb))
goto done;
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
flow_table = rcu_dereference(rxqueue->rps_flow_table);
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
u16 next_cpu;
struct rps_dev_flow *rflow;
rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
tcpu = rflow->cpu;
next_cpu = sock_flow_table->ents[skb->rxhash &
sock_flow_table->mask];
/*
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
* - Current CPU is unset (equal to RPS_NO_CPU).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
* This guarantees that all previous packets for the flow
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) &&
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0))
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
}
}
tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
goto done;
}
}
done:
return cpu;
}
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
#ifdef CONFIG_RFS_ACCEL
/**
* rps_may_expire_flow - check whether an RFS hardware filter may be removed
* @dev: Device on which the filter was set
* @rxq_index: RX queue index
* @flow_id: Flow ID passed to ndo_rx_flow_steer()
* @filter_id: Filter ID returned by ndo_rx_flow_steer()
*
* Drivers that implement ndo_rx_flow_steer() should periodically call
* this function for each installed filter and remove the filters for
* which it returns %true.
*/
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = ACCESS_ONCE(rflow->cpu);
if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
((int)(per_cpu(softnet_data, cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
expire = false;
}
rcu_read_unlock();
return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);
#endif /* CONFIG_RFS_ACCEL */
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
* If no, return 0
*/
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
struct softnet_data *mysd = &__get_cpu_var(softnet_data);
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
return 1;
}
#endif /* CONFIG_RPS */
return 0;
}
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
if (skb_queue_len(&sd->input_pkt_queue)) {
input_queue_tail_incr_save(sd, qtail);
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
preempt_disable();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu < 0)
cpu = smp_processor_id();
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
int netif_rx_ni(struct sk_buff *skb)
{
int err;