Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the persistent config for a NAPI by passing an
index when calling netif_napi_add_config.
napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted).
Drivers which call netif_napi_add_config will have persistent per-NAPI
settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
Per-NAPI settings are saved in napi_disable and restored in napi_enable.
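For illustration, a driver opt-in could look roughly like the sketch
below (example_priv, ring, num_rings, and example_poll are hypothetical
names, not taken from a real driver):

	static int example_open(struct net_device *dev)
	{
		struct example_priv *priv = netdev_priv(dev);
		int i;

		/* The index ties each NAPI to a slot in dev->napi_config,
		 * so its NAPI ID and settings survive disable/enable
		 * cycles.
		 */
		for (i = 0; i < priv->num_rings; i++)
			netif_napi_add_config(dev, &priv->ring[i].napi,
					      example_poll, i);

		return 0;
	}

Drivers that keep calling netif_napi_add() are unaffected and retain the
existing behavior.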
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
---
.../networking/net_cachelines/net_device.rst | 1 +
include/linux/netdevice.h | 36 ++++++++-
net/core/dev.c | 80 +++++++++++++++++--
net/core/dev.h | 12 +++
4 files changed, 119 insertions(+), 10 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 67910ea49160..db6192b2bb50 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -186,6 +186,7 @@ struct dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
u64 max_pacing_offload_horizon
+struct_napi_config* napi_config
unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
=================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93241d4de437..8feaca12655e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@ struct gro_list {
*/
#define GRO_HASH_BUCKETS 8
+/*
+ * Structure for per-NAPI config
+ */
+struct napi_config {
+ u64 gro_flush_timeout;
+ u32 defer_hard_irqs;
+ unsigned int napi_id;
+};
+
/*
* Structure for NAPI scheduling similar to tasklet but with weighting
*/
@@ -379,6 +388,8 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
int irq;
+ int index;
+ struct napi_config *config;
};
enum {
@@ -1868,9 +1879,6 @@ enum netdev_reg_state {
* allocated at register_netdev() time
* @real_num_rx_queues: Number of RX queues currently active in device
* @xdp_prog: XDP sockets filter program pointer
- * @gro_flush_timeout: timeout for GRO layer in NAPI
- * @napi_defer_hard_irqs: If not zero, provides a counter that would
- * allow to avoid NIC hard IRQ, on busy queues.
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
@@ -2020,6 +2028,11 @@ enum netdev_reg_state {
* where the clock is recovered.
*
* @max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ * @napi_config: An array of napi_config structures containing per-NAPI
+ * settings.
+ * @gro_flush_timeout: timeout for GRO layer in NAPI
+ * @napi_defer_hard_irqs: If not zero, provides a counter that would
+ * allow to avoid NIC hard IRQ, on busy queues.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
@@ -2413,6 +2426,7 @@ struct net_device {
struct dim_irq_moder *irq_moder;
u64 max_pacing_offload_horizon;
+ struct napi_config *napi_config;
unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
@@ -2678,6 +2692,22 @@ netif_napi_add_tx_weight(struct net_device *dev,
netif_napi_add_weight(dev, napi, poll, weight);
}
+/**
+ * netif_napi_add_config - initialize a NAPI context with persistent config
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int index)
+{
+ napi->index = index;
+ napi->config = &dev->napi_config[index];
+ netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
/**
* netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
* @dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index e21ace3551d5..c682173a7642 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6505,6 +6505,23 @@ EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->napi_id = napi_id;
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ spin_lock(&napi_hash_lock);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock(&napi_hash_lock);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6517,10 +6534,8 @@ static void napi_hash_add(struct napi_struct *napi)
if (unlikely(++napi_gen_id < MIN_NAPI_ID))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
spin_unlock(&napi_hash_lock);
}
@@ -6643,6 +6658,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us. It will be saved to the config
+ * in napi_disable.
+ */
+ if (n->config->napi_id)
+ napi_hash_add_with_id(n, n->config->napi_id);
+ else
+ napi_hash_add(n);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->napi_id = n->napi_id;
+ napi_hash_del(n);
+}
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6653,8 +6690,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
- napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
- napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -6672,7 +6707,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
@@ -6704,6 +6745,11 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6719,6 +6765,11 @@ void napi_enable(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6748,7 +6799,11 @@ void __netif_napi_del(struct napi_struct *napi)
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
@@ -11085,6 +11140,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -11098,6 +11155,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
+ maxqs = max(txqs, rxqs);
+
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!dev)
@@ -11174,6 +11233,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
@@ -11237,6 +11301,8 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ kvfree(dev->napi_config);
+
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index 7d0aab7e3ef1..7881bced70a9 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -177,11 +177,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
u32 defer)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
}
/**
@@ -217,11 +223,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
unsigned long timeout)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->gro_flush_timeout, timeout);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
}
int rps_cpumask_housekeeping(struct cpumask *mask);
--
2.25.1
Hi,
On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> Reviewed-by: Jakub Kicinski <kuba@kernel.org>
This patch triggers a lock inversion message on pcnet Ethernet adapters.
========================================================
WARNING: possible irq lock inversion dependency detected
6.12.0-08446-g228a1157fb9f #1 Tainted: G N
The problem seems obvious - napi_hash_lock and the local driver lock are
acquired in different order depending on the call sequence. Unfortunately
I have no idea how to fix the problem or I'd submit a patch.
Complete backtrace and bisect log attached. Bisect was run with qemu on
riscv64, but the architecture/platform does not really matter.
Please let me know if there is anything I can do to help resolve the
problem.
Thanks,
Guenter
---
[ 13.251894] ========================================================
[ 13.252024] WARNING: possible irq lock inversion dependency detected
[ 13.252307] 6.12.0-08446-g228a1157fb9f #1 Tainted: G N
[ 13.252472] --------------------------------------------------------
[ 13.252569] ip/1816 just changed the state of lock:
[ 13.252678] ffffffff81dec490 (napi_hash_lock){+...}-{3:3}, at: napi_disable+0xf8/0x10c
[ 13.253497] but this lock was taken by another, HARDIRQ-safe lock in the past:
[ 13.253637] (&lp->lock){-.-.}-{3:3}
[ 13.253682]
[ 13.253682]
[ 13.253682] and interrupts could create inverse lock ordering between them.
[ 13.253682]
[ 13.253923]
[ 13.253923] other info that might help us debug this:
[ 13.254082] Possible interrupt unsafe locking scenario:
[ 13.254082]
[ 13.254186]        CPU0                    CPU1
[ 13.254264]        ----                    ----
[ 13.254340]   lock(napi_hash_lock);
[ 13.254438]                                local_irq_disable();
[ 13.254532]                                lock(&lp->lock);
[ 13.254649]                                lock(napi_hash_lock);
[ 13.254772]   <Interrupt>
[ 13.254828]     lock(&lp->lock);
[ 13.254921]
[ 13.254921] *** DEADLOCK ***
[ 13.254921]
[ 13.255049] 1 lock held by ip/1816:
[ 13.255127] #0: ffffffff81dece50 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
[ 13.255398]
[ 13.255398] the shortest dependencies between 2nd lock and 1st lock:
[ 13.255593] -> (&lp->lock){-.-.}-{3:3} ops: 75 {
[ 13.255802] IN-HARDIRQ-W at:
[ 13.255910] __lock_acquire+0xa3e/0x2158
[ 13.256055] lock_acquire.part.0+0xba/0x21e
[ 13.256153] lock_acquire+0x44/0x5a
[ 13.256241] _raw_spin_lock+0x2c/0x40
[ 13.256343] pcnet32_interrupt+0x3c/0x200
[ 13.256442] __handle_irq_event_percpu+0xa0/0x2e0
[ 13.256547] handle_irq_event+0x3c/0x8a
[ 13.256640] handle_fasteoi_irq+0x9c/0x1d2
[ 13.256738] generic_handle_domain_irq+0x1c/0x2a
[ 13.256840] plic_handle_irq+0x7e/0xfc
[ 13.256937] generic_handle_domain_irq+0x1c/0x2a
[ 13.257041] riscv_intc_irq+0x26/0x60
[ 13.257133] handle_riscv_irq+0x4a/0x74
[ 13.257228] call_on_irq_stack+0x32/0x40
[ 13.257349] IN-SOFTIRQ-W at:
[ 13.257420] __lock_acquire+0x40a/0x2158
[ 13.257515] lock_acquire.part.0+0xba/0x21e
[ 13.257611] lock_acquire+0x44/0x5a
[ 13.257699] _raw_spin_lock_irqsave+0x3a/0x64
[ 13.257798] pcnet32_poll+0x2ac/0x768
[ 13.257892] __napi_poll.constprop.0+0x26/0x128
[ 13.257997] net_rx_action+0x186/0x30e
[ 13.258090] handle_softirqs+0x110/0x4a2
[ 13.258187] __irq_exit_rcu+0xe2/0x10c
[ 13.258279] irq_exit_rcu+0xc/0x36
[ 13.258368] handle_riscv_irq+0x64/0x74
[ 13.258463] call_on_irq_stack+0x32/0x40
[ 13.258558] INITIAL USE at:
[ 13.258629] __lock_acquire+0x46c/0x2158
[ 13.258723] lock_acquire.part.0+0xba/0x21e
[ 13.258820] lock_acquire+0x44/0x5a
[ 13.258909] _raw_spin_lock_irqsave+0x3a/0x64
[ 13.259007] pcnet32_get_stats+0x2a/0x62
[ 13.259101] dev_get_stats+0xc4/0x2a6
[ 13.259380] rtnl_fill_stats+0x32/0xee
[ 13.259480] rtnl_fill_ifinfo.isra.0+0x648/0x141c
[ 13.259589] rtmsg_ifinfo_build_skb+0x98/0xf0
[ 13.259711] rtmsg_ifinfo+0x36/0x78
[ 13.259799] register_netdevice+0x758/0x7a8
[ 13.259905] register_netdev+0x18/0x2e
[ 13.259999] pcnet32_probe1+0xb96/0x103e
[ 13.260099] pcnet32_probe_pci+0xcc/0x12e
[ 13.260196] pci_device_probe+0x82/0x100
[ 13.260291] really_probe+0x86/0x234
[ 13.260383] __driver_probe_device+0x5c/0xda
[ 13.260482] driver_probe_device+0x2c/0xb2
[ 13.260578] __driver_attach+0x70/0x120
[ 13.260671] bus_for_each_dev+0x60/0xae
[ 13.260764] driver_attach+0x1a/0x22
[ 13.260853] bus_add_driver+0xce/0x1d6
[ 13.260951] driver_register+0x3e/0xd8
[ 13.261048] __pci_register_driver+0x5c/0x66
[ 13.261149] pcnet32_init_module+0x58/0x14a
[ 13.261255] do_one_initcall+0x7e/0x2b6
[ 13.261352] kernel_init_freeable+0x2cc/0x33e
[ 13.261456] kernel_init+0x1e/0x11a
[ 13.261550] ret_from_fork+0xe/0x18
[ 13.261653] }
[ 13.261705] ... key at: [<ffffffff82a9efb0>] __key.4+0x0/0x10
[ 13.261839] ... acquired at:
[ 13.261907] lock_acquire.part.0+0xba/0x21e
[ 13.261987] lock_acquire+0x44/0x5a
[ 13.262056] _raw_spin_lock+0x2c/0x40
[ 13.262132] napi_hash_add+0x26/0xb8
[ 13.262207] napi_enable+0x10e/0x124
[ 13.262281] pcnet32_open+0x3c2/0x6b8
[ 13.262357] __dev_open+0xba/0x158
[ 13.262427] __dev_change_flags+0x19a/0x214
[ 13.262508] dev_change_flags+0x1e/0x56
[ 13.262583] do_setlink.isra.0+0x20c/0xcc2
[ 13.262661] rtnl_newlink+0x592/0x74c
[ 13.262731] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.262807] netlink_rcv_skb+0x44/0xf4
[ 13.262882] rtnetlink_rcv+0x14/0x1c
[ 13.262956] netlink_unicast+0x1a0/0x23e
[ 13.263031] netlink_sendmsg+0x17e/0x38e
[ 13.263109] __sock_sendmsg+0x40/0x7a
[ 13.263182] ____sys_sendmsg+0x18e/0x1de
[ 13.263288] ___sys_sendmsg+0x82/0xc6
[ 13.263360] __sys_sendmsg+0x8e/0xe0
[ 13.263430] __riscv_sys_sendmsg+0x16/0x1e
[ 13.263507] do_trap_ecall_u+0x1b6/0x1e2
[ 13.263583] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.263685]
[ 13.263746] -> (napi_hash_lock){+...}-{3:3} ops: 2 {
[ 13.263891] HARDIRQ-ON-W at:
[ 13.263963] __lock_acquire+0x688/0x2158
[ 13.264057] lock_acquire.part.0+0xba/0x21e
[ 13.264153] lock_acquire+0x44/0x5a
[ 13.264240] _raw_spin_lock+0x2c/0x40
[ 13.264330] napi_disable+0xf8/0x10c
[ 13.264419] pcnet32_close+0x5c/0x126
[ 13.264511] __dev_close_many+0x8a/0xf8
[ 13.264609] __dev_change_flags+0x176/0x214
[ 13.264708] dev_change_flags+0x1e/0x56
[ 13.264800] do_setlink.isra.0+0x20c/0xcc2
[ 13.264900] rtnl_newlink+0x592/0x74c
[ 13.264988] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.265082] netlink_rcv_skb+0x44/0xf4
[ 13.265174] rtnetlink_rcv+0x14/0x1c
[ 13.265268] netlink_unicast+0x1a0/0x23e
[ 13.265367] netlink_sendmsg+0x17e/0x38e
[ 13.265466] __sock_sendmsg+0x40/0x7a
[ 13.265557] ____sys_sendmsg+0x18e/0x1de
[ 13.265649] ___sys_sendmsg+0x82/0xc6
[ 13.265738] __sys_sendmsg+0x8e/0xe0
[ 13.265828] __riscv_sys_sendmsg+0x16/0x1e
[ 13.265928] do_trap_ecall_u+0x1b6/0x1e2
[ 13.266024] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.266134] INITIAL USE at:
[ 13.266206] __lock_acquire+0x46c/0x2158
[ 13.266298] lock_acquire.part.0+0xba/0x21e
[ 13.266394] lock_acquire+0x44/0x5a
[ 13.266480] _raw_spin_lock+0x2c/0x40
[ 13.266572] napi_hash_add+0x26/0xb8
[ 13.266660] napi_enable+0x10e/0x124
[ 13.266748] pcnet32_open+0x3c2/0x6b8
[ 13.266839] __dev_open+0xba/0x158
[ 13.266930] __dev_change_flags+0x19a/0x214
[ 13.267026] dev_change_flags+0x1e/0x56
[ 13.267116] do_setlink.isra.0+0x20c/0xcc2
[ 13.267211] rtnl_newlink+0x592/0x74c
[ 13.267299] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.267395] netlink_rcv_skb+0x44/0xf4
[ 13.267489] rtnetlink_rcv+0x14/0x1c
[ 13.267578] netlink_unicast+0x1a0/0x23e
[ 13.267670] netlink_sendmsg+0x17e/0x38e
[ 13.267763] __sock_sendmsg+0x40/0x7a
[ 13.267851] ____sys_sendmsg+0x18e/0x1de
[ 13.267946] ___sys_sendmsg+0x82/0xc6
[ 13.268034] __sys_sendmsg+0x8e/0xe0
[ 13.268121] __riscv_sys_sendmsg+0x16/0x1e
[ 13.268215] do_trap_ecall_u+0x1b6/0x1e2
[ 13.268309] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.268419] }
[ 13.268460] ... key at: [<ffffffff81dec490>] napi_hash_lock+0x18/0x40
[ 13.268579] ... acquired at:
[ 13.268636] mark_lock+0x5f2/0x88a
[ 13.268705] __lock_acquire+0x688/0x2158
[ 13.268779] lock_acquire.part.0+0xba/0x21e
[ 13.268858] lock_acquire+0x44/0x5a
[ 13.268930] _raw_spin_lock+0x2c/0x40
[ 13.269002] napi_disable+0xf8/0x10c
[ 13.269073] pcnet32_close+0x5c/0x126
[ 13.269145] __dev_close_many+0x8a/0xf8
[ 13.269220] __dev_change_flags+0x176/0x214
[ 13.269298] dev_change_flags+0x1e/0x56
[ 13.269371] do_setlink.isra.0+0x20c/0xcc2
[ 13.269449] rtnl_newlink+0x592/0x74c
[ 13.269520] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.269596] netlink_rcv_skb+0x44/0xf4
[ 13.269671] rtnetlink_rcv+0x14/0x1c
[ 13.269742] netlink_unicast+0x1a0/0x23e
[ 13.269818] netlink_sendmsg+0x17e/0x38e
[ 13.269894] __sock_sendmsg+0x40/0x7a
[ 13.269966] ____sys_sendmsg+0x18e/0x1de
[ 13.270041] ___sys_sendmsg+0x82/0xc6
[ 13.270114] __sys_sendmsg+0x8e/0xe0
[ 13.270187] __riscv_sys_sendmsg+0x16/0x1e
[ 13.270268] do_trap_ecall_u+0x1b6/0x1e2
[ 13.270346] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.270439]
[ 13.270525]
[ 13.270525] stack backtrace:
[ 13.270729] CPU: 0 UID: 0 PID: 1816 Comm: ip Tainted: G N 6.12.0-08446-g228a1157fb9f #1
[ 13.270933] Tainted: [N]=TEST
[ 13.271006] Hardware name: riscv-virtio,qemu (DT)
[ 13.271165] Call Trace:
[ 13.271270] [<ffffffff80006d42>] dump_backtrace+0x1c/0x24
[ 13.271373] [<ffffffff80dc5574>] show_stack+0x2c/0x38
[ 13.271467] [<ffffffff80ddc854>] dump_stack_lvl+0x74/0xac
[ 13.271565] [<ffffffff80ddc8a0>] dump_stack+0x14/0x1c
[ 13.271657] [<ffffffff8008551a>] print_irq_inversion_bug.part.0+0x1aa/0x1fe
[ 13.271775] [<ffffffff800867f8>] mark_lock+0x5f2/0x88a
[ 13.271869] [<ffffffff80087d44>] __lock_acquire+0x688/0x2158
[ 13.271973] [<ffffffff80086e10>] lock_acquire.part.0+0xba/0x21e
[ 13.272078] [<ffffffff80086fb8>] lock_acquire+0x44/0x5a
[ 13.272173] [<ffffffff80de8ea4>] _raw_spin_lock+0x2c/0x40
[ 13.272269] [<ffffffff80b89bf0>] napi_disable+0xf8/0x10c
[ 13.272360] [<ffffffff8099d022>] pcnet32_close+0x5c/0x126
[ 13.272454] [<ffffffff80b8fa52>] __dev_close_many+0x8a/0xf8
[ 13.272549] [<ffffffff80b9790e>] __dev_change_flags+0x176/0x214
[ 13.272647] [<ffffffff80b979ca>] dev_change_flags+0x1e/0x56
[ 13.272740] [<ffffffff80ba93a8>] do_setlink.isra.0+0x20c/0xcc2
[ 13.272838] [<ffffffff80baa3f0>] rtnl_newlink+0x592/0x74c
[ 13.272935] [<ffffffff80babf5a>] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.273032] [<ffffffff80c05bcc>] netlink_rcv_skb+0x44/0xf4
[ 13.273127] [<ffffffff80ba64f6>] rtnetlink_rcv+0x14/0x1c
[ 13.273223] [<ffffffff80c05578>] netlink_unicast+0x1a0/0x23e
[ 13.273326] [<ffffffff80c05794>] netlink_sendmsg+0x17e/0x38e
[ 13.273423] [<ffffffff80b633e2>] __sock_sendmsg+0x40/0x7a
[ 13.273515] [<ffffffff80b63742>] ____sys_sendmsg+0x18e/0x1de
[ 13.273610] [<ffffffff80b65e44>] ___sys_sendmsg+0x82/0xc6
[ 13.273704] [<ffffffff80b662ac>] __sys_sendmsg+0x8e/0xe0
[ 13.273795] [<ffffffff80b66314>] __riscv_sys_sendmsg+0x16/0x1e
[ 13.273894] [<ffffffff80ddd3e4>] do_trap_ecall_u+0x1b6/0x1e2
[ 13.273992] [<ffffffff80de9c7a>] _new_vmalloc_restore_context_a0+0xc2/0xce
---
# bad: [228a1157fb9fec47eb135b51c0202b574e079ebf] Merge tag '6.13-rc-part1-SMB3-client-fixes' of git://git.samba.org/sfrench/cifs-2.6
# good: [43fb83c17ba2d63dfb798f0be7453ed55ca3f9c2] Merge tag 'soc-arm-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc
git bisect start '228a1157fb9f' '43fb83c17ba2'
# bad: [071b34dcf71523a559b6c39f5d21a268a9531b50] Merge tag 'sound-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound
git bisect bad 071b34dcf71523a559b6c39f5d21a268a9531b50
# bad: [31a1f8752f7df7e3d8122054fbef02a9a8bff38f] Merge branch 'phy-mediatek-reorg'
git bisect bad 31a1f8752f7df7e3d8122054fbef02a9a8bff38f
# bad: [de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f] phonet: Pass net and ifindex to rtm_phonet_notify().
git bisect bad de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f
# good: [76d46d766a45e205e59af511efbb24abe22d0b4c] net: emaclite: Adopt clock support
git bisect good 76d46d766a45e205e59af511efbb24abe22d0b4c
# bad: [a37b0e4eca0436ebc17d512d70b1409956340688] ipv6: Use rtnl_register_many().
git bisect bad a37b0e4eca0436ebc17d512d70b1409956340688
# bad: [de306f0051ae947680a13c13a9fd9373d7460bb1] net: gianfar: Use __be64 * to store pointers to big endian values
git bisect bad de306f0051ae947680a13c13a9fd9373d7460bb1
# good: [5c16e118b796e95d6e5c80c5d8af2591262431c9] net: ethernet: ti: am65-cpsw: Use __be64 type for id_temp
git bisect good 5c16e118b796e95d6e5c80c5d8af2591262431c9
# bad: [41936522749654e64531121bbd6a95bab5d56d76] bnxt: Add support for persistent NAPI config
git bisect bad 41936522749654e64531121bbd6a95bab5d56d76
# good: [79636038d37e7bd4d078238f2a3f002cab4423bc] ipv4: tcp: give socket pointer to control skbs
git bisect good 79636038d37e7bd4d078238f2a3f002cab4423bc
# good: [516010460011ae74ac3b7383cf90ed27e2711cd6] netdev-genl: Dump napi_defer_hard_irqs
git bisect good 516010460011ae74ac3b7383cf90ed27e2711cd6
# good: [0137891e74576f77a7901718dc0ce08ca074ae74] netdev-genl: Dump gro_flush_timeout
git bisect good 0137891e74576f77a7901718dc0ce08ca074ae74
# bad: [1287c1ae0fc227e5acef11a539eb4e75646e31c7] netdev-genl: Support setting per-NAPI config values
git bisect bad 1287c1ae0fc227e5acef11a539eb4e75646e31c7
# bad: [86e25f40aa1e9e54e081e55016f65b5c92523989] net: napi: Add napi_config
git bisect bad 86e25f40aa1e9e54e081e55016f65b5c92523989
# first bad commit: [86e25f40aa1e9e54e081e55016f65b5c92523989] net: napi: Add napi_config
On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
> Hi,
>
> On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
> > Add a persistent NAPI config area for NAPI configuration to the core.
> > Drivers opt-in to setting the persistent config for a NAPI by passing an
> > index when calling netif_napi_add_config.
> >
> > napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> > (after the NAPIs are deleted).
> >
> > Drivers which call netif_napi_add_config will have persistent per-NAPI
> > settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
> >
> > Per-NAPI settings are saved in napi_disable and restored in napi_enable.
> >
> > Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > Reviewed-by: Jakub Kicinski <kuba@kernel.org>
>
> This patch triggers a lock inversion message on pcnet Ethernet adapters.
Thanks for the report. I am not familiar with the pcnet driver, but
took some time now to read the report below and the driver code.
I could definitely be reading the output incorrectly (if so please
let me know), but it seems like the issue can be triggered in this
case:
CPU 0:
pcnet32_open
lock(lp->lock)
napi_enable
napi_hash_add
lock(napi_hash_lock)
unlock(napi_hash_lock)
unlock(lp->lock)
Meanwhile on CPU 1:
pcnet32_close
napi_disable
napi_hash_del
lock(napi_hash_lock)
unlock(napi_hash_lock)
lock(lp->lock)
[... other code ...]
unlock(lp->lock)
[... other code ...]
lock(lp->lock)
[... other code ...]
unlock(lp->lock)
In other words: while the close path is holding napi_hash_lock (and
before it acquires lp->lock), the enable path takes lp->lock and
then napi_hash_lock.
It seems this was triggered because before the identified commit,
napi_enable did not call napi_hash_add (and thus did not take the
napi_hash_lock).
So, I agree there is an inversion; I can't say for sure if this
would cause a deadlock in certain situations. It seems like
napi_hash_del in the close path will return, so the inversion
doesn't seem like it'd lead to a deadlock, but I am not an expert in
this and could certainly be wrong.
I wonder if a potential fix for this would be in the driver's close
function?
In pcnet32_open the order is:
lock(lp->lock)
napi_enable
netif_start_queue
mod_timer(watchdog)
unlock(lp->lock)
Perhaps pcnet32_close should be the same?
I've included an example patch below for pcnet32_close and I've CC'd
the maintainer of pcnet32 that is not currently CC'd.
Guenter: Is there any chance you might be able to test the proposed
patch below?
Don: Would you mind taking a look to see if this change is sensible?
Netdev maintainers: at a higher level, I'm not sure how many other
drivers might have locking patterns like this that commit
86e25f40aa1e ("net: napi: Add napi_config") will break in a similar
manner.
Do I:
- comb through drivers trying to identify these, and/or
- do we find a way to implement the identified commit with the
original lock ordering to avoid breaking any other driver?
I'd appreciate guidance/insight from the maintainers on how to best
proceed.
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index 72db9f9e7bee..ff56a308fec9 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -2623,13 +2623,13 @@ static int pcnet32_close(struct net_device *dev)
struct pcnet32_private *lp = netdev_priv(dev);
unsigned long flags;
+ spin_lock_irqsave(&lp->lock, flags);
+
del_timer_sync(&lp->watchdog_timer);
netif_stop_queue(dev);
napi_disable(&lp->napi);
- spin_lock_irqsave(&lp->lock, flags);
-
dev->stats.rx_missed_errors = lp->a->read_csr(ioaddr, 112);
netif_printk(lp, ifdown, KERN_DEBUG, dev,
On 11/27/24 10:51, Joe Damato wrote:
> On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
>> Hi,
>>
>> On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
>>> Add a persistent NAPI config area for NAPI configuration to the core.
>>> Drivers opt-in to setting the persistent config for a NAPI by passing an
>>> index when calling netif_napi_add_config.
>>>
>>> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
>>> (after the NAPIs are deleted).
>>>
>>> Drivers which call netif_napi_add_config will have persistent per-NAPI
>>> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>>>
>>> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>>>
>>> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>> Signed-off-by: Joe Damato <jdamato@fastly.com>
>>> Reviewed-by: Jakub Kicinski <kuba@kernel.org>
>>
>> This patch triggers a lock inversion message on pcnet Ethernet adapters.
>
> Thanks for the report. I am not familiar with the pcnet driver, but
> took some time now to read the report below and the driver code.
>
> I could definitely be reading the output incorrectly (if so please
> let me know), but it seems like the issue can be triggered in this
> case:
>
> CPU 0:
> pcnet32_open
> lock(lp->lock)
> napi_enable
> napi_hash_add
> lock(napi_hash_lock)
> unlock(napi_hash_lock)
> unlock(lp->lock)
>
>
> Meanwhile on CPU 1:
> pcnet32_close
> napi_disable
> napi_hash_del
> lock(napi_hash_lock)
> unlock(napi_hash_lock)
> lock(lp->lock)
> [... other code ...]
> unlock(lp->lock)
> [... other code ...]
> lock(lp->lock)
> [... other code ...]
> unlock(lp->lock)
>
> In other words: while the close path is holding napi_hash_lock (and
> before it acquires lp->lock), the enable path takes lp->lock and
> then napi_hash_lock.
>
> It seems this was triggered because before the identified commit,
> napi_enable did not call napi_hash_add (and thus did not take the
> napi_hash_lock).
>
> So, I agree there is an inversion; I can't say for sure if this
> would cause a deadlock in certain situations. It seems like
> napi_hash_del in the close path will return, so the inversion
> doesn't seem like it'd lead to a deadlock, but I am not an expert in
> this and could certainly be wrong.
>
> I wonder if a potential fix for this would be in the driver's close
> function?
>
> In pcnet32_open the order is:
> lock(lp->lock)
> napi_enable
> netif_start_queue
> mod_timer(watchdog)
> unlock(lp->lock)
>
> Perhaps pcnet32_close should be the same?
>
> I've included an example patch below for pcnet32_close and I've CC'd
> the maintainer of pcnet32 that is not currently CC'd.
>
> Guenter: Is there any chance you might be able to test the proposed
> patch below?
>
I moved the spinlock after del_timer_sync() because it is not a good idea
to hold a spinlock when calling that function. That results in:
[ 10.646956] BUG: sleeping function called from invalid context at net/core/dev.c:6775
[ 10.647142] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1817, name: ip
[ 10.647237] preempt_count: 1, expected: 0
[ 10.647319] 2 locks held by ip/1817:
[ 10.647383] #0: ffffffff81ded990 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
[ 10.647880] #1: ff6000000498ccb0 (&lp->lock){-.-.}-{3:3}, at: pcnet32_close+0x40/0x126
[ 10.648050] irq event stamp: 3720
[ 10.648102] hardirqs last enabled at (3719): [<ffffffff80decaf4>] _raw_spin_unlock_irqrestore+0x54/0x62
[ 10.648204] hardirqs last disabled at (3720): [<ffffffff80dec8a2>] _raw_spin_lock_irqsave+0x5e/0x64
[ 10.648301] softirqs last enabled at (3712): [<ffffffff8001efca>] handle_softirqs+0x3e6/0x4a2
[ 10.648396] softirqs last disabled at (3631): [<ffffffff80ded6cc>] __do_softirq+0x12/0x1a
[ 10.648666] CPU: 0 UID: 0 PID: 1817 Comm: ip Tainted: G N 6.12.0-10313-g7d4050728c83-dirty #1
[ 10.648828] Tainted: [N]=TEST
[ 10.648879] Hardware name: riscv-virtio,qemu (DT)
[ 10.648978] Call Trace:
[ 10.649048] [<ffffffff80006d42>] dump_backtrace+0x1c/0x24
[ 10.649117] [<ffffffff80dc8d94>] show_stack+0x2c/0x38
[ 10.649180] [<ffffffff80de00b0>] dump_stack_lvl+0x74/0xac
[ 10.649246] [<ffffffff80de00fc>] dump_stack+0x14/0x1c
[ 10.649308] [<ffffffff8004da18>] __might_resched+0x23e/0x248
[ 10.649377] [<ffffffff8004da60>] __might_sleep+0x3e/0x62
[ 10.649441] [<ffffffff80b8d370>] napi_disable+0x24/0x10c
[ 10.649506] [<ffffffff809a06fe>] pcnet32_close+0x6c/0x126
...
This is due to might_sleep() at the beginning of napi_disable(). So it doesn't
work as intended, it just replaces one problem with another.
> Don: Would you mind taking a look to see if this change is sensible?
>
> Netdev maintainers: at a higher level, I'm not sure how many other
> drivers might have locking patterns like this that commit
> 86e25f40aa1e ("net: napi: Add napi_config") will break in a similar
> manner.
>
> Do I:
> - comb through drivers trying to identify these, and/or
Coccinelle, checking for napi_enable calls under spinlock, points to:
napi_enable called under spin_lock_irqsave from drivers/net/ethernet/via/via-velocity.c:2325
napi_enable called under spin_lock_irqsave from drivers/net/can/grcan.c:1076
napi_enable called under spin_lock from drivers/net/ethernet/marvell/mvneta.c:4388
napi_enable called under spin_lock_irqsave from drivers/net/ethernet/amd/pcnet32.c:2104
Guenter
> - do we find a way to implement the identified commit with the
> original lock ordering to avoid breaking any other driver?
>
> I'd appreciate guidance/insight from the maintainers on how to best
> proceed.
>
> diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
> index 72db9f9e7bee..ff56a308fec9 100644
> --- a/drivers/net/ethernet/amd/pcnet32.c
> +++ b/drivers/net/ethernet/amd/pcnet32.c
> @@ -2623,13 +2623,13 @@ static int pcnet32_close(struct net_device *dev)
> struct pcnet32_private *lp = netdev_priv(dev);
> unsigned long flags;
>
> + spin_lock_irqsave(&lp->lock, flags);
> +
> del_timer_sync(&lp->watchdog_timer);
>
> netif_stop_queue(dev);
> napi_disable(&lp->napi);
>
> - spin_lock_irqsave(&lp->lock, flags);
> -
> dev->stats.rx_missed_errors = lp->a->read_csr(ioaddr, 112);
>
> netif_printk(lp, ifdown, KERN_DEBUG, dev,
On Wed, Nov 27, 2024 at 01:43:14PM -0800, Guenter Roeck wrote:
> On 11/27/24 10:51, Joe Damato wrote:
> > On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
> > > Hi,
> > >
> > > On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
[...]
> > It seems this was triggered because before the identified commit,
> > napi_enable did not call napi_hash_add (and thus did not take the
> > napi_hash_lock).
> >
> > So, I agree there is an inversion; I can't say for sure if this
> > would cause a deadlock in certain situations. It seems like
> > napi_hash_del in the close path will return, so the inversion
> > doesn't seem like it'd lead to a deadlock, but I am not an expert in
> > this and could certainly be wrong.
> >
> > I wonder if a potential fix for this would be in the driver's close
> > function?
> >
> > In pcnet32_open the order is:
> > lock(lp->lock)
> > napi_enable
> > netif_start_queue
> > mod_timer(watchdog)
> > unlock(lp->lock)
> >
> > Perhaps pcnet32_close should be the same?
> >
> > I've included an example patch below for pcnet32_close and I've CC'd
> > the maintainer of pcnet32 that is not currently CC'd.
> >
> > Guenter: Is there any chance you might be able to test the proposed
> > patch below?
> >
>
> I moved the spinlock after del_timer_sync() because it is not a good idea
> to hold a spinlock when calling that function. That results in:
>
> [ 10.646956] BUG: sleeping function called from invalid context at net/core/dev.c:6775
> [ 10.647142] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1817, name: ip
> [ 10.647237] preempt_count: 1, expected: 0
> [ 10.647319] 2 locks held by ip/1817:
> [ 10.647383] #0: ffffffff81ded990 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
> [ 10.647880] #1: ff6000000498ccb0 (&lp->lock){-.-.}-{3:3}, at: pcnet32_close+0x40/0x126
[...]
> This is due to might_sleep() at the beginning of napi_disable(). So it doesn't
> work as intended, it just replaces one problem with another.
Thanks for testing that. And you are right, it is also not correct.
I will give it some thought to see if I can think of something
better.
Maybe Don will have some ideas.
> > Don: Would you mind taking a look to see if this change is sensible?
> >
> > Netdev maintainers: at a higher level, I'm not sure how many other
> > drivers might have locking patterns like this that commit
> > 86e25f40aa1e ("net: napi: Add napi_config") will break in a similar
> > manner.
> >
> > Do I:
> > - comb through drivers trying to identify these, and/or
>
> Coccinelle, checking for napi_enable calls under spinlock, points to:
>
> napi_enable called under spin_lock_irqsave from drivers/net/ethernet/via/via-velocity.c:2325
> napi_enable called under spin_lock_irqsave from drivers/net/can/grcan.c:1076
> napi_enable called under spin_lock from drivers/net/ethernet/marvell/mvneta.c:4388
> napi_enable called under spin_lock_irqsave from drivers/net/ethernet/amd/pcnet32.c:2104
I checked the 3 cases above other than pcnet32 and they appear to be
false positives to me.
Guenter: would you mind sending me your cocci script? Mostly for
selfish reasons; I'd like to see how you did it so I can learn more
:) Feel free to do so off list if you prefer.
I tried to write my first coccinelle script (which you can find
below) that is probably wrong, but it attempts to detect:
- interrupt routines that hold locks
- in drivers that call napi_enable between a lock/unlock
I couldn't figure out how to get regexps to work in my script, so I
made a couple variants of the script for each of the spin_lock_*
variants and ran them all.
Only one offender was detected: pcnet32.
So, I guess the question to put out there to maintainers / others on
the list is:
- There seems to be at least 1 driver affected (pcnet32). There
might be others, but my simple (and likely incorrect) cocci
script below couldn't detect any with this particular bug shape.
Worth mentioning: there could be other bug shapes that trigger
an inversion that I am currently unaware of.
- As far as I can tell, there are three ways to proceed:
1. Find and fix all drivers which broke (pcnet32 being the only
known driver at this point), or
2. Disable IRQs when taking the lock in napi_hash_del, or
3. Move the napi hash add/remove out of napi enable/disable (a rough
sketch of this option follows below).
Happy to proceed however seems most reasonable to the maintainers,
please let me know.
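For what it's worth, option 3 would roughly amount to tying hash
membership back to netif_napi_add_weight/__netif_napi_del (where it
lived before the identified commit) instead of napi_enable/napi_disable.
An untested sketch, which glosses over how the persistent NAPI IDs in
napi_config would then be restored:

	/* Sketch only: napi_enable()/napi_disable() no longer touch
	 * napi_hash_lock, so drivers can keep calling them under their
	 * own locks. Function bodies are abbreviated.
	 */
	void netif_napi_add_weight(struct net_device *dev,
				   struct napi_struct *napi,
				   int (*poll)(struct napi_struct *, int),
				   int weight)
	{
		/* ... existing initialization ... */
		list_add_rcu(&napi->dev_list, &dev->napi_list);
		napi_hash_add(napi);	/* moved back here from napi_enable() */
		/* ... */
	}

	void __netif_napi_del(struct napi_struct *napi)
	{
		if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
			return;

		napi_hash_del(napi);	/* moved back here from napi_disable() */
		list_del_rcu(&napi->dev_list);
		/* ... */
	}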
My cocci script follows; as noted above I am too much of a noob and
couldn't figure out how to use regexps to match the different
spin_lock* variants, so I simply made multiple versions of this
script for each variant:
virtual report
@napi@
identifier func0;
position p0;
@@
func0(...)
{
...
spin_lock_irqsave(...)
...
napi_enable@p0(...)
...
spin_unlock_irqrestore(...)
...
}
@u@
position p;
identifier func;
typedef irqreturn_t;
@@
irqreturn_t func (...)
{
...
spin_lock@p(...)
...
}
@script:python depends on napi && u@
p << u.p;
func << u.func;
disable << napi.p0;
@@
print("* file: %s irq handler %s takes lock on line %s and calls napi_enable under lock %s" % (p[0].file,func,p[0].line,disable[0].line))
On Wed, Nov 27, 2024 at 05:17:15PM -0800, Joe Damato wrote:
>
> Guenter: would you mind sending me your cocci script? Mostly for
> selfish reasons; I'd like to see how you did it so I can learn more
> :) Feel free to do so off list if you prefer.
>
You mean because it is so clumsy ? :-). No worries, I attached
it below. It is way too complicated, but it was good enough
for a quick hack.
> I tried to write my first coccinelle script (which you can find
> below) that is probably wrong, but it attempts to detect:
> - interrupt routines that hold locks
> - in drivers that call napi_enable between a lock/unlock
>
> I couldn't figure out how to get regexps to work in my script, so I
> made a couple variants of the script for each of the spin_lock_*
> variants and ran them all.
I pretty much did the same, only in one script.
>
> Only one offender was detected: pcnet32.
>
Your script doesn't take into account that the spinlock may have been
released before the call to napi_enable(). Other than that it should
work just fine. I didn't try to track the interrupt handler because
tracking that through multiple functions can be difficult.
Thanks,
Guenter
---
virtual report
@f1@
identifier flags;
position p;
expression e;
@@
<...
spin_lock_irqsave@p(e, flags);
... when != spin_unlock_irqrestore(e, flags);
napi_enable(...);
... when any
spin_unlock_irqrestore(e, flags);
...>
@f2@
position p;
expression e;
@@
<...
spin_lock_irq@p(e);
... when != spin_unlock_irq(e);
napi_enable(...);
... when any
spin_unlock_irq(e);
...>
@f3@
position p;
expression e;
@@
<...
spin_lock@p(e);
... when != spin_unlock(e);
napi_enable(...);
... when any
spin_unlock(e);
...>
@script:python@
p << f1.p;
@@
print("napi_enable called under spin_lock_irqsave from %s:%s" % (p[0].file, p[0].line))
@script:python@
p << f2.p;
@@
print("napi_enable called under spin_lock_irq from %s:%s" % (p[0].file, p[0].line))
@script:python@
p << f3.p;
@@
print("napi_enable called under spin_lock from %s:%s" % (p[0].file, p[0].line))
On Wed, Nov 27, 2024 at 10:51:16AM -0800, Joe Damato wrote:
> On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
> > Hi,
> >
> > On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
> > > Add a persistent NAPI config area for NAPI configuration to the core.
> > > Drivers opt-in to setting the persistent config for a NAPI by passing an
> > > index when calling netif_napi_add_config.
> > >
> > > napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> > > (after the NAPIs are deleted).
> > >
> > > Drivers which call netif_napi_add_config will have persistent per-NAPI
> > > settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
> > >
> > > Per-NAPI settings are saved in napi_disable and restored in napi_enable.
> > >
> > > Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > > Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > Reviewed-by: Jakub Kicinski <kuba@kernel.org>
> >
> > This patch triggers a lock inversion message on pcnet Ethernet adapters.
>
> Thanks for the report. I am not familiar with the pcnet driver, but
> took some time now to read the report below and the driver code.
>
> I could definitely be reading the output incorrectly (if so please
> let me know), but it seems like the issue can be triggered in this
> case:
Sorry, my apologies, I both:
- read the report incorrectly, and
- proposed a bad patch that would result in a deadlock :)
After re-reading it and running this by Martin (who is CC'd), the
inversion is actually:
CPU 0:
pcnet32_open
lock(lp->lock)
napi_enable
napi_hash_add <- before this executes, CPU 1 proceeds
lock(napi_hash_lock)
CPU 1:
pcnet32_close
napi_disable
napi_hash_del
lock(napi_hash_lock)
< INTERRUPT >
pcnet32_interrupt
lock(lp->lock)
This is now an inversion because:
CPU 0: holds lp->lock and is about to take napi_hash_lock
CPU 1: holds napi_hash_lock and an IRQ firing on CPU 1 tries to take
lp->lock (which CPU 0 already holds)
Neither side can proceed:
- CPU 0 is stuck waiting for napi_hash_lock
- CPU 1 is stuck waiting for lp->lock
I think the below explanation is still correct as to why the
identified commit causes the issue:
> It seems this was triggered because before the identified commit,
> napi_enable did not call napi_hash_add (and thus did not take the
> napi_hash_lock).
However, the previous patch I proposed for pcnet32 would also cause
a deadlock as the watchdog timer's function also needs lp->lock.
A corrected patch for pcnet32 can be found below.
Guenter: Sorry, would you mind testing the below instead of the
previous patch?
Don: Let me know what you think about the below?
Netdev maintainers, there is an alternate locking solution I can
propose as an RFC that might avoid this class of problem if this
sort of issue is more widespread than just pcnet32:
- add the NAPI to the hash in netif_napi_add_weight (instead of napi_enable)
- remove the NAPI from the hash in __netif_napi_del (instead of
napi_disable)
If changing the locking order in core is the desired route, then the
patch below should be unnecessary, but:
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index 72db9f9e7bee..2e0077e68883 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -2625,11 +2625,10 @@ static int pcnet32_close(struct net_device *dev)
del_timer_sync(&lp->watchdog_timer);
+ spin_lock_irqsave(&lp->lock, flags);
netif_stop_queue(dev);
napi_disable(&lp->napi);
- spin_lock_irqsave(&lp->lock, flags);
-
dev->stats.rx_missed_errors = lp->a->read_csr(ioaddr, 112);
netif_printk(lp, ifdown, KERN_DEBUG, dev,
On Wed, 27 Nov 2024 12:00:02 -0800 Joe Damato wrote:
> CPU 0:
> pcnet32_open
> lock(lp->lock)
> napi_enable
> napi_hash_add <- before this executes, CPU 1 proceeds
> lock(napi_hash_lock)
> CPU 1:
> pcnet32_close
> napi_disable
> napi_hash_del
> lock(napi_hash_lock)
> < INTERRUPT >

How about making napi_hash_lock irq-safe ?
It's a control path lock, it should be fine to disable irqs.

> pcnet32_interrupt
> lock(lp->lock)
On Sat, Nov 30, 2024 at 12:45:01PM -0800, Jakub Kicinski wrote:
> On Wed, 27 Nov 2024 12:00:02 -0800 Joe Damato wrote:
> > CPU 0:
> > pcnet32_open
> > lock(lp->lock)
> > napi_enable
> > napi_hash_add <- before this executes, CPU 1 proceeds
> > lock(napi_hash_lock)
> > CPU 1:
> > pcnet32_close
> > napi_disable
> > napi_hash_del
> > lock(napi_hash_lock)
> > < INTERRUPT >
>
> How about making napi_hash_lock irq-safe ?
> It's a control path lock, it should be fine to disable irqs.

Ah, right. That should fix it.

I'll write a fix against net and change the napi_hash_lock to use
spin_lock_irqsave and spin_unlock_irqrestore and send shortly.

> > pcnet32_interrupt
> > lock(lp->lock)
>
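For reference, a minimal (untested) sketch of that change in
net/core/dev.c, switching the napi_hash_lock users to the
irqsave/irqrestore variants, could look like this:

	static void napi_hash_add_with_id(struct napi_struct *napi,
					  unsigned int napi_id)
	{
		unsigned long flags;

		spin_lock_irqsave(&napi_hash_lock, flags);
		WARN_ON_ONCE(napi_by_id(napi_id));
		__napi_hash_add_with_id(napi, napi_id);
		spin_unlock_irqrestore(&napi_hash_lock, flags);
	}

	/* napi_hash_add() and napi_hash_del() would need the same
	 * irqsave/irqrestore treatment, so a hard IRQ that takes a driver
	 * lock (e.g. pcnet32's lp->lock) can no longer fire while
	 * napi_hash_lock is held.
	 */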
On 11/27/24 12:00, Joe Damato wrote:
> On Wed, Nov 27, 2024 at 10:51:16AM -0800, Joe Damato wrote:
>> On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
>>> Hi,
>>>
>>> On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
>>>> Add a persistent NAPI config area for NAPI configuration to the core.
>>>> Drivers opt-in to setting the persistent config for a NAPI by passing an
>>>> index when calling netif_napi_add_config.
>>>>
>>>> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
>>>> (after the NAPIs are deleted).
>>>>
>>>> Drivers which call netif_napi_add_config will have persistent per-NAPI
>>>> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>>>>
>>>> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>>>>
>>>> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>>> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>>> Signed-off-by: Joe Damato <jdamato@fastly.com>
>>>> Reviewed-by: Jakub Kicinski <kuba@kernel.org>
>>>
>>> This patch triggers a lock inversion message on pcnet Ethernet adapters.
>>
>> Thanks for the report. I am not familiar with the pcnet driver, but
>> took some time now to read the report below and the driver code.
>>
>> I could definitely be reading the output incorrectly (if so please
>> let me know), but it seems like the issue can be triggered in this
>> case:
>
> Sorry, my apologies, I both:
> - read the report incorrectly, and
> - proposed a bad patch that would result in a deadlock :)
>
> After re-reading it and running this by Martin (who is CC'd), the
> inversion is actually:
>
> CPU 0:
> pcnet32_open
> lock(lp->lock)
> napi_enable
> napi_hash_add <- before this executes, CPU 1 proceeds
> lock(napi_hash_lock)
> CPU 1:
> pcnet32_close
> napi_disable
> napi_hash_del
> lock(napi_hash_lock)
> < INTERRUPT >
> pcnet32_interrupt
> lock(lp->lock)
>
> This is now an inversion because:
>
> CPU 0: holds lp->lock and is about to take napi_hash_lock
> CPU 1: holds napi_hash_lock and an IRQ firing on CPU 1 tries to take
> lp->lock (which CPU 0 already holds)
>
> Neither side can proceed:
> - CPU 0 is stuck waiting for napi_hash_lock
> - CPU 1 is stuck waiting for lp->lock
>
> I think the below explanation is still correct as to why the
> identified commit causes the issue:
>
>> It seems this was triggered because before the identified commit,
>> napi_enable did not call napi_hash_add (and thus did not take the
>> napi_hash_lock).
>
> However, the previous patch I proposed for pcnet32 would also cause
> a deadlock as the watchdog timer's function also needs lp->lock.
>
> A corrected patch for pcnet32 can be found below.
>
> Guenter: Sorry, would you mind testing the below instead of the
> previous patch?
>
> Don: Let me know what you think about the below?
>
> Netdev maintainers, there is an alternate locking solution I can
> propose as an RFC that might avoid this class of problem if this
> sort of issue is more widespread than just pcnet32:
> - add the NAPI to the hash in netif_napi_add_weight (instead of napi_enable)
> - remove the NAPI from the hash in __netif_napi_del (instead of
> napi_disable)
>
> If changing the locking order in core is the desired route, then the
> patch below should be unnecessary, but:
>
> diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
> index 72db9f9e7bee..2e0077e68883 100644
> --- a/drivers/net/ethernet/amd/pcnet32.c
> +++ b/drivers/net/ethernet/amd/pcnet32.c
> @@ -2625,11 +2625,10 @@ static int pcnet32_close(struct net_device *dev)
>
> del_timer_sync(&lp->watchdog_timer);
>
> + spin_lock_irqsave(&lp->lock, flags);
> netif_stop_queue(dev);
> napi_disable(&lp->napi);
>
That is what I did, actually. Problem with that is that napi_disable()
wants to be able to sleep, thus the above triggers:
BUG: sleeping function called from invalid context at net/core/dev.c:6775
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1817, name: ip
preempt_count: 1, expected: 0
2 locks held by ip/1817:
#0: ffffffff81ded990 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
#1: ff6000000498ccb0 (&lp->lock){-.-.}-{3:3}, at: pcnet32_close+0x40/0x126
irq event stamp: 3720
hardirqs last enabled at (3719): [<ffffffff80decaf4>] _raw_spin_unlock_irqrestore+0x54/0x62
hardirqs last disabled at (3720): [<ffffffff80dec8a2>] _raw_spin_lock_irqsave+0x5e/0x64
softirqs last enabled at (3712): [<ffffffff8001efca>] handle_softirqs+0x3e6/0x4a2
softirqs last disabled at (3631): [<ffffffff80ded6cc>] __do_softirq+0x12/0x1a
CPU: 0 UID: 0 PID: 1817 Comm: ip Tainted: G N 6.12.0-10313-g7d4050728c83-dirty #1
Tainted: [N]=TEST
Hardware name: riscv-virtio,qemu (DT)
Call Trace:
[<ffffffff80006d42>] dump_backtrace+0x1c/0x24
[<ffffffff80dc8d94>] show_stack+0x2c/0x38
[<ffffffff80de00b0>] dump_stack_lvl+0x74/0xac
[<ffffffff80de00fc>] dump_stack+0x14/0x1c
[<ffffffff8004da18>] __might_resched+0x23e/0x248
[<ffffffff8004da60>] __might_sleep+0x3e/0x62
[<ffffffff80b8d370>] napi_disable+0x24/0x10c
[<ffffffff809a06fe>] pcnet32_close+0x6c/0x126
Guenter
On Fri, Oct 11, 2024 at 8:46 PM Joe Damato <jdamato@fastly.com> wrote:
>
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> Reviewed-by: Jakub Kicinski <kuba@kernel.org>

Reviewed-by: Eric Dumazet <edumazet@google.com>