Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the persistent config for a NAPI by passing an
index when calling netif_napi_add_config.
napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted).
Drivers which call netif_napi_add_config will have persistent per-NAPI
settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
Per-NAPI settings are saved in napi_disable and restored in napi_enable.
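For illustration, a driver opt-in could look roughly like the sketch
below (example_priv, ring, num_rings, and example_poll are hypothetical
names, not taken from a real driver):

	static int example_open(struct net_device *dev)
	{
		struct example_priv *priv = netdev_priv(dev);
		int i;

		/* The index ties each NAPI to a slot in dev->napi_config,
		 * so its NAPI ID and settings survive disable/enable
		 * cycles.
		 */
		for (i = 0; i < priv->num_rings; i++)
			netif_napi_add_config(dev, &priv->ring[i].napi,
					      example_poll, i);

		return 0;
	}

Drivers that keep calling netif_napi_add() are unaffected and retain the
existing behavior.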
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
---
.../networking/net_cachelines/net_device.rst | 1 +
include/linux/netdevice.h | 36 ++++++++-
net/core/dev.c | 80 +++++++++++++++++--
net/core/dev.h | 12 +++
4 files changed, 119 insertions(+), 10 deletions(-)
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 67910ea49160..db6192b2bb50 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -186,6 +186,7 @@ struct dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
u64 max_pacing_offload_horizon
+struct_napi_config* napi_config
unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
=================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93241d4de437..8feaca12655e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@ struct gro_list {
*/
#define GRO_HASH_BUCKETS 8
+/*
+ * Structure for per-NAPI config
+ */
+struct napi_config {
+ u64 gro_flush_timeout;
+ u32 defer_hard_irqs;
+ unsigned int napi_id;
+};
+
/*
* Structure for NAPI scheduling similar to tasklet but with weighting
*/
@@ -379,6 +388,8 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
int irq;
+ int index;
+ struct napi_config *config;
};
enum {
@@ -1868,9 +1879,6 @@ enum netdev_reg_state {
* allocated at register_netdev() time
* @real_num_rx_queues: Number of RX queues currently active in device
* @xdp_prog: XDP sockets filter program pointer
- * @gro_flush_timeout: timeout for GRO layer in NAPI
- * @napi_defer_hard_irqs: If not zero, provides a counter that would
- * allow to avoid NIC hard IRQ, on busy queues.
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
@@ -2020,6 +2028,11 @@ enum netdev_reg_state {
* where the clock is recovered.
*
* @max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ * @napi_config: An array of napi_config structures containing per-NAPI
+ * settings.
+ * @gro_flush_timeout: timeout for GRO layer in NAPI
+ * @napi_defer_hard_irqs: If not zero, provides a counter that would
+ * allow to avoid NIC hard IRQ, on busy queues.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
@@ -2413,6 +2426,7 @@ struct net_device {
struct dim_irq_moder *irq_moder;
u64 max_pacing_offload_horizon;
+ struct napi_config *napi_config;
unsigned long gro_flush_timeout;
u32 napi_defer_hard_irqs;
@@ -2678,6 +2692,22 @@ netif_napi_add_tx_weight(struct net_device *dev,
netif_napi_add_weight(dev, napi, poll, weight);
}
+/**
+ * netif_napi_add_config - initialize a NAPI context with persistent config
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int index)
+{
+ napi->index = index;
+ napi->config = &dev->napi_config[index];
+ netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
/**
* netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
* @dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index e21ace3551d5..c682173a7642 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6505,6 +6505,23 @@ EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->napi_id = napi_id;
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ spin_lock(&napi_hash_lock);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock(&napi_hash_lock);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6517,10 +6534,8 @@ static void napi_hash_add(struct napi_struct *napi)
if (unlikely(++napi_gen_id < MIN_NAPI_ID))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
spin_unlock(&napi_hash_lock);
}
@@ -6643,6 +6658,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us. It will be saved to the config
+ * in napi_disable.
+ */
+ if (n->config->napi_id)
+ napi_hash_add_with_id(n, n->config->napi_id);
+ else
+ napi_hash_add(n);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->napi_id = n->napi_id;
+ napi_hash_del(n);
+}
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6653,8 +6690,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
- napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
- napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -6672,7 +6707,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
@@ -6704,6 +6745,11 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6719,6 +6765,11 @@ void napi_enable(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6748,7 +6799,11 @@ void __netif_napi_del(struct napi_struct *napi)
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
@@ -11085,6 +11140,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -11098,6 +11155,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
+ maxqs = max(txqs, rxqs);
+
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!dev)
@@ -11174,6 +11233,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
@@ -11237,6 +11301,8 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ kvfree(dev->napi_config);
+
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index 7d0aab7e3ef1..7881bced70a9 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -177,11 +177,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
u32 defer)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
}
/**
@@ -217,11 +223,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
unsigned long timeout)
{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
struct napi_struct *napi;
+ int i;
WRITE_ONCE(netdev->gro_flush_timeout, timeout);
list_for_each_entry(napi, &netdev->napi_list, dev_list)
napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
}
int rps_cpumask_housekeeping(struct cpumask *mask);
--
2.25.1
Hi,
On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> Reviewed-by: Jakub Kicinski <kuba@kernel.org>
This patch triggers a lock inversion message on pcnet Ethernet adapters.
========================================================
WARNING: possible irq lock inversion dependency detected
6.12.0-08446-g228a1157fb9f #1 Tainted: G N
The problem seems obvious - napi_hash_lock and the local driver lock are
acquired in different order depending on the call sequence. Unfortunately
I have no idea how to fix the problem or I'd submit a patch.
Complete backtrace and bisect log attached. Bisect was run with qemu on
riscv64, but the architecture/platform does not really matter.
Please let me know if there is anything I can do to help resolve the
problem.
Thanks,
Guenter
---
[ 13.251894] ========================================================
[ 13.252024] WARNING: possible irq lock inversion dependency detected
[ 13.252307] 6.12.0-08446-g228a1157fb9f #1 Tainted: G N
[ 13.252472] --------------------------------------------------------
[ 13.252569] ip/1816 just changed the state of lock:
[ 13.252678] ffffffff81dec490 (napi_hash_lock){+...}-{3:3}, at: napi_disable+0xf8/0x10c
[ 13.253497] but this lock was taken by another, HARDIRQ-safe lock in the past:
[ 13.253637] (&lp->lock){-.-.}-{3:3}
[ 13.253682]
[ 13.253682]
[ 13.253682] and interrupts could create inverse lock ordering between them.
[ 13.253682]
[ 13.253923]
[ 13.253923] other info that might help us debug this:
[ 13.254082] Possible interrupt unsafe locking scenario:
[ 13.254082]
[ 13.254186]        CPU0                    CPU1
[ 13.254264]        ----                    ----
[ 13.254340]   lock(napi_hash_lock);
[ 13.254438]                                local_irq_disable();
[ 13.254532]                                lock(&lp->lock);
[ 13.254649]                                lock(napi_hash_lock);
[ 13.254772]   <Interrupt>
[ 13.254828]     lock(&lp->lock);
[ 13.254921]
[ 13.254921] *** DEADLOCK ***
[ 13.254921]
[ 13.255049] 1 lock held by ip/1816:
[ 13.255127] #0: ffffffff81dece50 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
[ 13.255398]
[ 13.255398] the shortest dependencies between 2nd lock and 1st lock:
[ 13.255593] -> (&lp->lock){-.-.}-{3:3} ops: 75 {
[ 13.255802] IN-HARDIRQ-W at:
[ 13.255910] __lock_acquire+0xa3e/0x2158
[ 13.256055] lock_acquire.part.0+0xba/0x21e
[ 13.256153] lock_acquire+0x44/0x5a
[ 13.256241] _raw_spin_lock+0x2c/0x40
[ 13.256343] pcnet32_interrupt+0x3c/0x200
[ 13.256442] __handle_irq_event_percpu+0xa0/0x2e0
[ 13.256547] handle_irq_event+0x3c/0x8a
[ 13.256640] handle_fasteoi_irq+0x9c/0x1d2
[ 13.256738] generic_handle_domain_irq+0x1c/0x2a
[ 13.256840] plic_handle_irq+0x7e/0xfc
[ 13.256937] generic_handle_domain_irq+0x1c/0x2a
[ 13.257041] riscv_intc_irq+0x26/0x60
[ 13.257133] handle_riscv_irq+0x4a/0x74
[ 13.257228] call_on_irq_stack+0x32/0x40
[ 13.257349] IN-SOFTIRQ-W at:
[ 13.257420] __lock_acquire+0x40a/0x2158
[ 13.257515] lock_acquire.part.0+0xba/0x21e
[ 13.257611] lock_acquire+0x44/0x5a
[ 13.257699] _raw_spin_lock_irqsave+0x3a/0x64
[ 13.257798] pcnet32_poll+0x2ac/0x768
[ 13.257892] __napi_poll.constprop.0+0x26/0x128
[ 13.257997] net_rx_action+0x186/0x30e
[ 13.258090] handle_softirqs+0x110/0x4a2
[ 13.258187] __irq_exit_rcu+0xe2/0x10c
[ 13.258279] irq_exit_rcu+0xc/0x36
[ 13.258368] handle_riscv_irq+0x64/0x74
[ 13.258463] call_on_irq_stack+0x32/0x40
[ 13.258558] INITIAL USE at:
[ 13.258629] __lock_acquire+0x46c/0x2158
[ 13.258723] lock_acquire.part.0+0xba/0x21e
[ 13.258820] lock_acquire+0x44/0x5a
[ 13.258909] _raw_spin_lock_irqsave+0x3a/0x64
[ 13.259007] pcnet32_get_stats+0x2a/0x62
[ 13.259101] dev_get_stats+0xc4/0x2a6
[ 13.259380] rtnl_fill_stats+0x32/0xee
[ 13.259480] rtnl_fill_ifinfo.isra.0+0x648/0x141c
[ 13.259589] rtmsg_ifinfo_build_skb+0x98/0xf0
[ 13.259711] rtmsg_ifinfo+0x36/0x78
[ 13.259799] register_netdevice+0x758/0x7a8
[ 13.259905] register_netdev+0x18/0x2e
[ 13.259999] pcnet32_probe1+0xb96/0x103e
[ 13.260099] pcnet32_probe_pci+0xcc/0x12e
[ 13.260196] pci_device_probe+0x82/0x100
[ 13.260291] really_probe+0x86/0x234
[ 13.260383] __driver_probe_device+0x5c/0xda
[ 13.260482] driver_probe_device+0x2c/0xb2
[ 13.260578] __driver_attach+0x70/0x120
[ 13.260671] bus_for_each_dev+0x60/0xae
[ 13.260764] driver_attach+0x1a/0x22
[ 13.260853] bus_add_driver+0xce/0x1d6
[ 13.260951] driver_register+0x3e/0xd8
[ 13.261048] __pci_register_driver+0x5c/0x66
[ 13.261149] pcnet32_init_module+0x58/0x14a
[ 13.261255] do_one_initcall+0x7e/0x2b6
[ 13.261352] kernel_init_freeable+0x2cc/0x33e
[ 13.261456] kernel_init+0x1e/0x11a
[ 13.261550] ret_from_fork+0xe/0x18
[ 13.261653] }
[ 13.261705] ... key at: [<ffffffff82a9efb0>] __key.4+0x0/0x10
[ 13.261839] ... acquired at:
[ 13.261907] lock_acquire.part.0+0xba/0x21e
[ 13.261987] lock_acquire+0x44/0x5a
[ 13.262056] _raw_spin_lock+0x2c/0x40
[ 13.262132] napi_hash_add+0x26/0xb8
[ 13.262207] napi_enable+0x10e/0x124
[ 13.262281] pcnet32_open+0x3c2/0x6b8
[ 13.262357] __dev_open+0xba/0x158
[ 13.262427] __dev_change_flags+0x19a/0x214
[ 13.262508] dev_change_flags+0x1e/0x56
[ 13.262583] do_setlink.isra.0+0x20c/0xcc2
[ 13.262661] rtnl_newlink+0x592/0x74c
[ 13.262731] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.262807] netlink_rcv_skb+0x44/0xf4
[ 13.262882] rtnetlink_rcv+0x14/0x1c
[ 13.262956] netlink_unicast+0x1a0/0x23e
[ 13.263031] netlink_sendmsg+0x17e/0x38e
[ 13.263109] __sock_sendmsg+0x40/0x7a
[ 13.263182] ____sys_sendmsg+0x18e/0x1de
[ 13.263288] ___sys_sendmsg+0x82/0xc6
[ 13.263360] __sys_sendmsg+0x8e/0xe0
[ 13.263430] __riscv_sys_sendmsg+0x16/0x1e
[ 13.263507] do_trap_ecall_u+0x1b6/0x1e2
[ 13.263583] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.263685]
[ 13.263746] -> (napi_hash_lock){+...}-{3:3} ops: 2 {
[ 13.263891] HARDIRQ-ON-W at:
[ 13.263963] __lock_acquire+0x688/0x2158
[ 13.264057] lock_acquire.part.0+0xba/0x21e
[ 13.264153] lock_acquire+0x44/0x5a
[ 13.264240] _raw_spin_lock+0x2c/0x40
[ 13.264330] napi_disable+0xf8/0x10c
[ 13.264419] pcnet32_close+0x5c/0x126
[ 13.264511] __dev_close_many+0x8a/0xf8
[ 13.264609] __dev_change_flags+0x176/0x214
[ 13.264708] dev_change_flags+0x1e/0x56
[ 13.264800] do_setlink.isra.0+0x20c/0xcc2
[ 13.264900] rtnl_newlink+0x592/0x74c
[ 13.264988] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.265082] netlink_rcv_skb+0x44/0xf4
[ 13.265174] rtnetlink_rcv+0x14/0x1c
[ 13.265268] netlink_unicast+0x1a0/0x23e
[ 13.265367] netlink_sendmsg+0x17e/0x38e
[ 13.265466] __sock_sendmsg+0x40/0x7a
[ 13.265557] ____sys_sendmsg+0x18e/0x1de
[ 13.265649] ___sys_sendmsg+0x82/0xc6
[ 13.265738] __sys_sendmsg+0x8e/0xe0
[ 13.265828] __riscv_sys_sendmsg+0x16/0x1e
[ 13.265928] do_trap_ecall_u+0x1b6/0x1e2
[ 13.266024] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.266134] INITIAL USE at:
[ 13.266206] __lock_acquire+0x46c/0x2158
[ 13.266298] lock_acquire.part.0+0xba/0x21e
[ 13.266394] lock_acquire+0x44/0x5a
[ 13.266480] _raw_spin_lock+0x2c/0x40
[ 13.266572] napi_hash_add+0x26/0xb8
[ 13.266660] napi_enable+0x10e/0x124
[ 13.266748] pcnet32_open+0x3c2/0x6b8
[ 13.266839] __dev_open+0xba/0x158
[ 13.266930] __dev_change_flags+0x19a/0x214
[ 13.267026] dev_change_flags+0x1e/0x56
[ 13.267116] do_setlink.isra.0+0x20c/0xcc2
[ 13.267211] rtnl_newlink+0x592/0x74c
[ 13.267299] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.267395] netlink_rcv_skb+0x44/0xf4
[ 13.267489] rtnetlink_rcv+0x14/0x1c
[ 13.267578] netlink_unicast+0x1a0/0x23e
[ 13.267670] netlink_sendmsg+0x17e/0x38e
[ 13.267763] __sock_sendmsg+0x40/0x7a
[ 13.267851] ____sys_sendmsg+0x18e/0x1de
[ 13.267946] ___sys_sendmsg+0x82/0xc6
[ 13.268034] __sys_sendmsg+0x8e/0xe0
[ 13.268121] __riscv_sys_sendmsg+0x16/0x1e
[ 13.268215] do_trap_ecall_u+0x1b6/0x1e2
[ 13.268309] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.268419] }
[ 13.268460] ... key at: [<ffffffff81dec490>] napi_hash_lock+0x18/0x40
[ 13.268579] ... acquired at:
[ 13.268636] mark_lock+0x5f2/0x88a
[ 13.268705] __lock_acquire+0x688/0x2158
[ 13.268779] lock_acquire.part.0+0xba/0x21e
[ 13.268858] lock_acquire+0x44/0x5a
[ 13.268930] _raw_spin_lock+0x2c/0x40
[ 13.269002] napi_disable+0xf8/0x10c
[ 13.269073] pcnet32_close+0x5c/0x126
[ 13.269145] __dev_close_many+0x8a/0xf8
[ 13.269220] __dev_change_flags+0x176/0x214
[ 13.269298] dev_change_flags+0x1e/0x56
[ 13.269371] do_setlink.isra.0+0x20c/0xcc2
[ 13.269449] rtnl_newlink+0x592/0x74c
[ 13.269520] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.269596] netlink_rcv_skb+0x44/0xf4
[ 13.269671] rtnetlink_rcv+0x14/0x1c
[ 13.269742] netlink_unicast+0x1a0/0x23e
[ 13.269818] netlink_sendmsg+0x17e/0x38e
[ 13.269894] __sock_sendmsg+0x40/0x7a
[ 13.269966] ____sys_sendmsg+0x18e/0x1de
[ 13.270041] ___sys_sendmsg+0x82/0xc6
[ 13.270114] __sys_sendmsg+0x8e/0xe0
[ 13.270187] __riscv_sys_sendmsg+0x16/0x1e
[ 13.270268] do_trap_ecall_u+0x1b6/0x1e2
[ 13.270346] _new_vmalloc_restore_context_a0+0xc2/0xce
[ 13.270439]
[ 13.270525]
[ 13.270525] stack backtrace:
[ 13.270729] CPU: 0 UID: 0 PID: 1816 Comm: ip Tainted: G N 6.12.0-08446-g228a1157fb9f #1
[ 13.270933] Tainted: [N]=TEST
[ 13.271006] Hardware name: riscv-virtio,qemu (DT)
[ 13.271165] Call Trace:
[ 13.271270] [<ffffffff80006d42>] dump_backtrace+0x1c/0x24
[ 13.271373] [<ffffffff80dc5574>] show_stack+0x2c/0x38
[ 13.271467] [<ffffffff80ddc854>] dump_stack_lvl+0x74/0xac
[ 13.271565] [<ffffffff80ddc8a0>] dump_stack+0x14/0x1c
[ 13.271657] [<ffffffff8008551a>] print_irq_inversion_bug.part.0+0x1aa/0x1fe
[ 13.271775] [<ffffffff800867f8>] mark_lock+0x5f2/0x88a
[ 13.271869] [<ffffffff80087d44>] __lock_acquire+0x688/0x2158
[ 13.271973] [<ffffffff80086e10>] lock_acquire.part.0+0xba/0x21e
[ 13.272078] [<ffffffff80086fb8>] lock_acquire+0x44/0x5a
[ 13.272173] [<ffffffff80de8ea4>] _raw_spin_lock+0x2c/0x40
[ 13.272269] [<ffffffff80b89bf0>] napi_disable+0xf8/0x10c
[ 13.272360] [<ffffffff8099d022>] pcnet32_close+0x5c/0x126
[ 13.272454] [<ffffffff80b8fa52>] __dev_close_many+0x8a/0xf8
[ 13.272549] [<ffffffff80b9790e>] __dev_change_flags+0x176/0x214
[ 13.272647] [<ffffffff80b979ca>] dev_change_flags+0x1e/0x56
[ 13.272740] [<ffffffff80ba93a8>] do_setlink.isra.0+0x20c/0xcc2
[ 13.272838] [<ffffffff80baa3f0>] rtnl_newlink+0x592/0x74c
[ 13.272935] [<ffffffff80babf5a>] rtnetlink_rcv_msg+0x3a8/0x582
[ 13.273032] [<ffffffff80c05bcc>] netlink_rcv_skb+0x44/0xf4
[ 13.273127] [<ffffffff80ba64f6>] rtnetlink_rcv+0x14/0x1c
[ 13.273223] [<ffffffff80c05578>] netlink_unicast+0x1a0/0x23e
[ 13.273326] [<ffffffff80c05794>] netlink_sendmsg+0x17e/0x38e
[ 13.273423] [<ffffffff80b633e2>] __sock_sendmsg+0x40/0x7a
[ 13.273515] [<ffffffff80b63742>] ____sys_sendmsg+0x18e/0x1de
[ 13.273610] [<ffffffff80b65e44>] ___sys_sendmsg+0x82/0xc6
[ 13.273704] [<ffffffff80b662ac>] __sys_sendmsg+0x8e/0xe0
[ 13.273795] [<ffffffff80b66314>] __riscv_sys_sendmsg+0x16/0x1e
[ 13.273894] [<ffffffff80ddd3e4>] do_trap_ecall_u+0x1b6/0x1e2
[ 13.273992] [<ffffffff80de9c7a>] _new_vmalloc_restore_context_a0+0xc2/0xce
---
# bad: [228a1157fb9fec47eb135b51c0202b574e079ebf] Merge tag '6.13-rc-part1-SMB3-client-fixes' of git://git.samba.org/sfrench/cifs-2.6
# good: [43fb83c17ba2d63dfb798f0be7453ed55ca3f9c2] Merge tag 'soc-arm-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc
git bisect start '228a1157fb9f' '43fb83c17ba2'
# bad: [071b34dcf71523a559b6c39f5d21a268a9531b50] Merge tag 'sound-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound
git bisect bad 071b34dcf71523a559b6c39f5d21a268a9531b50
# bad: [31a1f8752f7df7e3d8122054fbef02a9a8bff38f] Merge branch 'phy-mediatek-reorg'
git bisect bad 31a1f8752f7df7e3d8122054fbef02a9a8bff38f
# bad: [de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f] phonet: Pass net and ifindex to rtm_phonet_notify().
git bisect bad de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f
# good: [76d46d766a45e205e59af511efbb24abe22d0b4c] net: emaclite: Adopt clock support
git bisect good 76d46d766a45e205e59af511efbb24abe22d0b4c
# bad: [a37b0e4eca0436ebc17d512d70b1409956340688] ipv6: Use rtnl_register_many().
git bisect bad a37b0e4eca0436ebc17d512d70b1409956340688
# bad: [de306f0051ae947680a13c13a9fd9373d7460bb1] net: gianfar: Use __be64 * to store pointers to big endian values
git bisect bad de306f0051ae947680a13c13a9fd9373d7460bb1
# good: [5c16e118b796e95d6e5c80c5d8af2591262431c9] net: ethernet: ti: am65-cpsw: Use __be64 type for id_temp
git bisect good 5c16e118b796e95d6e5c80c5d8af2591262431c9
# bad: [41936522749654e64531121bbd6a95bab5d56d76] bnxt: Add support for persistent NAPI config
git bisect bad 41936522749654e64531121bbd6a95bab5d56d76
# good: [79636038d37e7bd4d078238f2a3f002cab4423bc] ipv4: tcp: give socket pointer to control skbs
git bisect good 79636038d37e7bd4d078238f2a3f002cab4423bc
# good: [516010460011ae74ac3b7383cf90ed27e2711cd6] netdev-genl: Dump napi_defer_hard_irqs
git bisect good 516010460011ae74ac3b7383cf90ed27e2711cd6
# good: [0137891e74576f77a7901718dc0ce08ca074ae74] netdev-genl: Dump gro_flush_timeout
git bisect good 0137891e74576f77a7901718dc0ce08ca074ae74
# bad: [1287c1ae0fc227e5acef11a539eb4e75646e31c7] netdev-genl: Support setting per-NAPI config values
git bisect bad 1287c1ae0fc227e5acef11a539eb4e75646e31c7
# bad: [86e25f40aa1e9e54e081e55016f65b5c92523989] net: napi: Add napi_config
git bisect bad 86e25f40aa1e9e54e081e55016f65b5c92523989
# first bad commit: [86e25f40aa1e9e54e081e55016f65b5c92523989] net: napi: Add napi_config
On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
> Hi,
>
> On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
> > Add a persistent NAPI config area for NAPI configuration to the core.
> > Drivers opt-in to setting the persistent config for a NAPI by passing an
> > index when calling netif_napi_add_config.
> >
> > napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> > (after the NAPIs are deleted).
> >
> > Drivers which call netif_napi_add_config will have persistent per-NAPI
> > settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
> >
> > Per-NAPI settings are saved in napi_disable and restored in napi_enable.
> >
> > Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > Reviewed-by: Jakub Kicinski <kuba@kernel.org>
>
> This patch triggers a lock inversion message on pcnet Ethernet adapters.
Thanks for the report. I am not familiar with the pcnet driver, but
took some time now to read the report below and the driver code.
I could definitely be reading the output incorrectly (if so please
let me know), but it seems like the issue can be triggered in this
case:
CPU 0:
pcnet32_open
lock(lp->lock)
napi_enable
napi_hash_add
lock(napi_hash_lock)
unlock(napi_hash_lock)
unlock(lp->lock)
Meanwhile on CPU 1:
pcnet32_close
napi_disable
napi_hash_del
lock(napi_hash_lock)
unlock(napi_hash_lock)
lock(lp->lock)
[... other code ...]
unlock(lp->lock)
[... other code ...]
lock(lp->lock)
[... other code ...]
unlock(lp->lock)
In other words: while the close path is holding napi_hash_lock (and
before it acquires lp->lock), the enable path takes lp->lock and
then napi_hash_lock.
It seems this was triggered because before the identified commit,
napi_enable did not call napi_hash_add (and thus did not take the
napi_hash_lock).
So, I agree there is an inversion; I can't say for sure if this
would cause a deadlock in certain situations. It seems like
napi_hash_del in the close path will return, so the inversion
doesn't seem like it'd lead to a deadlock, but I am not an expert in
this and could certainly be wrong.
I wonder if a potential fix for this would be in the driver's close
function?
In pcnet32_open the order is:
lock(lp->lock)
napi_enable
netif_start_queue
mod_timer(watchdog)
unlock(lp->lock)
Perhaps pcnet32_close should be the same?
I've included an example patch below for pcnet32_close and I've CC'd
the maintainer of pcnet32 that is not currently CC'd.
Guenter: Is there any chance you might be able to test the proposed
patch below?
Don: Would you mind taking a look to see if this change is sensible?
Netdev maintainers: at a higher level, I'm not sure how many other
drivers might have locking patterns like this that commit
86e25f40aa1e ("net: napi: Add napi_config") will break in a similar
manner.
Do I:
- comb through drivers trying to identify these, and/or
- do we find a way to implement the identified commit with the
original lock ordering to avoid breaking any other driver?
I'd appreciate guidance/insight from the maintainers on how to best
proceed.
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index 72db9f9e7bee..ff56a308fec9 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -2623,13 +2623,13 @@ static int pcnet32_close(struct net_device *dev)
struct pcnet32_private *lp = netdev_priv(dev);
unsigned long flags;
+ spin_lock_irqsave(&lp->lock, flags);
+
del_timer_sync(&lp->watchdog_timer);
netif_stop_queue(dev);
napi_disable(&lp->napi);
- spin_lock_irqsave(&lp->lock, flags);
-
dev->stats.rx_missed_errors = lp->a->read_csr(ioaddr, 112);
netif_printk(lp, ifdown, KERN_DEBUG, dev,
On 11/27/24 10:51, Joe Damato wrote:
> On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
>> Hi,
>>
>> On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
>>> Add a persistent NAPI config area for NAPI configuration to the core.
>>> Drivers opt-in to setting the persistent config for a NAPI by passing an
>>> index when calling netif_napi_add_config.
>>>
>>> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
>>> (after the NAPIs are deleted).
>>>
>>> Drivers which call netif_napi_add_config will have persistent per-NAPI
>>> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>>>
>>> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>>>
>>> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>> Signed-off-by: Joe Damato <jdamato@fastly.com>
>>> Reviewed-by: Jakub Kicinski <kuba@kernel.org>
>>
>> This patch triggers a lock inversion message on pcnet Ethernet adapters.
>
> Thanks for the report. I am not familiar with the pcnet driver, but
> took some time now to read the report below and the driver code.
>
> I could definitely be reading the output incorrectly (if so please
> let me know), but it seems like the issue can be triggered in this
> case:
>
> CPU 0:
> pcnet32_open
> lock(lp->lock)
> napi_enable
> napi_hash_add
> lock(napi_hash_lock)
> unlock(napi_hash_lock)
> unlock(lp->lock)
>
>
> Meanwhile on CPU 1:
> pcnet32_close
> napi_disable
> napi_hash_del
> lock(napi_hash_lock)
> unlock(napi_hash_lock)
> lock(lp->lock)
> [... other code ...]
> unlock(lp->lock)
> [... other code ...]
> lock(lp->lock)
> [... other code ...]
> unlock(lp->lock)
>
> In other words: while the close path is holding napi_hash_lock (and
> before it acquires lp->lock), the enable path takes lp->lock and
> then napi_hash_lock.
>
> It seems this was triggered because before the identified commit,
> napi_enable did not call napi_hash_add (and thus did not take the
> napi_hash_lock).
>
> So, I agree there is an inversion; I can't say for sure if this
> would cause a deadlock in certain situations. It seems like
> napi_hash_del in the close path will return, so the inversion
> doesn't seem like it'd lead to a deadlock, but I am not an expert in
> this and could certainly be wrong.
>
> I wonder if a potential fix for this would be in the driver's close
> function?
>
> In pcnet32_open the order is:
> lock(lp->lock)
> napi_enable
> netif_start_queue
> mod_timer(watchdog)
> unlock(lp->lock)
>
> Perhaps pcnet32_close should be the same?
>
> I've included an example patch below for pcnet32_close and I've CC'd
> the maintainer of pcnet32 that is not currently CC'd.
>
> Guenter: Is there any chance you might be able to test the proposed
> patch below?
>
I moved the spinlock after del_timer_sync() because it is not a good idea
to hold a spinlock when calling that function. That results in:
[ 10.646956] BUG: sleeping function called from invalid context at net/core/dev.c:6775
[ 10.647142] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1817, name: ip
[ 10.647237] preempt_count: 1, expected: 0
[ 10.647319] 2 locks held by ip/1817:
[ 10.647383] #0: ffffffff81ded990 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
[ 10.647880] #1: ff6000000498ccb0 (&lp->lock){-.-.}-{3:3}, at: pcnet32_close+0x40/0x126
[ 10.648050] irq event stamp: 3720
[ 10.648102] hardirqs last enabled at (3719): [<ffffffff80decaf4>] _raw_spin_unlock_irqrestore+0x54/0x62
[ 10.648204] hardirqs last disabled at (3720): [<ffffffff80dec8a2>] _raw_spin_lock_irqsave+0x5e/0x64
[ 10.648301] softirqs last enabled at (3712): [<ffffffff8001efca>] handle_softirqs+0x3e6/0x4a2
[ 10.648396] softirqs last disabled at (3631): [<ffffffff80ded6cc>] __do_softirq+0x12/0x1a
[ 10.648666] CPU: 0 UID: 0 PID: 1817 Comm: ip Tainted: G N 6.12.0-10313-g7d4050728c83-dirty #1
[ 10.648828] Tainted: [N]=TEST
[ 10.648879] Hardware name: riscv-virtio,qemu (DT)
[ 10.648978] Call Trace:
[ 10.649048] [<ffffffff80006d42>] dump_backtrace+0x1c/0x24
[ 10.649117] [<ffffffff80dc8d94>] show_stack+0x2c/0x38
[ 10.649180] [<ffffffff80de00b0>] dump_stack_lvl+0x74/0xac
[ 10.649246] [<ffffffff80de00fc>] dump_stack+0x14/0x1c
[ 10.649308] [<ffffffff8004da18>] __might_resched+0x23e/0x248
[ 10.649377] [<ffffffff8004da60>] __might_sleep+0x3e/0x62
[ 10.649441] [<ffffffff80b8d370>] napi_disable+0x24/0x10c
[ 10.649506] [<ffffffff809a06fe>] pcnet32_close+0x6c/0x126
...
This is due to might_sleep() at the beginning of napi_disable(). So it doesn't
work as intended, it just replaces one problem with another.
> Don: Would you mind taking a look to see if this change is sensible?
>
> Netdev maintainers: at a higher level, I'm not sure how many other
> drivers might have locking patterns like this that commit
> 86e25f40aa1e ("net: napi: Add napi_config") will break in a similar
> manner.
>
> Do I:
> - comb through drivers trying to identify these, and/or
Coccinelle, checking for napi_enable calls under spinlock, points to:
napi_enable called under spin_lock_irqsave from drivers/net/ethernet/via/via-velocity.c:2325
napi_enable called under spin_lock_irqsave from drivers/net/can/grcan.c:1076
napi_enable called under spin_lock from drivers/net/ethernet/marvell/mvneta.c:4388
napi_enable called under spin_lock_irqsave from drivers/net/ethernet/amd/pcnet32.c:2104
Guenter
> - do we find a way to implement the identified commit with the
> original lock ordering to avoid breaking any other driver?
>
> I'd appreciate guidance/insight from the maintainers on how to best
> proceed.
>
> diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
> index 72db9f9e7bee..ff56a308fec9 100644
> --- a/drivers/net/ethernet/amd/pcnet32.c
> +++ b/drivers/net/ethernet/amd/pcnet32.c
> @@ -2623,13 +2623,13 @@ static int pcnet32_close(struct net_device *dev)
> struct pcnet32_private *lp = netdev_priv(dev);
> unsigned long flags;
>
> + spin_lock_irqsave(&lp->lock, flags);
> +
> del_timer_sync(&lp->watchdog_timer);
>
> netif_stop_queue(dev);
> napi_disable(&lp->napi);
>
> - spin_lock_irqsave(&lp->lock, flags);
> -
> dev->stats.rx_missed_errors = lp->a->read_csr(ioaddr, 112);
>
> netif_printk(lp, ifdown, KERN_DEBUG, dev,
On Wed, Nov 27, 2024 at 01:43:14PM -0800, Guenter Roeck wrote:
> On 11/27/24 10:51, Joe Damato wrote:
> > On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
> > > Hi,
> > >
> > > On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
[...]
> > It seems this was triggered because before the identified commit,
> > napi_enable did not call napi_hash_add (and thus did not take the
> > napi_hash_lock).
> >
> > So, I agree there is an inversion; I can't say for sure if this
> > would cause a deadlock in certain situations. It seems like
> > napi_hash_del in the close path will return, so the inversion
> > doesn't seem like it'd lead to a deadlock, but I am not an expert in
> > this and could certainly be wrong.
> >
> > I wonder if a potential fix for this would be in the driver's close
> > function?
> >
> > In pcnet32_open the order is:
> > lock(lp->lock)
> > napi_enable
> > netif_start_queue
> > mod_timer(watchdog)
> > unlock(lp->lock)
> >
> > Perhaps pcnet32_close should be the same?
> >
> > I've included an example patch below for pcnet32_close and I've CC'd
> > the maintainer of pcnet32 that is not currently CC'd.
> >
> > Guenter: Is there any chance you might be able to test the proposed
> > patch below?
> >
>
> I moved the spinlock after del_timer_sync() because it is not a good idea
> to hold a spinlock when calling that function. That results in:
>
> [ 10.646956] BUG: sleeping function called from invalid context at net/core/dev.c:6775
> [ 10.647142] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1817, name: ip
> [ 10.647237] preempt_count: 1, expected: 0
> [ 10.647319] 2 locks held by ip/1817:
> [ 10.647383] #0: ffffffff81ded990 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
> [ 10.647880] #1: ff6000000498ccb0 (&lp->lock){-.-.}-{3:3}, at: pcnet32_close+0x40/0x126
[...]
> This is due to might_sleep() at the beginning of napi_disable(). So it doesn't
> work as intended, it just replaces one problem with another.
Thanks for testing that. And you are right, it is also not correct.
I will give it some thought to see if I can think of something
better.
Maybe Don will have some ideas.
> > Don: Would you mind taking a look to see if this change is sensible?
> >
> > Netdev maintainers: at a higher level, I'm not sure how many other
> > drivers might have locking patterns like this that commit
> > 86e25f40aa1e ("net: napi: Add napi_config") will break in a similar
> > manner.
> >
> > Do I:
> > - comb through drivers trying to identify these, and/or
>
> Coccinelle, checking for napi_enable calls under spinlock, points to:
>
> napi_enable called under spin_lock_irqsave from drivers/net/ethernet/via/via-velocity.c:2325
> napi_enable called under spin_lock_irqsave from drivers/net/can/grcan.c:1076
> napi_enable called under spin_lock from drivers/net/ethernet/marvell/mvneta.c:4388
> napi_enable called under spin_lock_irqsave from drivers/net/ethernet/amd/pcnet32.c:2104
I checked the 3 cases above other than pcnet32 and they appear to be
false positives to me.
Guenter: would you mind sending me your cocci script? Mostly for
selfish reasons; I'd like to see how you did it so I can learn more
:) Feel free to do so off list if you prefer.
I tried to write my first coccinelle script (which you can find
below) that is probably wrong, but it attempts to detect:
- interrupt routines that hold locks
- in drivers that call napi_enable between a lock/unlock
I couldn't figure out how to get regexps to work in my script, so I
made a couple variants of the script for each of the spin_lock_*
variants and ran them all.
Only one offender was detected: pcnet32.
So, I guess the question to put out there to maintainers / others on
the list is:
- There seems to be at least 1 driver affected (pcnet32). There
might be others, but my simple (and likely incorrect) cocci
script below couldn't detect any with this particular bug shape.
Worth mentioning: there could be other bug shapes that trigger
an inversion that I am currently unaware of.
- As far as I can tell, there are three ways to proceed:
1. Find and fix all drivers which broke (pcnet32 being the only
known driver at this point), or
2. Disable IRQs when taking the lock in napi_hash_del, or
3. Move the napi hash add/remove out of napi enable/disable (a rough
sketch of this option follows below).
Happy to proceed however seems most reasonable to the maintainers,
please let me know.
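For what it's worth, option 3 would roughly amount to tying hash
membership back to netif_napi_add_weight/__netif_napi_del (where it
lived before the identified commit) instead of napi_enable/napi_disable.
An untested sketch, which glosses over how the persistent NAPI IDs in
napi_config would then be restored:

	/* Sketch only: napi_enable()/napi_disable() no longer touch
	 * napi_hash_lock, so drivers can keep calling them under their
	 * own locks. Function bodies are abbreviated.
	 */
	void netif_napi_add_weight(struct net_device *dev,
				   struct napi_struct *napi,
				   int (*poll)(struct napi_struct *, int),
				   int weight)
	{
		/* ... existing initialization ... */
		list_add_rcu(&napi->dev_list, &dev->napi_list);
		napi_hash_add(napi);	/* moved back here from napi_enable() */
		/* ... */
	}

	void __netif_napi_del(struct napi_struct *napi)
	{
		if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
			return;

		napi_hash_del(napi);	/* moved back here from napi_disable() */
		list_del_rcu(&napi->dev_list);
		/* ... */
	}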
My cocci script follows; as noted above I am too much of a noob and
couldn't figure out how to use regexps to match the different
spin_lock* variants, so I simply made multiple versions of this
script for each variant:
virtual report
@napi@
identifier func0;
position p0;
@@
func0(...)
{
...
spin_lock_irqsave(...)
...
napi_enable@p0(...)
...
spin_unlock_irqrestore(...)
...
}
@u@
position p;
identifier func;
typedef irqreturn_t;
@@
irqreturn_t func (...)
{
...
spin_lock@p(...)
...
}
@script:python depends on napi && u@
p << u.p;
func << u.func;
disable << napi.p0;
@@
print("* file: %s irq handler %s takes lock on line %s and calls napi_enable under lock %s" % (p[0].file,func,p[0].line,disable[0].line))
On Wed, Nov 27, 2024 at 05:17:15PM -0800, Joe Damato wrote:
>
> Guenter: would you mind sending me your cocci script? Mostly for
> selfish reasons; I'd like to see how you did it so I can learn more
> :) Feel free to do so off list if you prefer.
>
You mean because it is so clumsy ? :-). No worries, I attached
it below. It is way too complicated, but it was good enough
for a quick hack.
> I tried to write my first coccinelle script (which you can find
> below) that is probably wrong, but it attempts to detect:
> - interrupt routines that hold locks
> - in drivers that call napi_enable between a lock/unlock
>
> I couldn't figure out how to get regexps to work in my script, so I
> made a couple variants of the script for each of the spin_lock_*
> variants and ran them all.
I pretty much did the same, only in one script.
>
> Only one offender was detected: pcnet32.
>
Your script doesn't take into account that the spinlock may have been
released before the call to napi_enable(). Other than that it should
work just fine. I didn't try to track the interrupt handler because
tracking that through multiple functions can be difficult.
Thanks,
Guenter
---
virtual report
@f1@
identifier flags;
position p;
expression e;
@@
<...
spin_lock_irqsave@p(e, flags);
... when != spin_unlock_irqrestore(e, flags);
napi_enable(...);
... when any
spin_unlock_irqrestore(e, flags);
...>
@f2@
position p;
expression e;
@@
<...
spin_lock_irq@p(e);
... when != spin_unlock_irq(e);
napi_enable(...);
... when any
spin_unlock_irq(e);
...>
@f3@
position p;
expression e;
@@
<...
spin_lock@p(e);
... when != spin_unlock(e);
napi_enable(...);
... when any
spin_unlock(e);
...>
@script:python@
p << f1.p;
@@
print("napi_enable called under spin_lock_irqsave from %s:%s" % (p[0].file, p[0].line))
@script:python@
p << f2.p;
@@
print("napi_enable called under spin_lock_irq from %s:%s" % (p[0].file, p[0].line))
@script:python@
p << f3.p;
@@
print("napi_enable called under spin_lock from %s:%s" % (p[0].file, p[0].line))
On Wed, Nov 27, 2024 at 10:51:16AM -0800, Joe Damato wrote:
> On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
> > Hi,
> >
> > On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
> > > Add a persistent NAPI config area for NAPI configuration to the core.
> > > Drivers opt-in to setting the persistent config for a NAPI by passing an
> > > index when calling netif_napi_add_config.
> > >
> > > napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> > > (after the NAPIs are deleted).
> > >
> > > Drivers which call netif_napi_add_config will have persistent per-NAPI
> > > settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
> > >
> > > Per-NAPI settings are saved in napi_disable and restored in napi_enable.
> > >
> > > Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > > Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > > Reviewed-by: Jakub Kicinski <kuba@kernel.org>
> >
> > This patch triggers a lock inversion message on pcnet Ethernet adapters.
>
> Thanks for the report. I am not familiar with the pcnet driver, but
> took some time now to read the report below and the driver code.
>
> I could definitely be reading the output incorrectly (if so please
> let me know), but it seems like the issue can be triggered in this
> case:
Sorry, my apologies, I both:
- read the report incorrectly, and
- proposed a bad patch that would result in a deadlock :)
After re-reading it and running this by Martin (who is CC'd), the
inversion is actually:
CPU 0:
pcnet32_open
lock(lp->lock)
napi_enable
napi_hash_add <- before this executes, CPU 1 proceeds
lock(napi_hash_lock)
CPU 1:
pcnet32_close
napi_disable
napi_hash_del
lock(napi_hash_lock)
< INTERRUPT >
pcnet32_interrupt
lock(lp->lock)
This is now an inversion because:
CPU 0: holds lp->lock and is about to take napi_hash_lock
CPU 1: holds napi_hash_lock and an IRQ firing on CPU 1 tries to take
lp->lock (which CPU 0 already holds)
Neither side can proceed:
- CPU 0 is stuck waiting for napi_hash_lock
- CPU 1 is stuck waiting for lp->lock
I think the below explanation is still correct as to why the
identified commit causes the issue:
> It seems this was triggered because before the identified commit,
> napi_enable did not call napi_hash_add (and thus did not take the
> napi_hash_lock).
However, the previous patch I proposed for pcnet32 would also cause
a deadlock as the watchdog timer's function also needs lp->lock.
A corrected patch for pcnet32 can be found below.
Guenter: Sorry, would you mind testing the below instead of the
previous patch?
Don: Let me know what you think about the below?
Netdev maintainers, there is an alternate locking solution I can
propose as an RFC that might avoid this class of problem if this
sort of issue is more widespread than just pcnet32:
- add the NAPI to the hash in netif_napi_add_weight (instead of napi_enable)
- remove the NAPI from the hash in __netif_napi_del (instead of
napi_disable)
If changing the locking order in core is the desired route, then the
patch below should be unnecessary, but:
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index 72db9f9e7bee..2e0077e68883 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -2625,11 +2625,10 @@ static int pcnet32_close(struct net_device *dev)
del_timer_sync(&lp->watchdog_timer);
+ spin_lock_irqsave(&lp->lock, flags);
netif_stop_queue(dev);
napi_disable(&lp->napi);
- spin_lock_irqsave(&lp->lock, flags);
-
dev->stats.rx_missed_errors = lp->a->read_csr(ioaddr, 112);
netif_printk(lp, ifdown, KERN_DEBUG, dev,
On Wed, 27 Nov 2024 12:00:02 -0800 Joe Damato wrote:
> CPU 0:
> pcnet32_open
> lock(lp->lock)
> napi_enable
> napi_hash_add <- before this executes, CPU 1 proceeds
> lock(napi_hash_lock)
> CPU 1:
> pcnet32_close
> napi_disable
> napi_hash_del
> lock(napi_hash_lock)
> < INTERRUPT >

How about making napi_hash_lock irq-safe ?
It's a control path lock, it should be fine to disable irqs.

> pcnet32_interrupt
> lock(lp->lock)
On Sat, Nov 30, 2024 at 12:45:01PM -0800, Jakub Kicinski wrote:
> On Wed, 27 Nov 2024 12:00:02 -0800 Joe Damato wrote:
> > CPU 0:
> > pcnet32_open
> > lock(lp->lock)
> > napi_enable
> > napi_hash_add <- before this executes, CPU 1 proceeds
> > lock(napi_hash_lock)
> > CPU 1:
> > pcnet32_close
> > napi_disable
> > napi_hash_del
> > lock(napi_hash_lock)
> > < INTERRUPT >
>
> How about making napi_hash_lock irq-safe ?
> It's a control path lock, it should be fine to disable irqs.

Ah, right. That should fix it.

I'll write a fix against net and change the napi_hash_lock to use
spin_lock_irqsave and spin_unlock_irqrestore and send shortly.

> > pcnet32_interrupt
> > lock(lp->lock)
>
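For reference, a minimal (untested) sketch of that change in
net/core/dev.c, switching the napi_hash_lock users to the
irqsave/irqrestore variants, could look like this:

	static void napi_hash_add_with_id(struct napi_struct *napi,
					  unsigned int napi_id)
	{
		unsigned long flags;

		spin_lock_irqsave(&napi_hash_lock, flags);
		WARN_ON_ONCE(napi_by_id(napi_id));
		__napi_hash_add_with_id(napi, napi_id);
		spin_unlock_irqrestore(&napi_hash_lock, flags);
	}

	/* napi_hash_add() and napi_hash_del() would need the same
	 * irqsave/irqrestore treatment, so a hard IRQ that takes a driver
	 * lock (e.g. pcnet32's lp->lock) can no longer fire while
	 * napi_hash_lock is held.
	 */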
On 11/27/24 12:00, Joe Damato wrote:
> On Wed, Nov 27, 2024 at 10:51:16AM -0800, Joe Damato wrote:
>> On Wed, Nov 27, 2024 at 09:43:54AM -0800, Guenter Roeck wrote:
>>> Hi,
>>>
>>> On Fri, Oct 11, 2024 at 06:45:00PM +0000, Joe Damato wrote:
>>>> Add a persistent NAPI config area for NAPI configuration to the core.
>>>> Drivers opt-in to setting the persistent config for a NAPI by passing an
>>>> index when calling netif_napi_add_config.
>>>>
>>>> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
>>>> (after the NAPIs are deleted).
>>>>
>>>> Drivers which call netif_napi_add_config will have persistent per-NAPI
>>>> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>>>>
>>>> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>>>>
>>>> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>>> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
>>>> Signed-off-by: Joe Damato <jdamato@fastly.com>
>>>> Reviewed-by: Jakub Kicinski <kuba@kernel.org>
>>>
>>> This patch triggers a lock inversion message on pcnet Ethernet adapters.
>>
>> Thanks for the report. I am not familiar with the pcnet driver, but
>> took some time now to read the report below and the driver code.
>>
>> I could definitely be reading the output incorrectly (if so please
>> let me know), but it seems like the issue can be triggered in this
>> case:
>
> Sorry, my apologies, I both:
> - read the report incorrectly, and
> - proposed a bad patch that would result in a deadlock :)
>
> After re-reading it and running this by Martin (who is CC'd), the
> inversion is actually:
>
> CPU 0:
> pcnet32_open
> lock(lp->lock)
> napi_enable
> napi_hash_add <- before this executes, CPU 1 proceeds
> lock(napi_hash_lock)
> CPU 1:
> pcnet32_close
> napi_disable
> napi_hash_del
> lock(napi_hash_lock)
> < INTERRUPT >
> pcnet32_interrupt
> lock(lp->lock)
>
> This is now an inversion because:
>
> CPU 0: holds lp->lock and is about to take napi_hash_lock
> CPU 1: holds napi_hash_lock and an IRQ firing on CPU 1 tries to take
> lp->lock (which CPU 0 already holds)
>
> Neither side can proceed:
> - CPU 0 is stuck waiting for napi_hash_lock
> - CPU 1 is stuck waiting for lp->lock
>
> I think the below explanation is still correct as to why the
> identified commit causes the issue:
>
>> It seems this was triggered because before the identified commit,
>> napi_enable did not call napi_hash_add (and thus did not take the
>> napi_hash_lock).
>
> However, the previous patch I proposed for pcnet32 would also cause
> a deadlock as the watchdog timer's function also needs lp->lock.
>
> A corrected patch for pcnet32 can be found below.
>
> Guenter: Sorry, would you mind testing the below instead of the
> previous patch?
>
> Don: Let me know what you think about the below?
>
> Netdev maintainers, there is an alternate locking solution I can
> propose as an RFC that might avoid this class of problem if this
> sort of issue is more widespread than just pcnet32:
> - add the NAPI to the hash in netif_napi_add_weight (instead of napi_enable)
> - remove the NAPI from the hash in __netif_napi_del (instead of
> napi_disable)
>
> If changing the locking order in core is the desired route, then the
> patch below should be unnecessary, but:
>
> diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
> index 72db9f9e7bee..2e0077e68883 100644
> --- a/drivers/net/ethernet/amd/pcnet32.c
> +++ b/drivers/net/ethernet/amd/pcnet32.c
> @@ -2625,11 +2625,10 @@ static int pcnet32_close(struct net_device *dev)
>
> del_timer_sync(&lp->watchdog_timer);
>
> + spin_lock_irqsave(&lp->lock, flags);
> netif_stop_queue(dev);
> napi_disable(&lp->napi);
>
That is what I did, actually. Problem with that is that napi_disable()
wants to be able to sleep, thus the above triggers:
BUG: sleeping function called from invalid context at net/core/dev.c:6775
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1817, name: ip
preempt_count: 1, expected: 0
2 locks held by ip/1817:
#0: ffffffff81ded990 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x22a/0x74c
#1: ff6000000498ccb0 (&lp->lock){-.-.}-{3:3}, at: pcnet32_close+0x40/0x126
irq event stamp: 3720
hardirqs last enabled at (3719): [<ffffffff80decaf4>] _raw_spin_unlock_irqrestore+0x54/0x62
hardirqs last disabled at (3720): [<ffffffff80dec8a2>] _raw_spin_lock_irqsave+0x5e/0x64
softirqs last enabled at (3712): [<ffffffff8001efca>] handle_softirqs+0x3e6/0x4a2
softirqs last disabled at (3631): [<ffffffff80ded6cc>] __do_softirq+0x12/0x1a
CPU: 0 UID: 0 PID: 1817 Comm: ip Tainted: G N 6.12.0-10313-g7d4050728c83-dirty #1
Tainted: [N]=TEST
Hardware name: riscv-virtio,qemu (DT)
Call Trace:
[<ffffffff80006d42>] dump_backtrace+0x1c/0x24
[<ffffffff80dc8d94>] show_stack+0x2c/0x38
[<ffffffff80de00b0>] dump_stack_lvl+0x74/0xac
[<ffffffff80de00fc>] dump_stack+0x14/0x1c
[<ffffffff8004da18>] __might_resched+0x23e/0x248
[<ffffffff8004da60>] __might_sleep+0x3e/0x62
[<ffffffff80b8d370>] napi_disable+0x24/0x10c
[<ffffffff809a06fe>] pcnet32_close+0x6c/0x126
Guenter
On Fri, Oct 11, 2024 at 8:46 PM Joe Damato <jdamato@fastly.com> wrote:
>
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> Reviewed-by: Jakub Kicinski <kuba@kernel.org>

Reviewed-by: Eric Dumazet <edumazet@google.com>