[v3] net: sleepable ndo_set_rx_mode

[PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Stanislav Fomichev 2 weeks, 3 days ago

Add ndo_set_rx_mode_async callback that drivers can implement instead
of the legacy ndo_set_rx_mode. The legacy callback runs under the
netif_addr_lock spinlock with BHs disabled, preventing drivers from
sleeping. The async variant runs from a work queue with rtnl_lock and
netdev_lock_ops held, in fully sleepable context.

When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules
dev_rx_mode_work instead of calling the driver inline. The work
function takes two snapshots of each address list (uc/mc) under
the addr_lock, then drops the lock and calls the driver with the
work copies. After the driver returns, it reconciles the snapshots
back to the real lists under the lock.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 Documentation/networking/netdevices.rst |  8 +++
 include/linux/netdevice.h               | 20 ++++++
 net/core/dev.c                          | 95 +++++++++++++++++++++++--
 3 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 35704d115312..dc83d78d3b27 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -289,6 +289,14 @@ struct net_device synchronization rules
 ndo_set_rx_mode:
 	Synchronization: netif_addr_lock spinlock.
 	Context: BHs disabled
+	Notes: Deprecated in favor of sleepable ndo_set_rx_mode_async.
+
+ndo_set_rx_mode_async:
+	Synchronization: rtnl_lock() semaphore. In addition, netdev instance
+	lock if the driver implements queue management or shaper API.
+	Context: process (from a work queue)
+	Notes: Sleepable version of ndo_set_rx_mode. Receives snapshots
+	of the unicast and multicast address lists.
 
 ndo_setup_tc:
 	``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 469b7cdb3237..b05bdd67b807 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1117,6 +1117,16 @@ struct netdev_net_notifier {
  *	This function is called device changes address list filtering.
  *	If driver handles unicast address filtering, it should set
  *	IFF_UNICAST_FLT in its priv_flags.
+ *	Cannot sleep, called with netif_addr_lock_bh held.
+ *	Deprecated in favor of sleepable ndo_set_rx_mode_async.
+ *
+ * void (*ndo_set_rx_mode_async)(struct net_device *dev,
+ *				 struct netdev_hw_addr_list *uc,
+ *				 struct netdev_hw_addr_list *mc);
+ *	Sleepable version of ndo_set_rx_mode. Called from a work queue
+ *	with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
+ *	are snapshots of the address lists - iterate with
+ *	netdev_hw_addr_list_for_each(ha, uc).
  *
  * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
  *	This function  is called when the Media Access Control address
@@ -1437,6 +1447,9 @@ struct net_device_ops {
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
 	void			(*ndo_set_rx_mode)(struct net_device *dev);
+	void			(*ndo_set_rx_mode_async)(struct net_device *dev,
+							 struct netdev_hw_addr_list *uc,
+							 struct netdev_hw_addr_list *mc);
 	int			(*ndo_set_mac_address)(struct net_device *dev,
 						       void *addr);
 	int			(*ndo_validate_addr)(struct net_device *dev);
@@ -1903,6 +1916,7 @@ enum netdev_reg_state {
  *				has been enabled due to the need to listen to
  *				additional unicast addresses in a device that
  *				does not implement ndo_set_rx_mode()
+ *	@rx_mode_work:		Work queue entry for ndo_set_rx_mode_async()
  *	@uc:			unicast mac addresses
  *	@mc:			multicast mac addresses
  *	@dev_addrs:		list of device hw addresses
@@ -2293,6 +2307,7 @@ struct net_device {
 	unsigned int		promiscuity;
 	unsigned int		allmulti;
 	bool			uc_promisc;
+	struct work_struct	rx_mode_work;
 #ifdef CONFIG_LOCKDEP
 	unsigned char		nested_level;
 #endif
@@ -4661,6 +4676,11 @@ static inline bool netif_device_present(const struct net_device *dev)
 	return test_bit(__LINK_STATE_PRESENT, &dev->state);
 }
 
+static inline bool netif_up_and_present(const struct net_device *dev)
+{
+	return (dev->flags & IFF_UP) && netif_device_present(dev);
+}
+
 void netif_device_detach(struct net_device *dev);
 
 void netif_device_attach(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 200d44883fc1..fedc423306fc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2381,6 +2381,8 @@ static void netstamp_clear(struct work_struct *work)
 static DECLARE_WORK(netstamp_work, netstamp_clear);
 #endif
 
+static struct workqueue_struct *rx_mode_wq;
+
 void net_enable_timestamp(void)
 {
 #ifdef CONFIG_JUMP_LABEL
@@ -9669,22 +9671,84 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
 	return 0;
 }
 
-/*
- *	Upload unicast and multicast address lists to device and
- *	configure RX filtering. When the device doesn't support unicast
- *	filtering it is put in promiscuous mode while unicast addresses
- *	are present.
+static void dev_rx_mode_work(struct work_struct *work)
+{
+	struct net_device *dev = container_of(work, struct net_device,
+					      rx_mode_work);
+	struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int err;
+
+	__hw_addr_init(&uc_snap);
+	__hw_addr_init(&mc_snap);
+	__hw_addr_init(&uc_ref);
+	__hw_addr_init(&mc_ref);
+
+	rtnl_lock();
+	netdev_lock_ops(dev);
+
+	if (!netif_up_and_present(dev))
+		goto out;
+
+	if (ops->ndo_set_rx_mode_async) {
+		netif_addr_lock_bh(dev);
+
+		err = __hw_addr_list_snapshot(&uc_snap, &dev->uc,
+					      dev->addr_len);
+		if (!err)
+			err = __hw_addr_list_snapshot(&uc_ref, &dev->uc,
+						      dev->addr_len);
+		if (!err)
+			err = __hw_addr_list_snapshot(&mc_snap, &dev->mc,
+						      dev->addr_len);
+		if (!err)
+			err = __hw_addr_list_snapshot(&mc_ref, &dev->mc,
+						      dev->addr_len);
+		netif_addr_unlock_bh(dev);
+
+		if (err) {
+			netdev_WARN(dev, "failed to sync uc/mc addresses\n");
+			__hw_addr_flush(&uc_snap);
+			__hw_addr_flush(&uc_ref);
+			__hw_addr_flush(&mc_snap);
+			goto out;
+		}
+
+		ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
+
+		netif_addr_lock_bh(dev);
+		__hw_addr_list_reconcile(&dev->uc, &uc_snap,
+					 &uc_ref, dev->addr_len);
+		__hw_addr_list_reconcile(&dev->mc, &mc_snap,
+					 &mc_ref, dev->addr_len);
+		netif_addr_unlock_bh(dev);
+	}
+
+out:
+	netdev_unlock_ops(dev);
+	rtnl_unlock();
+}
+
+/**
+ * __dev_set_rx_mode() - upload unicast and multicast address lists to device
+ * and configure RX filtering.
+ * @dev: device
+ *
+ * When the device doesn't support unicast filtering it is put in promiscuous
+ * mode while unicast addresses are present.
  */
 void __dev_set_rx_mode(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
 	/* dev_open will call this function so the list will stay sane. */
-	if (!(dev->flags&IFF_UP))
+	if (!netif_up_and_present(dev))
 		return;
 
-	if (!netif_device_present(dev))
+	if (ops->ndo_set_rx_mode_async) {
+		queue_work(rx_mode_wq, &dev->rx_mode_work);
 		return;
+	}
 
 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 		/* Unicast addresses changes may only happen under the rtnl,
@@ -11708,6 +11772,16 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
+	/* Make sure all pending rx_mode work completes before returning.
+	 *
+	 * rx_mode_wq may be NULL during early boot:
+	 * core_initcall(netlink_proto_init) vs subsys_initcall(net_dev_init).
+	 *
+	 * Check current_work() to avoid flushing from the wq.
+	 */
+	if (rx_mode_wq && !current_work())
+		flush_workqueue(rx_mode_wq);
+
 	/* Wait for rcu callbacks to finish before next phase */
 	if (!list_empty(&list))
 		rcu_barrier();
@@ -12099,6 +12173,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #endif
 
 	mutex_init(&dev->lock);
+	INIT_WORK(&dev->rx_mode_work, dev_rx_mode_work);
 
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
@@ -12203,6 +12278,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 
+	cancel_work_sync(&dev->rx_mode_work);
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
@@ -13296,6 +13373,10 @@ static int __init net_dev_init(void)
 	if (register_pernet_device(&default_device_ops))
 		goto out;
 
+	rx_mode_wq = alloc_ordered_workqueue("rx_mode_wq", 0);
+	if (!rx_mode_wq)
+		goto out;
+
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
-- 
2.53.0

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Jakub Kicinski 1 week, 6 days ago

On Thu, 19 Mar 2026 18:24:51 -0700 Stanislav Fomichev wrote:
> diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
> index 35704d115312..dc83d78d3b27 100644
> --- a/Documentation/networking/netdevices.rst
> +++ b/Documentation/networking/netdevices.rst
> @@ -289,6 +289,14 @@ struct net_device synchronization rules
>  ndo_set_rx_mode:
>  	Synchronization: netif_addr_lock spinlock.
>  	Context: BHs disabled
> +	Notes: Deprecated in favor of sleepable ndo_set_rx_mode_async.
> 
> +ndo_set_rx_mode_async:
> +	Synchronization: rtnl_lock() semaphore. In addition, netdev instance
> +	lock if the driver implements queue management or shaper API.
> +	Context: process (from a work queue)
> +	Notes: Sleepable version of ndo_set_rx_mode. Receives snapshots

It's probably just my weirdness but I find creating adjectives out 
of random nouns by adding "-able" to be in poor taste. "sleepable"
took root in certain three letter subsystems but I hope it won't
in netdev.

Please use your words:

	Notes: Async version of ndo_set_rx_mode which runs in process
	context. Receives snapshots f the unicast and multicast address lists.

> +	of the unicast and multicast address lists.
>  
>  ndo_setup_tc:
>  	``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 469b7cdb3237..b05bdd67b807 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1117,6 +1117,16 @@ struct netdev_net_notifier {
>   *	This function is called device changes address list filtering.
>   *	If driver handles unicast address filtering, it should set
>   *	IFF_UNICAST_FLT in its priv_flags.
> + *	Cannot sleep, called with netif_addr_lock_bh held.
> + *	Deprecated in favor of sleepable ndo_set_rx_mode_async.
> + *
> + * void (*ndo_set_rx_mode_async)(struct net_device *dev,
> + *				 struct netdev_hw_addr_list *uc,
> + *				 struct netdev_hw_addr_list *mc);
> + *	Sleepable version of ndo_set_rx_mode. Called from a work queue
> + *	with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
> + *	are snapshots of the address lists - iterate with
> + *	netdev_hw_addr_list_for_each(ha, uc).
>   *
>   * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
>   *	This function  is called when the Media Access Control address
> @@ -1437,6 +1447,9 @@ struct net_device_ops {
>  	void			(*ndo_change_rx_flags)(struct net_device *dev,
>  						       int flags);
>  	void			(*ndo_set_rx_mode)(struct net_device *dev);
> +	void			(*ndo_set_rx_mode_async)(struct net_device *dev,
> +							 struct netdev_hw_addr_list *uc,
> +							 struct netdev_hw_addr_list *mc);
>  	int			(*ndo_set_mac_address)(struct net_device *dev,
>  						       void *addr);
>  	int			(*ndo_validate_addr)(struct net_device *dev);
> @@ -1903,6 +1916,7 @@ enum netdev_reg_state {
>   *				has been enabled due to the need to listen to
>   *				additional unicast addresses in a device that
>   *				does not implement ndo_set_rx_mode()
> + *	@rx_mode_work:		Work queue entry for ndo_set_rx_mode_async()
>   *	@uc:			unicast mac addresses
>   *	@mc:			multicast mac addresses
>   *	@dev_addrs:		list of device hw addresses
> @@ -2293,6 +2307,7 @@ struct net_device {
>  	unsigned int		promiscuity;
>  	unsigned int		allmulti;
>  	bool			uc_promisc;
> +	struct work_struct	rx_mode_work;
>  #ifdef CONFIG_LOCKDEP
>  	unsigned char		nested_level;
>  #endif
> @@ -4661,6 +4676,11 @@ static inline bool netif_device_present(const struct net_device *dev)
>  	return test_bit(__LINK_STATE_PRESENT, &dev->state);
>  }
>  
> +static inline bool netif_up_and_present(const struct net_device *dev)
> +{
> +	return (dev->flags & IFF_UP) && netif_device_present(dev);

Is this really worth a dedicated helper? What are you trying to express
here semantically?

> +
>  void netif_device_detach(struct net_device *dev);
>  
>  void netif_device_attach(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 200d44883fc1..fedc423306fc 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2381,6 +2381,8 @@ static void netstamp_clear(struct work_struct *work)
>  static DECLARE_WORK(netstamp_work, netstamp_clear);
>  #endif
>  
> +static struct workqueue_struct *rx_mode_wq;
> +
>  void net_enable_timestamp(void)
>  {
>  #ifdef CONFIG_JUMP_LABEL
> @@ -9669,22 +9671,84 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
>  	return 0;
>  }
>  
> -/*
> - *	Upload unicast and multicast address lists to device and
> - *	configure RX filtering. When the device doesn't support unicast
> - *	filtering it is put in promiscuous mode while unicast addresses
> - *	are present.
> +static void dev_rx_mode_work(struct work_struct *work)
> +{
> +	struct net_device *dev = container_of(work, struct net_device,
> +					      rx_mode_work);
> +	struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
> +	const struct net_device_ops *ops = dev->netdev_ops;
> +	int err;
> +
> +	__hw_addr_init(&uc_snap);
> +	__hw_addr_init(&mc_snap);
> +	__hw_addr_init(&uc_ref);
> +	__hw_addr_init(&mc_ref);
> +
> +	rtnl_lock();
> +	netdev_lock_ops(dev);
> +
> +	if (!netif_up_and_present(dev))
> +		goto out;
> +
> +	if (ops->ndo_set_rx_mode_async) {

How did we get here if we don't have this op?
Are you planning to plumb more code thru this work in the future?
If yes the whole rx_mode handling should be in a dedicated helper
rather than indenting most of the function.

> +		netif_addr_lock_bh(dev);
> +
> +		err = __hw_addr_list_snapshot(&uc_snap, &dev->uc,
> +					      dev->addr_len);
> +		if (!err)
> +			err = __hw_addr_list_snapshot(&uc_ref, &dev->uc,
> +						      dev->addr_len);
> +		if (!err)
> +			err = __hw_addr_list_snapshot(&mc_snap, &dev->mc,
> +						      dev->addr_len);
> +		if (!err)
> +			err = __hw_addr_list_snapshot(&mc_ref, &dev->mc,
> +						      dev->addr_len);

This doesn't get slow with a few thousands of addresses?

> +		netif_addr_unlock_bh(dev);
> +
> +		if (err) {
> +			netdev_WARN(dev, "failed to sync uc/mc addresses\n");
> +			__hw_addr_flush(&uc_snap);
> +			__hw_addr_flush(&uc_ref);
> +			__hw_addr_flush(&mc_snap);
> +			goto out;
> +		}
> +
> +		ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
> +
> +		netif_addr_lock_bh(dev);
> +		__hw_addr_list_reconcile(&dev->uc, &uc_snap,
> +					 &uc_ref, dev->addr_len);
> +		__hw_addr_list_reconcile(&dev->mc, &mc_snap,
> +					 &mc_ref, dev->addr_len);
> +		netif_addr_unlock_bh(dev);
> +	}
> +
> +out:
> +	netdev_unlock_ops(dev);
> +	rtnl_unlock();
> +}
> +
> +/**
> + * __dev_set_rx_mode() - upload unicast and multicast address lists to device
> + * and configure RX filtering.
> + * @dev: device
> + *
> + * When the device doesn't support unicast filtering it is put in promiscuous
> + * mode while unicast addresses are present.
>   */
>  void __dev_set_rx_mode(struct net_device *dev)
>  {
>  	const struct net_device_ops *ops = dev->netdev_ops;
>  
>  	/* dev_open will call this function so the list will stay sane. */
> -	if (!(dev->flags&IFF_UP))
> +	if (!netif_up_and_present(dev))
>  		return;
>  
> -	if (!netif_device_present(dev))
> +	if (ops->ndo_set_rx_mode_async) {
> +		queue_work(rx_mode_wq, &dev->rx_mode_work);
>  		return;
> +	}
>  
>  	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
>  		/* Unicast addresses changes may only happen under the rtnl,
> @@ -11708,6 +11772,16 @@ void netdev_run_todo(void)
>  
>  	__rtnl_unlock();
>  
> +	/* Make sure all pending rx_mode work completes before returning.
> +	 *
> +	 * rx_mode_wq may be NULL during early boot:
> +	 * core_initcall(netlink_proto_init) vs subsys_initcall(net_dev_init).
> +	 *
> +	 * Check current_work() to avoid flushing from the wq.
> +	 */
> +	if (rx_mode_wq && !current_work())
> +		flush_workqueue(rx_mode_wq);

Can we give the work a reference on the netdev (at init time) and
cancel + release it here instead of flushing / waiting?

>  	/* Wait for rcu callbacks to finish before next phase */
>  	if (!list_empty(&list))
>  		rcu_barrier();
> @@ -12099,6 +12173,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  #endif
>  
>  	mutex_init(&dev->lock);
> +	INIT_WORK(&dev->rx_mode_work, dev_rx_mode_work);
>  
>  	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
>  	setup(dev);
> @@ -12203,6 +12278,8 @@ void free_netdev(struct net_device *dev)
>  
>  	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
>  
> +	cancel_work_sync(&dev->rx_mode_work);

Should never happen so maybe wrap it in a WARN ?


>  	/* Flush device addresses */
>  	dev_addr_flush(dev);
>  
> @@ -13296,6 +13373,10 @@ static int __init net_dev_init(void)
>  	if (register_pernet_device(&default_device_ops))
>  		goto out;
>  
> +	rx_mode_wq = alloc_ordered_workqueue("rx_mode_wq", 0);
> +	if (!rx_mode_wq)
> +		goto out;
> +
>  	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
>  	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
>

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Stanislav Fomichev 1 week, 5 days ago

On 03/23, Jakub Kicinski wrote:
> On Thu, 19 Mar 2026 18:24:51 -0700 Stanislav Fomichev wrote:
> > diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
> > index 35704d115312..dc83d78d3b27 100644
> > --- a/Documentation/networking/netdevices.rst
> > +++ b/Documentation/networking/netdevices.rst
> > @@ -289,6 +289,14 @@ struct net_device synchronization rules
> >  ndo_set_rx_mode:
> >  	Synchronization: netif_addr_lock spinlock.
> >  	Context: BHs disabled
> > +	Notes: Deprecated in favor of sleepable ndo_set_rx_mode_async.
> > 
> > +ndo_set_rx_mode_async:
> > +	Synchronization: rtnl_lock() semaphore. In addition, netdev instance
> > +	lock if the driver implements queue management or shaper API.
> > +	Context: process (from a work queue)
> > +	Notes: Sleepable version of ndo_set_rx_mode. Receives snapshots
> 
> It's probably just my weirdness but I find creating adjectives out 
> of random nouns by adding "-able" to be in poor taste. "sleepable"
> took root in certain three letter subsystems but I hope it won't
> in netdev.
> 
> Please use your words:
> 
> 	Notes: Async version of ndo_set_rx_mode which runs in process
> 	context. Receives snapshots f the unicast and multicast address lists.

SG, will do!
 
> > +	of the unicast and multicast address lists.
> >  
> >  ndo_setup_tc:
> >  	``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 469b7cdb3237..b05bdd67b807 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -1117,6 +1117,16 @@ struct netdev_net_notifier {
> >   *	This function is called device changes address list filtering.
> >   *	If driver handles unicast address filtering, it should set
> >   *	IFF_UNICAST_FLT in its priv_flags.
> > + *	Cannot sleep, called with netif_addr_lock_bh held.
> > + *	Deprecated in favor of sleepable ndo_set_rx_mode_async.
> > + *
> > + * void (*ndo_set_rx_mode_async)(struct net_device *dev,
> > + *				 struct netdev_hw_addr_list *uc,
> > + *				 struct netdev_hw_addr_list *mc);
> > + *	Sleepable version of ndo_set_rx_mode. Called from a work queue
> > + *	with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
> > + *	are snapshots of the address lists - iterate with
> > + *	netdev_hw_addr_list_for_each(ha, uc).
> >   *
> >   * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
> >   *	This function  is called when the Media Access Control address
> > @@ -1437,6 +1447,9 @@ struct net_device_ops {
> >  	void			(*ndo_change_rx_flags)(struct net_device *dev,
> >  						       int flags);
> >  	void			(*ndo_set_rx_mode)(struct net_device *dev);
> > +	void			(*ndo_set_rx_mode_async)(struct net_device *dev,
> > +							 struct netdev_hw_addr_list *uc,
> > +							 struct netdev_hw_addr_list *mc);
> >  	int			(*ndo_set_mac_address)(struct net_device *dev,
> >  						       void *addr);
> >  	int			(*ndo_validate_addr)(struct net_device *dev);
> > @@ -1903,6 +1916,7 @@ enum netdev_reg_state {
> >   *				has been enabled due to the need to listen to
> >   *				additional unicast addresses in a device that
> >   *				does not implement ndo_set_rx_mode()
> > + *	@rx_mode_work:		Work queue entry for ndo_set_rx_mode_async()
> >   *	@uc:			unicast mac addresses
> >   *	@mc:			multicast mac addresses
> >   *	@dev_addrs:		list of device hw addresses
> > @@ -2293,6 +2307,7 @@ struct net_device {
> >  	unsigned int		promiscuity;
> >  	unsigned int		allmulti;
> >  	bool			uc_promisc;
> > +	struct work_struct	rx_mode_work;
> >  #ifdef CONFIG_LOCKDEP
> >  	unsigned char		nested_level;
> >  #endif
> > @@ -4661,6 +4676,11 @@ static inline bool netif_device_present(const struct net_device *dev)
> >  	return test_bit(__LINK_STATE_PRESENT, &dev->state);
> >  }
> >  
> > +static inline bool netif_up_and_present(const struct net_device *dev)
> > +{
> > +	return (dev->flags & IFF_UP) && netif_device_present(dev);
> 
> Is this really worth a dedicated helper? What are you trying to express
> here semantically?

I mostly added it to avoid repeating the same present & UP check. Will
undo.

> > +
> >  void netif_device_detach(struct net_device *dev);
> >  
> >  void netif_device_attach(struct net_device *dev);
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 200d44883fc1..fedc423306fc 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -2381,6 +2381,8 @@ static void netstamp_clear(struct work_struct *work)
> >  static DECLARE_WORK(netstamp_work, netstamp_clear);
> >  #endif
> >  
> > +static struct workqueue_struct *rx_mode_wq;
> > +
> >  void net_enable_timestamp(void)
> >  {
> >  #ifdef CONFIG_JUMP_LABEL
> > @@ -9669,22 +9671,84 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
> >  	return 0;
> >  }
> >  
> > -/*
> > - *	Upload unicast and multicast address lists to device and
> > - *	configure RX filtering. When the device doesn't support unicast
> > - *	filtering it is put in promiscuous mode while unicast addresses
> > - *	are present.
> > +static void dev_rx_mode_work(struct work_struct *work)
> > +{
> > +	struct net_device *dev = container_of(work, struct net_device,
> > +					      rx_mode_work);
> > +	struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
> > +	const struct net_device_ops *ops = dev->netdev_ops;
> > +	int err;
> > +
> > +	__hw_addr_init(&uc_snap);
> > +	__hw_addr_init(&mc_snap);
> > +	__hw_addr_init(&uc_ref);
> > +	__hw_addr_init(&mc_ref);
> > +
> > +	rtnl_lock();
> > +	netdev_lock_ops(dev);
> > +
> > +	if (!netif_up_and_present(dev))
> > +		goto out;
> > +
> > +	if (ops->ndo_set_rx_mode_async) {
> 
> How did we get here if we don't have this op?
> Are you planning to plumb more code thru this work in the future?
> If yes the whole rx_mode handling should be in a dedicated helper
> rather than indenting most of the function.

I do expand this, yes, in the subsequent patches. With promisc
handling (ndo_change_rx_flags) and ndo_set_rx_mode fallback. Let me try
to see if I can add some helper+struct to manage a set of snapshots
to make it a bit more clear.

> > +		netif_addr_lock_bh(dev);
> > +
> > +		err = __hw_addr_list_snapshot(&uc_snap, &dev->uc,
> > +					      dev->addr_len);
> > +		if (!err)
> > +			err = __hw_addr_list_snapshot(&uc_ref, &dev->uc,
> > +						      dev->addr_len);
> > +		if (!err)
> > +			err = __hw_addr_list_snapshot(&mc_snap, &dev->mc,
> > +						      dev->addr_len);
> > +		if (!err)
> > +			err = __hw_addr_list_snapshot(&mc_ref, &dev->mc,
> > +						      dev->addr_len);
> 
> This doesn't get slow with a few thousands of addresses?

I can add kunit benchmark and attach the output? Although not sure where
to go from that. The alternative to this is allocating an array of entries.
I started with that initially but __hw_addr_sync_dev wants to kfree the
individual entries and I decided not to have a separate helpers to
manage the snapshots.

> > +		netif_addr_unlock_bh(dev);
> > +
> > +		if (err) {
> > +			netdev_WARN(dev, "failed to sync uc/mc addresses\n");
> > +			__hw_addr_flush(&uc_snap);
> > +			__hw_addr_flush(&uc_ref);
> > +			__hw_addr_flush(&mc_snap);
> > +			goto out;
> > +		}
> > +
> > +		ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
> > +
> > +		netif_addr_lock_bh(dev);
> > +		__hw_addr_list_reconcile(&dev->uc, &uc_snap,
> > +					 &uc_ref, dev->addr_len);
> > +		__hw_addr_list_reconcile(&dev->mc, &mc_snap,
> > +					 &mc_ref, dev->addr_len);
> > +		netif_addr_unlock_bh(dev);
> > +	}
> > +
> > +out:
> > +	netdev_unlock_ops(dev);
> > +	rtnl_unlock();
> > +}
> > +
> > +/**
> > + * __dev_set_rx_mode() - upload unicast and multicast address lists to device
> > + * and configure RX filtering.
> > + * @dev: device
> > + *
> > + * When the device doesn't support unicast filtering it is put in promiscuous
> > + * mode while unicast addresses are present.
> >   */
> >  void __dev_set_rx_mode(struct net_device *dev)
> >  {
> >  	const struct net_device_ops *ops = dev->netdev_ops;
> >  
> >  	/* dev_open will call this function so the list will stay sane. */
> > -	if (!(dev->flags&IFF_UP))
> > +	if (!netif_up_and_present(dev))
> >  		return;
> >  
> > -	if (!netif_device_present(dev))
> > +	if (ops->ndo_set_rx_mode_async) {
> > +		queue_work(rx_mode_wq, &dev->rx_mode_work);
> >  		return;
> > +	}
> >  
> >  	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
> >  		/* Unicast addresses changes may only happen under the rtnl,
> > @@ -11708,6 +11772,16 @@ void netdev_run_todo(void)
> >  
> >  	__rtnl_unlock();
> >  
> > +	/* Make sure all pending rx_mode work completes before returning.
> > +	 *
> > +	 * rx_mode_wq may be NULL during early boot:
> > +	 * core_initcall(netlink_proto_init) vs subsys_initcall(net_dev_init).
> > +	 *
> > +	 * Check current_work() to avoid flushing from the wq.
> > +	 */
> > +	if (rx_mode_wq && !current_work())
> > +		flush_workqueue(rx_mode_wq);
> 
> Can we give the work a reference on the netdev (at init time) and
> cancel + release it here instead of flushing / waiting?

Not sure why cancel+release, maybe you're thinking about the unregister
path? This is rtnl_unlock -> netdev_run_todo -> __rtnl_unlock + some
extras.

And the flush is here to plumb the addresses to the real devices
before we return to the callers. Mostly because of the following
things we have in the tests:

# TEST: team cleanup mode lacp                                        [FAIL]
#       macvlan unicast address not found on a slave

Can you explain a bit more on the suggestion?

> >  	/* Wait for rcu callbacks to finish before next phase */
> >  	if (!list_empty(&list))
> >  		rcu_barrier();
> > @@ -12099,6 +12173,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> >  #endif
> >  
> >  	mutex_init(&dev->lock);
> > +	INIT_WORK(&dev->rx_mode_work, dev_rx_mode_work);
> >  
> >  	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
> >  	setup(dev);
> > @@ -12203,6 +12278,8 @@ void free_netdev(struct net_device *dev)
> >  
> >  	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
> >  
> > +	cancel_work_sync(&dev->rx_mode_work);
> 
> Should never happen so maybe wrap it in a WARN ?

Or maybe just flush_workqueue here as well? To signal the intent that we
are mostly waiting for the wq entry to be unused to be able to kfree it?

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Jakub Kicinski 1 week, 5 days ago

On Tue, 24 Mar 2026 11:13:04 -0700 Stanislav Fomichev wrote:
> > > +		netif_addr_lock_bh(dev);
> > > +
> > > +		err = __hw_addr_list_snapshot(&uc_snap, &dev->uc,
> > > +					      dev->addr_len);
> > > +		if (!err)
> > > +			err = __hw_addr_list_snapshot(&uc_ref, &dev->uc,
> > > +						      dev->addr_len);
> > > +		if (!err)
> > > +			err = __hw_addr_list_snapshot(&mc_snap, &dev->mc,
> > > +						      dev->addr_len);
> > > +		if (!err)
> > > +			err = __hw_addr_list_snapshot(&mc_ref, &dev->mc,
> > > +						      dev->addr_len);  
> > 
> > This doesn't get slow with a few thousands of addresses?  
> 
> I can add kunit benchmark and attach the output? Although not sure where
> to go from that. The alternative to this is allocating an array of entries.
> I started with that initially but __hw_addr_sync_dev wants to kfree the
> individual entries and I decided not to have a separate helpers to
> manage the snapshots.

Let's see what the benchmark says. Hopefully it's fast enough and 
we don't have to worry. Is keeping these lists around between the
invocations of the work tricky?

> > Can we give the work a reference on the netdev (at init time) and
> > cancel + release it here instead of flushing / waiting?  
> 
> Not sure why cancel+release, maybe you're thinking about the unregister
> path? This is rtnl_unlock -> netdev_run_todo -> __rtnl_unlock + some
> extras.
> 
> And the flush is here to plumb the addresses to the real devices
> before we return to the callers. Mostly because of the following
> things we have in the tests:
> 
> # TEST: team cleanup mode lacp                                        [FAIL]
> #       macvlan unicast address not found on a slave
> 
> Can you explain a bit more on the suggestion?

Oh, I thought it's here for unregister! Feels like it'd be cleaner to
add the flush in dev_*c_add() and friends? How hard would it be to
identify the callers in atomic context?

> > >  	/* Wait for rcu callbacks to finish before next phase */
> > >  	if (!list_empty(&list))
> > >  		rcu_barrier();
> > > @@ -12099,6 +12173,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> > >  #endif
> > >  
> > >  	mutex_init(&dev->lock);
> > > +	INIT_WORK(&dev->rx_mode_work, dev_rx_mode_work);
> > >  
> > >  	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
> > >  	setup(dev);
> > > @@ -12203,6 +12278,8 @@ void free_netdev(struct net_device *dev)
> > >  
> > >  	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
> > >  
> > > +	cancel_work_sync(&dev->rx_mode_work);  
> > 
> > Should never happen so maybe wrap it in a WARN ?  
> 
> Or maybe just flush_workqueue here as well? To signal the intent that we
> are mostly waiting for the wq entry to be unused to be able to kfree it?
>

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Stanislav Fomichev 1 week, 5 days ago

On 03/24, Jakub Kicinski wrote:
> On Tue, 24 Mar 2026 11:13:04 -0700 Stanislav Fomichev wrote:
> > > > +		netif_addr_lock_bh(dev);
> > > > +
> > > > +		err = __hw_addr_list_snapshot(&uc_snap, &dev->uc,
> > > > +					      dev->addr_len);
> > > > +		if (!err)
> > > > +			err = __hw_addr_list_snapshot(&uc_ref, &dev->uc,
> > > > +						      dev->addr_len);
> > > > +		if (!err)
> > > > +			err = __hw_addr_list_snapshot(&mc_snap, &dev->mc,
> > > > +						      dev->addr_len);
> > > > +		if (!err)
> > > > +			err = __hw_addr_list_snapshot(&mc_ref, &dev->mc,
> > > > +						      dev->addr_len);  
> > > 
> > > This doesn't get slow with a few thousands of addresses?  
> > 
> > I can add kunit benchmark and attach the output? Although not sure where
> > to go from that. The alternative to this is allocating an array of entries.
> > I started with that initially but __hw_addr_sync_dev wants to kfree the
> > individual entries and I decided not to have a separate helpers to
> > manage the snapshots.
> 
> Let's see what the benchmark says. Hopefully it's fast enough and 
> we don't have to worry. Is keeping these lists around between the
> invocations of the work tricky?

Yeah, that sounds doable, don't think it's too tricky, just extra
list_head on net_device + change the alloc/free to use it.
And then we keep this cache around until unregister? I will try to add it as
a separate patch to cache these entries to keep it simple for review..
 
> > > Can we give the work a reference on the netdev (at init time) and
> > > cancel + release it here instead of flushing / waiting?  
> > 
> > Not sure why cancel+release, maybe you're thinking about the unregister
> > path? This is rtnl_unlock -> netdev_run_todo -> __rtnl_unlock + some
> > extras.
> > 
> > And the flush is here to plumb the addresses to the real devices
> > before we return to the callers. Mostly because of the following
> > things we have in the tests:
> > 
> > # TEST: team cleanup mode lacp                                        [FAIL]
> > #       macvlan unicast address not found on a slave
> > 
> > Can you explain a bit more on the suggestion?
> 
> Oh, I thought it's here for unregister! Feels like it'd be cleaner to
> add the flush in dev_*c_add() and friends? How hard would it be to
> identify the callers in atomic context?

Not sure we can do it in dev_xc_add because it runs under rtnl :-(
I currently do flush in netdev_run_todo because that's the place that
doesn't hold rtnl. Otherwise flush will get stuck because the work
handler grabs it...

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Jakub Kicinski 1 week, 5 days ago

On Tue, 24 Mar 2026 15:49:27 -0700 Stanislav Fomichev wrote:
> > > Not sure why cancel+release, maybe you're thinking about the unregister
> > > path? This is rtnl_unlock -> netdev_run_todo -> __rtnl_unlock + some
> > > extras.
> > > 
> > > And the flush is here to plumb the addresses to the real devices
> > > before we return to the callers. Mostly because of the following
> > > things we have in the tests:
> > > 
> > > # TEST: team cleanup mode lacp                                        [FAIL]
> > > #       macvlan unicast address not found on a slave
> > > 
> > > Can you explain a bit more on the suggestion?  
> > 
> > Oh, I thought it's here for unregister! Feels like it'd be cleaner to
> > add the flush in dev_*c_add() and friends? How hard would it be to
> > identify the callers in atomic context?  
> 
> Not sure we can do it in dev_xc_add because it runs under rtnl :-(
> I currently do flush in netdev_run_todo because that's the place that
> doesn't hold rtnl. Otherwise flush will get stuck because the work
> handler grabs it...

I was thinking of something a'la linkwatch. We can "steal" / "flush"
the pending work inline. I guess linkwatch is a major source of races
over the years...

Does the macvlan + team problem still happens with the current
implementation minus the flush? We are only flushing once so only
pushing the addresses thru one layer of async callbacks.

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Stanislav Fomichev 1 week, 4 days ago

On 03/24, Jakub Kicinski wrote:
> On Tue, 24 Mar 2026 15:49:27 -0700 Stanislav Fomichev wrote:
> > > > Not sure why cancel+release, maybe you're thinking about the unregister
> > > > path? This is rtnl_unlock -> netdev_run_todo -> __rtnl_unlock + some
> > > > extras.
> > > > 
> > > > And the flush is here to plumb the addresses to the real devices
> > > > before we return to the callers. Mostly because of the following
> > > > things we have in the tests:
> > > > 
> > > > # TEST: team cleanup mode lacp                                        [FAIL]
> > > > #       macvlan unicast address not found on a slave
> > > > 
> > > > Can you explain a bit more on the suggestion?  
> > > 
> > > Oh, I thought it's here for unregister! Feels like it'd be cleaner to
> > > add the flush in dev_*c_add() and friends? How hard would it be to
> > > identify the callers in atomic context?  
> > 
> > Not sure we can do it in dev_xc_add because it runs under rtnl :-(
> > I currently do flush in netdev_run_todo because that's the place that
> > doesn't hold rtnl. Otherwise flush will get stuck because the work
> > handler grabs it...
> 
> I was thinking of something a'la linkwatch. We can "steal" / "flush"
> the pending work inline. I guess linkwatch is a major source of races
> over the years...
>
> Does the macvlan + team problem still happens with the current
> implementation minus the flush? We are only flushing once so only
> pushing the addresses thru one layer of async callbacks.

Yes, it does happen consistently when I remove the flush. It also
happens with my internal v4, so I need to look again at what's going on.
Not sure whether it's my internal regression or I was just sloppy/lucky
(since you're correct in pointing out that we flush only once).

Before I went down the workqueue route, I had a simple
net_todo_list-like approach: `list_add_tail` on enqueue and
`while(!list_empty) run_work()` on rtnl_unlock. This had a nice properly of
tracking re-submissions (by checking whether the device's list_head is
linked into the list or not) and it was relatively easy to do the
recursive flush. Let me try get back to this approach and see whether
it solves the flush? Not sure what wq buys us at this point.

Re: [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Stanislav Fomichev 1 week, 4 days ago

On 03/25, Stanislav Fomichev wrote:
> On 03/24, Jakub Kicinski wrote:
> > On Tue, 24 Mar 2026 15:49:27 -0700 Stanislav Fomichev wrote:
> > > > > Not sure why cancel+release, maybe you're thinking about the unregister
> > > > > path? This is rtnl_unlock -> netdev_run_todo -> __rtnl_unlock + some
> > > > > extras.
> > > > > 
> > > > > And the flush is here to plumb the addresses to the real devices
> > > > > before we return to the callers. Mostly because of the following
> > > > > things we have in the tests:
> > > > > 
> > > > > # TEST: team cleanup mode lacp                                        [FAIL]
> > > > > #       macvlan unicast address not found on a slave
> > > > > 
> > > > > Can you explain a bit more on the suggestion?  
> > > > 
> > > > Oh, I thought it's here for unregister! Feels like it'd be cleaner to
> > > > add the flush in dev_*c_add() and friends? How hard would it be to
> > > > identify the callers in atomic context?  
> > > 
> > > Not sure we can do it in dev_xc_add because it runs under rtnl :-(
> > > I currently do flush in netdev_run_todo because that's the place that
> > > doesn't hold rtnl. Otherwise flush will get stuck because the work
> > > handler grabs it...
> > 
> > I was thinking of something a'la linkwatch. We can "steal" / "flush"
> > the pending work inline. I guess linkwatch is a major source of races
> > over the years...
> >
> > Does the macvlan + team problem still happens with the current
> > implementation minus the flush? We are only flushing once so only
> > pushing the addresses thru one layer of async callbacks.
> 
> Yes, it does happen consistently when I remove the flush. It also
> happens with my internal v4, so I need to look again at what's going on.
> Not sure whether it's my internal regression or I was just sloppy/lucky
> (since you're correct in pointing out that we flush only once).

Hmm, the test does 'team -d' in the background. That's why it works for
bonding, but not the teaming. I'll update the test to a bunch of
'ip' commands instead of starting a daemon..

> Before I went down the workqueue route, I had a simple
> net_todo_list-like approach: `list_add_tail` on enqueue and
> `while(!list_empty) run_work()` on rtnl_unlock. This had a nice properly of
> tracking re-submissions (by checking whether the device's list_head is
> linked into the list or not) and it was relatively easy to do the
> recursive flush. Let me try get back to this approach and see whether
> it solves the flush? Not sure what wq buys us at this point.

Will still look into that, maybe something similar to the linkwatch as
you mentioned.

RE: [Intel-wired-lan] [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Loktionov, Aleksandr 2 weeks, 3 days ago


> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf
> Of Stanislav Fomichev
> Sent: Friday, March 20, 2026 2:25 AM
> To: netdev@vger.kernel.org
> Cc: davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; horms@kernel.org; corbet@lwn.net;
> skhan@linuxfoundation.org; andrew+netdev@lunn.ch;
> michael.chan@broadcom.com; pavan.chebbi@broadcom.com; Nguyen, Anthony
> L <anthony.l.nguyen@intel.com>; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; saeedm@nvidia.com; tariqt@nvidia.com;
> mbloch@nvidia.com; alexanderduyck@fb.com; kernel-team@meta.com;
> johannes@sipsolutions.net; sd@queasysnail.net; jianbol@nvidia.com;
> dtatulea@nvidia.com; sdf@fomichev.me; mohsin.bashr@gmail.com; Keller,
> Jacob E <jacob.e.keller@intel.com>; willemb@google.com;
> skhawaja@google.com; bestswngs@gmail.com; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; kees@kernel.org; linux-
> doc@vger.kernel.org; linux-kernel@vger.kernel.org; intel-wired-
> lan@lists.osuosl.org; linux-rdma@vger.kernel.org; linux-
> wireless@vger.kernel.org; linux-kselftest@vger.kernel.org;
> leon@kernel.org
> Subject: [Intel-wired-lan] [PATCH net-next v3 03/13] net: introduce
> ndo_set_rx_mode_async and dev_rx_mode_work
> 
> Add ndo_set_rx_mode_async callback that drivers can implement instead
> of the legacy ndo_set_rx_mode. The legacy callback runs under the
> netif_addr_lock spinlock with BHs disabled, preventing drivers from
> sleeping. The async variant runs from a work queue with rtnl_lock and
> netdev_lock_ops held, in fully sleepable context.
> 
> When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules
> dev_rx_mode_work instead of calling the driver inline. The work
> function takes two snapshots of each address list (uc/mc) under the
> addr_lock, then drops the lock and calls the driver with the work
> copies. After the driver returns, it reconciles the snapshots back to
> the real lists under the lock.
> 
> Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
> ---
>  Documentation/networking/netdevices.rst |  8 +++
>  include/linux/netdevice.h               | 20 ++++++
>  net/core/dev.c                          | 95 +++++++++++++++++++++++-
> -
>  3 files changed, 116 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/networking/netdevices.rst
> b/Documentation/networking/netdevices.rst
> index 35704d115312..dc83d78d3b27 100644
> --- a/Documentation/networking/netdevices.rst
> +++ b/Documentation/networking/netdevices.rst
> @@ -289,6 +289,14 @@ struct net_device synchronization rules
>  ndo_set_rx_mode:
>  	Synchronization: netif_addr_lock spinlock.
>  	Context: BHs disabled

...

> to
> +device
> + * and configure RX filtering.
> + * @dev: device
> + *
> + * When the device doesn't support unicast filtering it is put in
> +promiscuous
> + * mode while unicast addresses are present.
>   */
>  void __dev_set_rx_mode(struct net_device *dev)  {
>  	const struct net_device_ops *ops = dev->netdev_ops;
> 
>  	/* dev_open will call this function so the list will stay sane.
> */
> -	if (!(dev->flags&IFF_UP))
> +	if (!netif_up_and_present(dev))
>  		return;
> 
> -	if (!netif_device_present(dev))
> +	if (ops->ndo_set_rx_mode_async) {
> +		queue_work(rx_mode_wq, &dev->rx_mode_work);
>  		return;
This early return skips the legacy core fallback below.
Before this patch, __dev_set_rx_mode() continued into the
existing unicast-filter handling when the device did not
advertise IFF_UNICAST_FLT.

After this patch, any driver that implements
ndo_set_rx_mode_async but does not set IFF_UNICAST_FLT
will never hit that fallback path.

+	}
> 
>  	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
>  		/* Unicast addresses changes may only happen under the
> rtnl, @@ -11708,6 +11772,16 @@ void netdev_run_todo(void)
> 
>  	__rtnl_unlock();
> 

...

>  	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
>  	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
> 
> --
> 2.53.0

Re: [Intel-wired-lan] [PATCH net-next v3 03/13] net: introduce ndo_set_rx_mode_async and dev_rx_mode_work

Posted by Stanislav Fomichev 2 weeks, 2 days ago

On 03/20, Loktionov, Aleksandr wrote:
> 
> 
> > -----Original Message-----
> > From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf
> > Of Stanislav Fomichev
> > Sent: Friday, March 20, 2026 2:25 AM
> > To: netdev@vger.kernel.org
> > Cc: davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> > pabeni@redhat.com; horms@kernel.org; corbet@lwn.net;
> > skhan@linuxfoundation.org; andrew+netdev@lunn.ch;
> > michael.chan@broadcom.com; pavan.chebbi@broadcom.com; Nguyen, Anthony
> > L <anthony.l.nguyen@intel.com>; Kitszel, Przemyslaw
> > <przemyslaw.kitszel@intel.com>; saeedm@nvidia.com; tariqt@nvidia.com;
> > mbloch@nvidia.com; alexanderduyck@fb.com; kernel-team@meta.com;
> > johannes@sipsolutions.net; sd@queasysnail.net; jianbol@nvidia.com;
> > dtatulea@nvidia.com; sdf@fomichev.me; mohsin.bashr@gmail.com; Keller,
> > Jacob E <jacob.e.keller@intel.com>; willemb@google.com;
> > skhawaja@google.com; bestswngs@gmail.com; Loktionov, Aleksandr
> > <aleksandr.loktionov@intel.com>; kees@kernel.org; linux-
> > doc@vger.kernel.org; linux-kernel@vger.kernel.org; intel-wired-
> > lan@lists.osuosl.org; linux-rdma@vger.kernel.org; linux-
> > wireless@vger.kernel.org; linux-kselftest@vger.kernel.org;
> > leon@kernel.org
> > Subject: [Intel-wired-lan] [PATCH net-next v3 03/13] net: introduce
> > ndo_set_rx_mode_async and dev_rx_mode_work
> > 
> > Add ndo_set_rx_mode_async callback that drivers can implement instead
> > of the legacy ndo_set_rx_mode. The legacy callback runs under the
> > netif_addr_lock spinlock with BHs disabled, preventing drivers from
> > sleeping. The async variant runs from a work queue with rtnl_lock and
> > netdev_lock_ops held, in fully sleepable context.
> > 
> > When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules
> > dev_rx_mode_work instead of calling the driver inline. The work
> > function takes two snapshots of each address list (uc/mc) under the
> > addr_lock, then drops the lock and calls the driver with the work
> > copies. After the driver returns, it reconciles the snapshots back to
> > the real lists under the lock.
> > 
> > Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> > Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
> > ---
> >  Documentation/networking/netdevices.rst |  8 +++
> >  include/linux/netdevice.h               | 20 ++++++
> >  net/core/dev.c                          | 95 +++++++++++++++++++++++-
> > -
> >  3 files changed, 116 insertions(+), 7 deletions(-)
> > 
> > diff --git a/Documentation/networking/netdevices.rst
> > b/Documentation/networking/netdevices.rst
> > index 35704d115312..dc83d78d3b27 100644
> > --- a/Documentation/networking/netdevices.rst
> > +++ b/Documentation/networking/netdevices.rst
> > @@ -289,6 +289,14 @@ struct net_device synchronization rules
> >  ndo_set_rx_mode:
> >  	Synchronization: netif_addr_lock spinlock.
> >  	Context: BHs disabled
> 
> ...
> 
> > to
> > +device
> > + * and configure RX filtering.
> > + * @dev: device
> > + *
> > + * When the device doesn't support unicast filtering it is put in
> > +promiscuous
> > + * mode while unicast addresses are present.
> >   */
> >  void __dev_set_rx_mode(struct net_device *dev)  {
> >  	const struct net_device_ops *ops = dev->netdev_ops;
> > 
> >  	/* dev_open will call this function so the list will stay sane.
> > */
> > -	if (!(dev->flags&IFF_UP))
> > +	if (!netif_up_and_present(dev))
> >  		return;
> > 
> > -	if (!netif_device_present(dev))
> > +	if (ops->ndo_set_rx_mode_async) {
> > +		queue_work(rx_mode_wq, &dev->rx_mode_work);
> >  		return;
> This early return skips the legacy core fallback below.
> Before this patch, __dev_set_rx_mode() continued into the
> existing unicast-filter handling when the device did not
> advertise IFF_UNICAST_FLT.
> 
> After this patch, any driver that implements
> ndo_set_rx_mode_async but does not set IFF_UNICAST_FLT
> will never hit that fallback path.

I believe this is addressed later in "net: move promiscuity handling into
dev_rx_mode_work"? That should take care of doing __dev_set_promiscuity
for !IFF_UNICAST_FLT+ndo_set_rx_mode_async. Not sure if there is a
better way to rearrange the chunks in the patches.

        if (ops->ndo_set_rx_mode_async) {
                ...

+		promisc_inc = dev_uc_promisc_update(dev);
+
+		netif_addr_unlock_bh(dev);
+	} else {
+		netif_addr_lock_bh(dev);
+		promisc_inc = dev_uc_promisc_update(dev);
+		netif_addr_unlock_bh(dev);
+	}
+
+	if (promisc_inc)
+		__dev_set_promiscuity(dev, promisc_inc, false);
+