[PATCH net-next v8 4/5] netconsole: resume previously deactivated target

Andre Carvalho posted 5 patches 2 days, 23 hours ago
[PATCH net-next v8 4/5] netconsole: resume previously deactivated target
Posted by Andre Carvalho 2 days, 23 hours ago
Attempt to resume a previously deactivated target when the associated
interface comes back (NETDEV_REGISTER) or when it changes name
(NETDEV_CHANGENAME) by calling netpoll_setup on the device.

Depending on how the target was setup (by mac or interface name), the
corresponding field is compared with the device being brought up. Targets
that match the incoming device, are scheduled for resume on resume_wq, so
that netpoll_setup is able to force the device UP.

Target transitions to STATE_DISABLED in case of failures resuming it to
avoid retrying the same target indefinitely.

Signed-off-by: Andre Carvalho <asantostc@gmail.com>
---
 drivers/net/netconsole.c | 98 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 6 deletions(-)

diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 7a1e5559fc0d..b31762f2ee45 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -39,6 +39,7 @@
 #include <linux/u64_stats_sync.h>
 #include <linux/utsname.h>
 #include <linux/rtnetlink.h>
+#include <linux/workqueue.h>
 
 MODULE_AUTHOR("Matt Mackall <mpm@selenic.com>");
 MODULE_DESCRIPTION("Console driver for network interfaces");
@@ -138,10 +139,14 @@ enum target_state {
  * @stats:	Packet send stats for the target. Used for debugging.
  * @state:	State of the target.
  *		Visible from userspace (read-write).
- *		We maintain a strict 1:1 correspondence between this and
- *		whether the corresponding netpoll is active or inactive.
+ *		From a userspace perspective, the target is either enabled or
+ *		disabled. Internally, although both STATE_DISABLED and
+ *		STATE_DEACTIVATED correspond to inactive targets, the latter is
+ *		due to automatic interface state changes and will try
+ *		recover automatically, if the interface comes back
+ *		online.
  *		Also, other parameters of a target may be modified at
- *		runtime only when it is disabled (state == STATE_DISABLED).
+ *		runtime only when it is disabled (state != STATE_ENABLED).
  * @extended:	Denotes whether console is extended or not.
  * @release:	Denotes whether kernel release version should be prepended
  *		to the message. Depends on extended console.
@@ -155,6 +160,7 @@ enum target_state {
  *		local_mac	(read-only)
  *		remote_mac	(read-write)
  * @buf:	The buffer used to send the full msg to the network stack
+ * @resume_wq:	Workqueue to resume deactivated target
  */
 struct netconsole_target {
 	struct list_head	list;
@@ -177,6 +183,7 @@ struct netconsole_target {
 	struct netpoll		np;
 	/* protected by target_list_lock */
 	char			buf[MAX_PRINT_CHUNK];
+	struct work_struct	resume_wq;
 };
 
 #ifdef	CONFIG_NETCONSOLE_DYNAMIC
@@ -242,6 +249,75 @@ static void populate_configfs_item(struct netconsole_target *nt,
 }
 #endif	/* CONFIG_NETCONSOLE_DYNAMIC */
 
+/* Check if the target was bound by mac address. */
+static bool bound_by_mac(struct netconsole_target *nt)
+{
+	return is_valid_ether_addr(nt->np.dev_mac);
+}
+
+/* Attempts to resume logging to a deactivated target. */
+static void resume_target(struct netconsole_target *nt)
+{
+	int ret;
+
+	/* check if target is still deactivated as it may have been disabled
+	 * while resume was being scheduled.
+	 */
+	if (nt->state != STATE_DEACTIVATED)
+		return;
+
+	if (bound_by_mac(nt))
+		/* ensure netpoll_setup will retrieve device by mac */
+		memset(&nt->np.dev_name, 0, IFNAMSIZ);
+
+	ret = netpoll_setup(&nt->np);
+	if (ret) {
+		/* netpoll fails setup once, do not try again. */
+		nt->state = STATE_DISABLED;
+		return;
+	}
+
+	nt->state = STATE_ENABLED;
+	pr_info("network logging resumed on interface %s\n", nt->np.dev_name);
+}
+
+/* Checks if a deactivated target matches a device. */
+static bool deactivated_target_match(struct netconsole_target *nt,
+				     struct net_device *ndev)
+{
+	if (nt->state != STATE_DEACTIVATED)
+		return false;
+
+	if (bound_by_mac(nt))
+		return !memcmp(nt->np.dev_mac, ndev->dev_addr, ETH_ALEN);
+	return !strncmp(nt->np.dev_name, ndev->name, IFNAMSIZ);
+}
+
+/* Process work scheduled for target resume. */
+static void process_resume_target(struct work_struct *work)
+{
+	struct netconsole_target *nt =
+		container_of(work, struct netconsole_target, resume_wq);
+	unsigned long flags;
+
+	/* resume_target is IRQ unsafe, remove target from
+	 * target_list in order to resume it with IRQ enabled.
+	 */
+	spin_lock_irqsave(&target_list_lock, flags);
+	list_del_init(&nt->list);
+	spin_unlock_irqrestore(&target_list_lock, flags);
+
+	resume_target(nt);
+
+	/* At this point the target is either enabled or disabled and
+	 * was cleaned up before getting deactivated. Either way, add it
+	 * back to target list.
+	 */
+	spin_lock_irqsave(&target_list_lock, flags);
+	list_add(&nt->list, &target_list);
+	spin_unlock_irqrestore(&target_list_lock, flags);
+}
+
 /* Allocate and initialize with defaults.
  * Note that these targets get their config_item fields zeroed-out.
  */
@@ -264,6 +340,7 @@ static struct netconsole_target *alloc_and_init(void)
 	nt->np.remote_port = 6666;
 	eth_broadcast_addr(nt->np.remote_mac);
 	nt->state = STATE_DISABLED;
+	INIT_WORK(&nt->resume_wq, process_resume_target);
 
 	return nt;
 }
@@ -1434,13 +1511,14 @@ static int prepare_sysdata(struct netconsole_target *nt)
 static int netconsole_netdev_event(struct notifier_block *this,
 				   unsigned long event, void *ptr)
 {
-	unsigned long flags;
-	struct netconsole_target *nt, *tmp;
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netconsole_target *nt, *tmp;
 	bool stopped = false;
+	unsigned long flags;
 
 	if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER ||
-	      event == NETDEV_RELEASE || event == NETDEV_JOIN))
+	      event == NETDEV_RELEASE || event == NETDEV_JOIN ||
+	      event == NETDEV_REGISTER))
 		goto done;
 
 	mutex_lock(&target_cleanup_list_lock);
@@ -1469,6 +1547,13 @@ static int netconsole_netdev_event(struct notifier_block *this,
 				stopped = true;
 			}
 		}
+		if ((event == NETDEV_REGISTER || event == NETDEV_CHANGENAME) &&
+		    deactivated_target_match(nt, dev))
+			/* Schedule resume on a workqueue as it will attempt
+			 * to UP the device, which can't be done as part of this
+			 * notifier.
+			 */
+			schedule_work(&nt->resume_wq);
 		netconsole_target_put(nt);
 	}
 	spin_unlock_irqrestore(&target_list_lock, flags);
@@ -1937,6 +2022,7 @@ static struct netconsole_target *alloc_param_target(char *target_config,
 /* Cleanup netpoll for given target (from boot/module param) and free it */
 static void free_param_target(struct netconsole_target *nt)
 {
+	cancel_work_sync(&nt->resume_wq);
 	netpoll_cleanup(&nt->np);
 #ifdef	CONFIG_NETCONSOLE_DYNAMIC
 	kfree(nt->userdata);

-- 
2.52.0
Re: [PATCH net-next v8 4/5] netconsole: resume previously deactivated target
Posted by Breno Leitao 10 hours ago
Hello Andre,

On Fri, Nov 28, 2025 at 10:08:03PM +0000, Andre Carvalho wrote:
> @@ -242,6 +249,75 @@ static void populate_configfs_item(struct netconsole_target *nt,
>  }
>  #endif	/* CONFIG_NETCONSOLE_DYNAMIC */
>  
> +/* Check if the target was bound by mac address. */
> +static bool bound_by_mac(struct netconsole_target *nt)
> +{
> +	return is_valid_ether_addr(nt->np.dev_mac);
> +}
> +
> +/* Attempts to resume logging to a deactivated target. */
> +static void resume_target(struct netconsole_target *nt)
> +{
> +	int ret;
> +
> +	/* check if target is still deactivated as it may have been disabled
> +	 * while resume was being scheduled.
> +	 */
This only happens if this is a dynamic target and someone is toggling
the device (or even removing it, which would cause a crash I _think_).

Given you are completely lockless here, so, there is a chance you hit
a TOCTOU, also.

I think you want to have dynamic_netconsole_mutex held during the
operation of process_resume_target().

  * mutex_lock(&dynamic_netconsole_mutex);
  * remove from the list
  * resume
  * re-add to the list
  * mutex_unlock(&dynamic_netconsole_mutex);
  

netconsole design has two locks:
  * target lock list, which protects devices getting disabled by netdev
    notifications
  * dynamic_netconsole_mutex, which protects anyone disabling and
    removing the target from configfs

> +	if (nt->state != STATE_DEACTIVATED)
> +		return;
> +
> +	if (bound_by_mac(nt))
> +		/* ensure netpoll_setup will retrieve device by mac */
> +		memset(&nt->np.dev_name, 0, IFNAMSIZ);

This is a clean-up step that was missing whent the target is getting
down, and htis is just a work around that doesn't belong in here.

Please move it to netconsole_process_cleanups_core(), in a separate
patch.

Something as: 

	list_for_each_entry_safe(nt, tmp, &target_cleanup_list, list)
		do_netpoll_cleanup(&nt->np);
		if (bound_by_mac(nt))
			memset(&nt->np.dev_name, 0, IFNAMSIZ);
			

Ideally this should belong to do_netpoll_cleanup(), but let's keep it in
netconsole_process_cleanups_core() for three reasons:


1) Bounding by mac is a netconsole concept
2) do_netpoll_cleanup() is only used by netconsole, and I plan to move
   it back to netconsole. Some PoC in [1]
3) bound_by_mac() should be in netconsole and we do not want to export
   it.

[1]:
https://lore.kernel.org/all/20250902-netpoll_untangle_v3-v1-3-51a03d6411be@debian.org/

> +
> +	ret = netpoll_setup(&nt->np);
> +	if (ret) {
> +		/* netpoll fails setup once, do not try again. */
> +		nt->state = STATE_DISABLED;
> +		return;
> +	}
> +
> +	nt->state = STATE_ENABLED;
> +	pr_info("network logging resumed on interface %s\n", nt->np.dev_name);
> +}
> +
> +/* Checks if a deactivated target matches a device. */
> +static bool deactivated_target_match(struct netconsole_target *nt,
> +				     struct net_device *ndev)
> +{
> +	if (nt->state != STATE_DEACTIVATED)
> +		return false;
> +
> +	if (bound_by_mac(nt))
> +		return !memcmp(nt->np.dev_mac, ndev->dev_addr, ETH_ALEN);
> +	return !strncmp(nt->np.dev_name, ndev->name, IFNAMSIZ);
> +}
> +
> +/* Process work scheduled for target resume. */
> +static void process_resume_target(struct work_struct *work)
> +{
> +	struct netconsole_target *nt =
> +		container_of(work, struct netconsole_target, resume_wq);
> +	unsigned long flags;
> +

mutex_lock(&dynamic_netconsole_mutex);
As discussed above

> +	/* resume_target is IRQ unsafe, remove target from
> +	 * target_list in order to resume it with IRQ enabled.
> +	 */
> +	spin_lock_irqsave(&target_list_lock, flags);
> +	list_del_init(&nt->list);
> +	spin_unlock_irqrestore(&target_list_lock, flags);
> +
> +	resume_target(nt);
> +
> +	/* At this point the target is either enabled or disabled and
> +	 * was cleaned up before getting deactivated. Either way, add it
> +	 * back to target list.
> +	 */
> +	spin_lock_irqsave(&target_list_lock, flags);
> +	list_add(&nt->list, &target_list);
> +	spin_unlock_irqrestore(&target_list_lock, flags);

mutex_unlock(&dynamic_netconsole_mutex);

> +}
> +
>  /* Allocate and initialize with defaults.
>   * Note that these targets get their config_item fields zeroed-out.
>   */
> @@ -264,6 +340,7 @@ static struct netconsole_target *alloc_and_init(void)
>  	nt->np.remote_port = 6666;
>  	eth_broadcast_addr(nt->np.remote_mac);
>  	nt->state = STATE_DISABLED;
> +	INIT_WORK(&nt->resume_wq, process_resume_target);

It needs to be initialized earlier before the kzalloc, otherwise we
might hit a similar problem to the one fixed by e5235eb6cfe0  ("net:
netpoll: initialize work queue before error checks")

The code path would be:
  * alloc_param_target()
	  * alloc_and_init()
		  * kzalloc() fails and return NULL.
		  * resume_wq() is still not initialized
  fail:
	* free_param_target()
		* cancel_work_sync(&nt->resume_wq); and resume_wq is not
		  initialized

Thanks for the patch,
--breno

--
pw-bot: cr
Re: [PATCH net-next v8 4/5] netconsole: resume previously deactivated target
Posted by Andre Carvalho an hour ago
On Mon, Dec 01, 2025 at 03:35:04AM -0800, Breno Leitao wrote:
> Given you are completely lockless here, so, there is a chance you hit
> a TOCTOU, also.
> 
> I think you want to have dynamic_netconsole_mutex held during the
> operation of process_resume_target().
> 
>   * mutex_lock(&dynamic_netconsole_mutex);
>   * remove from the list
>   * resume
>   * re-add to the list
>   * mutex_unlock(&dynamic_netconsole_mutex);
>   

This is a pretty good point. Will address this on the next version.

> > +	if (bound_by_mac(nt))
> > +		/* ensure netpoll_setup will retrieve device by mac */
> > +		memset(&nt->np.dev_name, 0, IFNAMSIZ);
> 
> This is a clean-up step that was missing whent the target is getting
> down, and htis is just a work around that doesn't belong in here.
> 
> Please move it to netconsole_process_cleanups_core(), in a separate
> patch.

Sounds good. Will include this as a separated patch on the next version of this
series.

> Something as: 
> 
> 	list_for_each_entry_safe(nt, tmp, &target_cleanup_list, list)
> 		do_netpoll_cleanup(&nt->np);
> 		if (bound_by_mac(nt))
> 			memset(&nt->np.dev_name, 0, IFNAMSIZ);
> 			
> 
> Ideally this should belong to do_netpoll_cleanup(), but let's keep it in
> netconsole_process_cleanups_core() for three reasons:
> 
> 
> 1) Bounding by mac is a netconsole concept
> 2) do_netpoll_cleanup() is only used by netconsole, and I plan to move
>    it back to netconsole. Some PoC in [1]
> 3) bound_by_mac() should be in netconsole and we do not want to export
>    it.
> 
> [1]:
> https://lore.kernel.org/all/20250902-netpoll_untangle_v3-v1-3-51a03d6411be@debian.org/

The reasoning makes sense to me. I considered performing this cleanup on netpoll,
but given your patch opted for this 'hack' before setup - I think doing it on
netconsole_process_cleanups_core makes more sense. 

I need to check this more, but I'm wondering if we would be able to completely
remove dev_name and dev_mac from netpoll and make it part of the netconsole_target.
Perhaps as a future refactoring after your patch series.

> 
> It needs to be initialized earlier before the kzalloc, otherwise we
> might hit a similar problem to the one fixed by e5235eb6cfe0  ("net:
> netpoll: initialize work queue before error checks")
> 
> The code path would be:
>   * alloc_param_target()
> 	  * alloc_and_init()
> 		  * kzalloc() fails and return NULL.
> 		  * resume_wq() is still not initialized
>   fail:
> 	* free_param_target()
> 		* cancel_work_sync(&nt->resume_wq); and resume_wq is not
> 		  initialized

Ack. Will fix this.

> 
> Thanks for the patch,
> --breno

Thanks again for the review. Will submit a new version addressing all the comments
once net-next re-opens.

-- 
Andre Carvalho