Introduce NUMA-node-aware synchronous probing: drivers
can initialize and allocate memory on the device’s local
node without scattering kmalloc_node() calls throughout
the code.
NUMA-aware probing was first added to PCI drivers by
commit d42c69972b85 ("[PATCH] PCI: Run PCI driver
initialization on local node") in 2005 and has benefited
PCI drivers ever since.
The asynchronous probe path already supports NUMA-node-aware
probing via async_schedule_dev() in the driver core. Since
NUMA affinity is orthogonal to sync/async probing, this
patch adds NUMA-node-aware support to the synchronous
probe path.
Signed-off-by: Jinhui Guo <guojinhui.liam@bytedance.com>
---
drivers/base/dd.c | 104 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 101 insertions(+), 3 deletions(-)
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 896f98add97d..e1fb10ae2cc0 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -381,6 +381,92 @@ static void __exit deferred_probe_exit(void)
}
__exitcall(deferred_probe_exit);
+/*
+ * NUMA-node-aware synchronous probing:
+ * drivers can initialize and allocate memory on the device’s local
+ * node without scattering kmalloc_node() calls throughout the code.
+ */
+
+/* Generic function pointer type */
+typedef int (*numa_func_t)(void *arg1, void *arg2);
+
+/* Context for NUMA execution */
+struct numa_work_ctx {
+ struct work_struct work;
+ numa_func_t func;
+ void *arg1;
+ void *arg2;
+ int result;
+};
+
+/* Worker function running on the target node */
+static void numa_work_func(struct work_struct *work)
+{
+ struct numa_work_ctx *ctx = container_of(work, struct numa_work_ctx, work);
+
+ ctx->result = ctx->func(ctx->arg1, ctx->arg2);
+}
+
+/*
+ * __exec_on_numa_node - Execute a function on a specific NUMA node synchronously
+ * @node: Target NUMA node ID
+ * @func: The wrapper function to execute
+ * @arg1: First argument (void *)
+ * @arg2: Second argument (void *)
+ *
+ * Returns the result of the function execution, or -ENODEV if initialization fails.
+ * If the node is invalid or offline, it falls back to local execution.
+ */
+static int __exec_on_numa_node(int node, numa_func_t func, void *arg1, void *arg2)
+{
+ struct numa_work_ctx ctx;
+
+ /* Fallback to local execution if the node is invalid or offline */
+ if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
+ return func(arg1, arg2);
+
+ ctx.func = func;
+ ctx.arg1 = arg1;
+ ctx.arg2 = arg2;
+ ctx.result = -ENODEV;
+ INIT_WORK_ONSTACK(&ctx.work, numa_work_func);
+
+ /* Use system_dfl_wq to allow execution on the specific node. */
+ queue_work_node(node, system_dfl_wq, &ctx.work);
+ flush_work(&ctx.work);
+ destroy_work_on_stack(&ctx.work);
+
+ return ctx.result;
+}
+
+/*
+ * DEFINE_NUMA_WRAPPER - Generate a type-safe wrapper for a function
+ * @func_name: The name of the target function
+ * @type1: The type of the first argument
+ * @type2: The type of the second argument
+ *
+ * This macro generates a static function named __wrapper_<func_name> that
+ * casts void pointers back to their original types and calls the target function.
+ */
+#define DEFINE_NUMA_WRAPPER(func_name, type1, type2) \
+ static int __wrapper_##func_name(void *arg1, void *arg2) \
+ { \
+ return func_name((type1)arg1, (type2)arg2); \
+ }
+
+/*
+ * EXEC_ON_NUMA_NODE - Execute a registered function on a NUMA node
+ * @node: Target NUMA node ID
+ * @func_name: The name of the target function (must be registered via DEFINE_NUMA_WRAPPER)
+ * @arg1: First argument
+ * @arg2: Second argument
+ *
+ * This macro invokes the internal execution helper using the generated wrapper.
+ */
+#define EXEC_ON_NUMA_NODE(node, func_name, arg1, arg2) \
+ __exec_on_numa_node(node, __wrapper_##func_name, \
+ (void *)(arg1), (void *)(arg2))
+
/**
* device_is_bound() - Check if device is bound to a driver
* @dev: device to check
@@ -808,6 +894,8 @@ static int __driver_probe_device(const struct device_driver *drv, struct device
return ret;
}
+DEFINE_NUMA_WRAPPER(__driver_probe_device, const struct device_driver *, struct device *)
+
/**
* driver_probe_device - attempt to bind device & driver together
* @drv: driver to bind a device to
@@ -844,6 +932,8 @@ static int driver_probe_device(const struct device_driver *drv, struct device *d
return ret;
}
+DEFINE_NUMA_WRAPPER(driver_probe_device, const struct device_driver *, struct device *)
+
static inline bool cmdline_requested_async_probing(const char *drv_name)
{
bool async_drv;
@@ -1000,6 +1090,8 @@ static int __device_attach_driver_scan(struct device_attach_data *data,
return ret;
}
+DEFINE_NUMA_WRAPPER(__device_attach_driver_scan, struct device_attach_data *, bool *)
+
static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
{
struct device *dev = _dev;
@@ -1055,7 +1147,9 @@ static int __device_attach(struct device *dev, bool allow_async)
.want_async = false,
};
- ret = __device_attach_driver_scan(&data, &async);
+ ret = EXEC_ON_NUMA_NODE(dev_to_node(dev),
+ __device_attach_driver_scan,
+ &data, &async);
}
out_unlock:
device_unlock(dev);
@@ -1142,7 +1236,9 @@ int device_driver_attach(const struct device_driver *drv, struct device *dev)
int ret;
__device_driver_lock(dev, dev->parent);
- ret = __driver_probe_device(drv, dev);
+ ret = EXEC_ON_NUMA_NODE(dev_to_node(dev),
+ __driver_probe_device,
+ drv, dev);
__device_driver_unlock(dev, dev->parent);
/* also return probe errors as normal negative errnos */
@@ -1231,7 +1327,9 @@ static int __driver_attach(struct device *dev, void *data)
}
__device_driver_lock(dev, dev->parent);
- driver_probe_device(drv, dev);
+ EXEC_ON_NUMA_NODE(dev_to_node(dev),
+ driver_probe_device,
+ drv, dev);
__device_driver_unlock(dev, dev->parent);
return 0;
--
2.20.1
On Wed Jan 7, 2026 at 6:55 PM CET, Jinhui Guo wrote:
> @@ -808,6 +894,8 @@ static int __driver_probe_device(const struct device_driver *drv, struct device
> return ret;
> }
>
> +DEFINE_NUMA_WRAPPER(__driver_probe_device, const struct device_driver *, struct device *)
> +
> /**
> * driver_probe_device - attempt to bind device & driver together
> * @drv: driver to bind a device to
> @@ -844,6 +932,8 @@ static int driver_probe_device(const struct device_driver *drv, struct device *d
> return ret;
> }
>
> +DEFINE_NUMA_WRAPPER(driver_probe_device, const struct device_driver *, struct device *)
> +
> static inline bool cmdline_requested_async_probing(const char *drv_name)
> {
> bool async_drv;
> @@ -1000,6 +1090,8 @@ static int __device_attach_driver_scan(struct device_attach_data *data,
> return ret;
> }
>
> +DEFINE_NUMA_WRAPPER(__device_attach_driver_scan, struct device_attach_data *, bool *)
Why define three different wrappers? To me it looks like we should easily get
away with a single wrapper for __driver_probe_device(), which could just be
__driver_probe_device_node().
__device_attach_driver_scan() already has this information (i.e. we can check if
need_async == NULL). Additionally, we can change the signature of
driver_probe_device() to
static int driver_probe_device(const struct device_driver *drv, struct device *dev, bool async)
This reduces complexity a lot, since it gets us rid of DEFINE_NUMA_WRAPPER() and
EXEC_ON_NUMA_NODE() macros.
> static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> {
> struct device *dev = _dev;
> @@ -1055,7 +1147,9 @@ static int __device_attach(struct device *dev, bool allow_async)
> .want_async = false,
> };
>
> - ret = __device_attach_driver_scan(&data, &async);
> + ret = EXEC_ON_NUMA_NODE(dev_to_node(dev),
> + __device_attach_driver_scan,
> + &data, &async);
> }
> out_unlock:
> device_unlock(dev);
> @@ -1142,7 +1236,9 @@ int device_driver_attach(const struct device_driver *drv, struct device *dev)
> int ret;
>
> __device_driver_lock(dev, dev->parent);
> - ret = __driver_probe_device(drv, dev);
> + ret = EXEC_ON_NUMA_NODE(dev_to_node(dev),
> + __driver_probe_device,
> + drv, dev);
> __device_driver_unlock(dev, dev->parent);
>
> /* also return probe errors as normal negative errnos */
> @@ -1231,7 +1327,9 @@ static int __driver_attach(struct device *dev, void *data)
> }
>
> __device_driver_lock(dev, dev->parent);
> - driver_probe_device(drv, dev);
> + EXEC_ON_NUMA_NODE(dev_to_node(dev),
> + driver_probe_device,
> + drv, dev);
> __device_driver_unlock(dev, dev->parent);
>
> return 0;
> --
> 2.20.1
On Sat Jan 17, 2026 15:03:08 +0100, Danilo Krummrich wrote:
> On Wed Jan 7, 2026 at 6:55 PM CET, Jinhui Guo wrote:
> > @@ -808,6 +894,8 @@ static int __driver_probe_device(const struct device_driver *drv, struct device
> > return ret;
> > }
> >
> > +DEFINE_NUMA_WRAPPER(__driver_probe_device, const struct device_driver *, struct device *)
> > +
> > /**
> > * driver_probe_device - attempt to bind device & driver together
> > * @drv: driver to bind a device to
> > @@ -844,6 +932,8 @@ static int driver_probe_device(const struct device_driver *drv, struct device *d
> > return ret;
> > }
> >
> > +DEFINE_NUMA_WRAPPER(driver_probe_device, const struct device_driver *, struct device *)
> > +
> > static inline bool cmdline_requested_async_probing(const char *drv_name)
> > {
> > bool async_drv;
> > @@ -1000,6 +1090,8 @@ static int __device_attach_driver_scan(struct device_attach_data *data,
> > return ret;
> > }
> >
> > +DEFINE_NUMA_WRAPPER(__device_attach_driver_scan, struct device_attach_data *, bool *)
>
> Why define three different wrappers? To me it looks like we should easily get
> away with a single wrapper for __driver_probe_device(), which could just be
> __driver_probe_device_node().
>
>
> __device_attach_driver_scan() already has this information (i.e. we can check if
> need_async == NULL). Additionally, we can change the signature of
> driver_probe_device() to
>
> static int driver_probe_device(const struct device_driver *drv, struct device *dev, bool async)
>
> This reduces complexity a lot, since it gets us rid of DEFINE_NUMA_WRAPPER() and
> EXEC_ON_NUMA_NODE() macros.
Hi Danilo,
Thank you for your time and helpful comments.
Apologies for the delayed reply. I understand your concern: before sending this
patchset I prototyped a version that added __driver_probe_device_node() and
relied solely on current_is_async() to detect an async worker, without changing
driver_probe_device()’s signature. That proved fragile, so I abandoned it; your
suggestion is the more reliable path forward.
I’ve spent the last couple of days preparing a new patch and will send it out
after testing.
Best Regards,
Jinhui
On Wed Jan 7, 2026 at 6:55 PM CET, Jinhui Guo wrote:
> + * __exec_on_numa_node - Execute a function on a specific NUMA node synchronously
> + * @node: Target NUMA node ID
> + * @func: The wrapper function to execute
> + * @arg1: First argument (void *)
> + * @arg2: Second argument (void *)
> + *
> + * Returns the result of the function execution, or -ENODEV if initialization fails.
> + * If the node is invalid or offline, it falls back to local execution.
> + */
> +static int __exec_on_numa_node(int node, numa_func_t func, void *arg1, void *arg2)
> +{
> + struct numa_work_ctx ctx;
> +
> + /* Fallback to local execution if the node is invalid or offline */
> + if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
> + return func(arg1, arg2);
Just a quick drive-by comment (I’ll go through it more thoroughly later).
What about the case where we are already on the requested node?
Also, we should probably set the corresponding CPU affinity for the time we are
executing func() to prevent migration.
> +
> + ctx.func = func;
> + ctx.arg1 = arg1;
> + ctx.arg2 = arg2;
> + ctx.result = -ENODEV;
> + INIT_WORK_ONSTACK(&ctx.work, numa_work_func);
> +
> + /* Use system_dfl_wq to allow execution on the specific node. */
> + queue_work_node(node, system_dfl_wq, &ctx.work);
> + flush_work(&ctx.work);
> + destroy_work_on_stack(&ctx.work);
> +
> + return ctx.result;
> +}
On Wed Jan 07, 2026 at 19:22:15 +0100, Danilo Krummrich wrote:
> On Wed Jan 7, 2026 at 6:55 PM CET, Jinhui Guo wrote:
> > + * __exec_on_numa_node - Execute a function on a specific NUMA node synchronously
> > + * @node: Target NUMA node ID
> > + * @func: The wrapper function to execute
> > + * @arg1: First argument (void *)
> > + * @arg2: Second argument (void *)
> > + *
> > + * Returns the result of the function execution, or -ENODEV if initialization fails.
> > + * If the node is invalid or offline, it falls back to local execution.
> > + */
> > +static int __exec_on_numa_node(int node, numa_func_t func, void *arg1, void *arg2)
> > +{
> > + struct numa_work_ctx ctx;
> > +
> > + /* Fallback to local execution if the node is invalid or offline */
> > + if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
> > + return func(arg1, arg2);
>
> Just a quick drive-by comment (I’ll go through it more thoroughly later).
>
> What about the case where we are already on the requested node?
>
> Also, we should probably set the corresponding CPU affinity for the time we are
> executing func() to prevent migration.
Hi Danilo,
Thank you for your time and helpful comments.
Relying on queue_work_node() for node affinity is safer, even if the current
thread already happens to be running on a CPU that belongs to the target node.
Checking the current CPU and then setting affinity ourselves would require
handling CPU-hotplug and isolated CPUs—corner cases that become complex
quickly.
The PCI driver tried this years ago and ran into numerous problems; delegating
the decision to queue_work_node() avoids repeating that history.
- Commit d42c69972b85 ("[PATCH] PCI: Run PCI driver initialization on local node")
first added NUMA awareness with set_cpus_allowed_ptr().
- Commit 1ddd45f8d76f ("PCI: Use cpu_hotplug_disable() instead of get_online_cpus()")
handled CPU-hotplug.
- Commits 69a18b18699b ("PCI: Restrict probe functions to housekeeping CPUs") and
9d42ea0d6984 ("pci: Decouple HK_FLAG_WQ and HK_FLAG_DOMAIN cpumask fetch") dealt
with isolated CPUs.
I considered setting CPU affinity, but the performance gain is minimal:
1. Driver probing happens mainly at boot, when load is light, so queuing a worker
incurs little delay.
2. With many devices they are usually spread across nodes, so workers are not
stalled long within any NUMA node.
3. Even after pinning, tasks can still be migrated by load balancing within the
NUMA node, so the reduction in context switches versus using queue_work_node()
alone is negligible.
Test data [1] shows that queue_work_node() has negligible impact on synchronous probe time.
[1] https://lore.kernel.org/all/20260107175548.1792-1-guojinhui.liam@bytedance.com/
If you have any other concerns, please let me know.
Best Regards,
Jinhui
© 2016 - 2026 Red Hat, Inc.