drivers/pci/pci-driver.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-)
local_pci_probe() and hence pci_call_probe() can be called
recursively. If the recursive calls are done indirectly via workqueue
kworker, a lockdep recursive warning can be produced. Below is the
stack trace of the lockdep warning on a 4-socket x86-64 Skylake server.
<TASK>
:
start_flush_work+0x40b/0x9b0
__flush_work+0xbd/0x1a0
pci_call_probe+0x510/0x700
pci_device_probe+0x17c/0x270
call_driver_probe+0x68/0x1f0
really_probe+0x197/0x7b0
__driver_probe_device+0x32d/0x460
driver_probe_device+0x49/0x120
__device_attach_driver+0x162/0x290
bus_for_each_drv+0x109/0x190
__device_attach+0x1a2/0x3f0
device_initial_probe+0x7d/0xa0
pci_bus_add_device+0x93/0xe0
pci_bus_add_devices+0x83/0x190
vmd_enable_domain+0x11fb/0x1b80
vmd_probe+0x34c/0x4b0
local_pci_probe+0xdf/0x190
local_pci_probe_callback+0x35/0x80
process_one_work+0x919/0x1af0
worker_thread+0x5a6/0xd10
:
</TASK>
The use of work function originally comes from commit 873392ca514f
("PCI: work_on_cpu: use in drivers/pci/pci-driver.c") to execute the
device probing and allocate memory on the right node where the device
bus is attached to.
In the case of nested device probing within a work function, the current
CPU is likely to be in the right node already. So there is no point in
scheduling another work function in the same or a neigboring CPU and wait
for its completion. It will be more efficient to call local_pci_probe()
directly when the current CPU is indeed in the right node. That will
also avoid the lockdep warning due to nested calls to schedule and
flush a work function.
Signed-off-by: Waiman Long <longman@redhat.com>
---
drivers/pci/pci-driver.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index e3f59001785a..542b22537852 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -375,6 +375,8 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
{
int error, node, cpu;
struct drv_dev_and_id ddi = { drv, dev, id };
+ bool node_invalid, cpu_in_node = false;
+ const struct cpumask *node_cpus;
/*
* Execute driver initialization on node where the device is
@@ -383,14 +385,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
*/
node = dev_to_node(&dev->dev);
dev->is_probed = 1;
+ node_invalid = node < 0 || node >= MAX_NUMNODES || !node_online(node);
+ node_cpus = node_invalid ? cpu_online_mask : cpumask_of_node(node);
+
+ /*
+ * If the current task is a wq kworker activated by queue_work_on()
+ * below, the kworker is affined to a designated CPU and won't be
+ * switched to another one. So the current CPU can be checked to see
+ * if it is in the right node.
+ */
+ if (current->flags & PF_WQ_WORKER) {
+ cpu_in_node = cpumask_test_cpu(get_cpu(), node_cpus);
+ put_cpu();
+ }
cpu_hotplug_disable();
/*
* Prevent nesting work_on_cpu() for the case where a Virtual Function
- * device is probed from work_on_cpu() of the Physical device.
+ * device is probed from work_on_cpu() of the Physical device or when
+ * the current CPU is in the desired node.
*/
- if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
- pci_physfn_is_probed(dev)) {
+ if (node_invalid || cpu_in_node || pci_physfn_is_probed(dev)) {
error = local_pci_probe(&ddi);
} else {
struct pci_probe_arg arg = { .ddi = &ddi };
@@ -404,8 +419,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
* targets.
*/
rcu_read_lock();
- cpu = cpumask_any_and(cpumask_of_node(node),
- housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpu = cpumask_any_and(node_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (cpu < nr_cpu_ids) {
struct workqueue_struct *wq = pci_probe_wq;
--
2.54.0
© 2016 - 2026 Red Hat, Inc.