drivers/iommu/iommu.c | 26 ++++++++++++++++++-------- include/linux/iommu.h | 2 ++ 2 files changed, 20 insertions(+), 8 deletions(-)
Next up on our list of race windows to close is another one during
iommu_device_register() - it's now OK again for multiple instances to
run their bus_iommu_probe() in parallel, but an iommu_probe_device() can
still also race against a running bus_iommu_probe(). As Johan has
managed to prove, this has now become a lot more visible on DT platforms
wth driver_async_probe where a client driver is attempting to probe in
parallel with its IOMMU driver - although commit b46064a18810 ("iommu:
Handle race with default domain setup") resolves this from the client
driver's point of view, this isn't before of_iommu_configure() has had
the chance to attempt to "replay" a probe that the bus walk hasn't even
tried yet, and so still cause the out-of-order group allocation
behaviour that we're trying to clean up (and now warning about).
The most reliable thing to do here is to explicitly keep track of the
"iommu_device_register() is still running" state, so we can then
special-case the ops lookup for the replay path (based on dev->iommu
again) to let that think it's still waiting for the IOMMU driver to
appear at all. This still leaves the longstanding theoretical case of
iommu_bus_notifier() being triggered during bus_iommu_probe(), but it's
not so simple to defer a notifier, and nobody's ever reported that being
a visible issue, so let's quietly kick that can down the road for now...
Reported-by: Johan Hovold <johan@kernel.org>
Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
drivers/iommu/iommu.c | 26 ++++++++++++++++++--------
include/linux/iommu.h | 2 ++
2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 264383b507bc..2605dab818a0 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -277,6 +277,8 @@ int iommu_device_register(struct iommu_device *iommu,
err = bus_iommu_probe(iommu_buses[i]);
if (err)
iommu_device_unregister(iommu);
+ else
+ WRITE_ONCE(iommu->ready, true);
return err;
}
EXPORT_SYMBOL_GPL(iommu_device_register);
@@ -2832,31 +2834,39 @@ bool iommu_default_passthrough(void)
}
EXPORT_SYMBOL_GPL(iommu_default_passthrough);
-const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode)
+static const struct iommu_device *iommu_from_fwnode(const struct fwnode_handle *fwnode)
{
- const struct iommu_ops *ops = NULL;
- struct iommu_device *iommu;
+ const struct iommu_device *iommu, *ret = NULL;
spin_lock(&iommu_device_lock);
list_for_each_entry(iommu, &iommu_device_list, list)
if (iommu->fwnode == fwnode) {
- ops = iommu->ops;
+ ret = iommu;
break;
}
spin_unlock(&iommu_device_lock);
- return ops;
+ return ret;
+}
+
+const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode)
+{
+ const struct iommu_device *iommu = iommu_from_fwnode(fwnode);
+
+ return iommu ? iommu->ops : NULL;
}
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode)
{
- const struct iommu_ops *ops = iommu_ops_from_fwnode(iommu_fwnode);
+ const struct iommu_device *iommu = iommu_from_fwnode(iommu_fwnode);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- if (!ops)
+ if (!iommu)
return driver_deferred_probe_check_state(dev);
+ if (!dev->iommu && !READ_ONCE(iommu->ready))
+ return -EPROBE_DEFER;
if (fwspec)
- return ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL;
+ return iommu->ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL;
if (!dev_iommu_get(dev))
return -ENOMEM;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ccce8a751e2a..cbb1520005c3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -750,6 +750,7 @@ struct iommu_domain_ops {
* @dev: struct device for sysfs handling
* @singleton_group: Used internally for drivers that have only one group
* @max_pasids: number of supported PASIDs
+ * @ready: set once iommu_device_register() has completed successfully
*/
struct iommu_device {
struct list_head list;
@@ -758,6 +759,7 @@ struct iommu_device {
struct device *dev;
struct iommu_group *singleton_group;
u32 max_pasids;
+ bool ready;
};
/**
--
2.39.2.101.g768bb238c484.dirty
On Thu, Apr 24, 2025 at 06:41:28PM +0100, Robin Murphy wrote:
> Next up on our list of race windows to close is another one during
> iommu_device_register() - it's now OK again for multiple instances to
> run their bus_iommu_probe() in parallel, but an iommu_probe_device() can
> still also race against a running bus_iommu_probe(). As Johan has
> managed to prove, this has now become a lot more visible on DT platforms
> wth driver_async_probe where a client driver is attempting to probe in
> parallel with its IOMMU driver
To be clear, I hit this with just normal probe deferral (not explicit
async probe).
> - although commit b46064a18810 ("iommu:
> Handle race with default domain setup") resolves this from the client
> driver's point of view, this isn't before of_iommu_configure() has had
> the chance to attempt to "replay" a probe that the bus walk hasn't even
> tried yet, and so still cause the out-of-order group allocation
> behaviour that we're trying to clean up (and now warning about).
>
> The most reliable thing to do here is to explicitly keep track of the
> "iommu_device_register() is still running" state, so we can then
> special-case the ops lookup for the replay path (based on dev->iommu
> again) to let that think it's still waiting for the IOMMU driver to
> appear at all. This still leaves the longstanding theoretical case of
> iommu_bus_notifier() being triggered during bus_iommu_probe(), but it's
> not so simple to defer a notifier, and nobody's ever reported that being
> a visible issue, so let's quietly kick that can down the road for now...
>
> Reported-by: Johan Hovold <johan@kernel.org>
Perhaps add a reference to my report:
Link: https://lore.kernel.org/lkml/Z_jMiC1uj_MJpKVj@hovoldconsulting.com/
> Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path")
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
This looks like it should work and nothing blows up even if I haven't
tried to instrument a reliable reproducer to test it against:
Reviewed-by: Johan Hovold <johan+linaro@kernel.org>
Tested-by: Johan Hovold <johan+linaro@kernel.org>
That said, don't you have a similar issue with:
@@ -414,9 +414,21 @@ static int iommu_init_device(struct device *dev)
if (!dev_iommu_get(dev))
return -ENOMEM;
/*
- * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU
- * instances with non-NULL fwnodes, and client devices should have been
- * identified with a fwspec by this point. Otherwise, we can currently
+ * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing
+ * is buried in the bus dma_configure path. Properly unpicking that is
+ * still a big job, so for now just invoke the whole thing. The device
+ * already having a driver bound means dma_configure has already run and
+ * either found no IOMMU to wait for, or we're in its replay call right
+ * now, so either way there's no point calling it again.
+ */
+ if (!dev->driver && dev->bus->dma_configure) {
What prevents a racing client driver probe from having set dev->driver
here so that dma_configure() isn't called?
+ mutex_unlock(&iommu_probe_device_lock);
+ dev->bus->dma_configure(dev);
+ mutex_lock(&iommu_probe_device_lock);
+ }
Johan
On Thu, Apr 24, 2025 at 06:41:28PM +0100, Robin Murphy wrote: > drivers/iommu/iommu.c | 26 ++++++++++++++++++-------- > include/linux/iommu.h | 2 ++ > 2 files changed, 20 insertions(+), 8 deletions(-) Applied, thanks.
© 2016 - 2026 Red Hat, Inc.