Add a new test_dev domain op for driver to test the compatibility between
a domain and a device at the driver level, before calling into the actual
attachment/replacement of a domain. Support pasid for set_dev_pasid call.
Move existing core-level compatibility tests to a helper function. Invoke
it prior to:
* __iommu_attach_device() or its wrapper __iommu_device_set_domain()
* __iommu_set_group_pasid()
And keep them within the group->mutex, so drivers can simply move all the
sanity and compatibility tests from their attach_dev callbacks to the new
test_dev callbacks without concerning about a race condition.
This may be a public API someday for VFIO/IOMMUFD to run a list of attach
tests without doing any actual attachment, which may result in a list of
failed tests. So encourage drivers to avoid printks to prevent kernel log
spam.
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
include/linux/iommu.h | 17 +++++--
drivers/iommu/iommu.c | 111 ++++++++++++++++++++++++++++++------------
2 files changed, 93 insertions(+), 35 deletions(-)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 801b2bd9e8d49..2ec99502dc29c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -714,7 +714,12 @@ struct iommu_ops {
/**
* struct iommu_domain_ops - domain specific operations
- * @attach_dev: attach an iommu domain to a device
+ * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call.
+ * A driver-level callback of this op should do a thorough sanity, to
+ * make sure a device is compatible with the domain. So the following
+ * @attach_dev and @set_dev_pasid functions would likely succeed with
+ * only one exception due to a temporary failure like out of memory.
+ * It's suggested to avoid the kernel prints in this op.
* Return:
* * 0 - success
* * EINVAL - can indicate that device and domain are incompatible due to
@@ -722,11 +727,15 @@ struct iommu_ops {
* driver shouldn't log an error, since it is legitimate for a
* caller to test reuse of existing domains. Otherwise, it may
* still represent some other fundamental problem
- * * ENOMEM - out of memory
- * * ENOSPC - non-ENOMEM type of resource allocation failures
* * EBUSY - device is attached to a domain and cannot be changed
* * ENODEV - device specific errors, not able to be attached
* * <others> - treated as ENODEV by the caller. Use is discouraged
+ * @attach_dev: attach an iommu domain to a device
+ * Return:
+ * * 0 - success
+ * * ENOMEM - out of memory
+ * * ENOSPC - non-ENOMEM type of resource allocation failures
+ * * <others> - Use is discouraged
* @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of
* the device should be left in the old config in error case.
* @map_pages: map a physically contiguous set of pages of the same size to
@@ -751,6 +760,8 @@ struct iommu_ops {
* @free: Release the domain after use.
*/
struct iommu_domain_ops {
+ int (*test_dev)(struct iommu_domain *domain, struct device *dev,
+ ioasid_t pasid, struct iommu_domain *old);
int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
struct iommu_domain *old);
int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 170e522b5bda4..95f0e2898b6b5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -99,6 +99,9 @@ static int bus_iommu_probe(const struct bus_type *bus);
static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
+static int __iommu_domain_test_device(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old);
static int __iommu_attach_device(struct iommu_domain *domain,
struct device *dev, struct iommu_domain *old);
static int __iommu_attach_group(struct iommu_domain *domain,
@@ -642,6 +645,10 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
if (group->default_domain)
iommu_create_device_direct_mappings(group->default_domain, dev);
if (group->domain) {
+ ret = __iommu_domain_test_device(group->domain, dev,
+ IOMMU_NO_PASID, NULL);
+ if (ret)
+ goto err_remove_gdev;
ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
0);
if (ret)
@@ -2185,6 +2192,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
{
+ int ret;
+
/*
* This is called on the dma mapping fast path so avoid locking. This is
* racy, but we have an expectation that the driver will setup its DMAs
@@ -2195,6 +2204,10 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
guard(mutex)(&dev->iommu_group->mutex);
+ ret = __iommu_domain_test_device(domain, dev, IOMMU_NO_PASID, NULL);
+ if (ret)
+ return ret;
+
return __iommu_attach_device(domain, dev, NULL);
}
@@ -2262,6 +2275,53 @@ static bool domain_iommu_ops_compatible(const struct iommu_ops *ops,
return false;
}
+/*
+ * Test if a future attach for a domain to the device will always fail. This may
+ * indicate that the device and the domain are incompatible in some way. Actual
+ * attach may still fail for some temporary failure like out of memory.
+ *
+ * If pasid != IOMMU_NO_PASID, it is meant to test a future set_dev_pasid call.
+ */
+static int __iommu_domain_test_device(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
+ struct iommu_group *group = dev->iommu_group;
+
+ lockdep_assert_held(&group->mutex);
+
+ if (!domain_iommu_ops_compatible(ops, domain))
+ return -EINVAL;
+
+ if (pasid != IOMMU_NO_PASID) {
+ struct group_device *gdev;
+
+ if (!domain->ops->set_dev_pasid || !ops->blocked_domain ||
+ !ops->blocked_domain->ops->set_dev_pasid)
+ return -EOPNOTSUPP;
+ /*
+ * Skip PASID validation for devices without PASID support
+ * (max_pasids = 0). These devices cannot issue transactions
+ * with PASID, so they don't affect group's PASID usage.
+ */
+ for_each_group_device(group, gdev) {
+ if (gdev->dev->iommu->max_pasids > 0 &&
+ pasid >= gdev->dev->iommu->max_pasids)
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Domains that don't implement a test_dev callback function must have a
+ * simple domain attach behavior. The sanity above should be enough.
+ */
+ if (!domain->ops->test_dev)
+ return 0;
+
+ return domain->ops->test_dev(domain, dev, pasid, old);
+}
+
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group)
{
@@ -2272,8 +2332,7 @@ static int __iommu_attach_group(struct iommu_domain *domain,
return -EBUSY;
dev = iommu_group_first_dev(group);
- if (!dev_has_iommu(dev) ||
- !domain_iommu_ops_compatible(dev_iommu_ops(dev), domain))
+ if (!dev_has_iommu(dev))
return -EINVAL;
return __iommu_group_set_domain(group, domain);
@@ -2381,6 +2440,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
if (WARN_ON(!new_domain))
return -EINVAL;
+ for_each_group_device(group, gdev) {
+ ret = __iommu_domain_test_device(new_domain, gdev->dev,
+ IOMMU_NO_PASID, group->domain);
+ if (ret)
+ return ret;
+ }
+
/*
* Changing the domain is done by calling attach_dev() on the new
* domain. This switch does not have to be atomic and DMA can be
@@ -3479,38 +3545,19 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
{
/* Caller must be a probed driver on dev */
struct iommu_group *group = dev->iommu_group;
- struct group_device *device;
- const struct iommu_ops *ops;
void *entry;
int ret;
if (!group)
return -ENODEV;
-
- ops = dev_iommu_ops(dev);
-
- if (!domain->ops->set_dev_pasid ||
- !ops->blocked_domain ||
- !ops->blocked_domain->ops->set_dev_pasid)
- return -EOPNOTSUPP;
-
- if (!domain_iommu_ops_compatible(ops, domain) ||
- pasid == IOMMU_NO_PASID)
+ if (pasid == IOMMU_NO_PASID)
return -EINVAL;
mutex_lock(&group->mutex);
- for_each_group_device(group, device) {
- /*
- * Skip PASID validation for devices without PASID support
- * (max_pasids = 0). These devices cannot issue transactions
- * with PASID, so they don't affect group's PASID usage.
- */
- if ((device->dev->iommu->max_pasids > 0) &&
- (pasid >= device->dev->iommu->max_pasids)) {
- ret = -EINVAL;
- goto out_unlock;
- }
- }
+
+ ret = __iommu_domain_test_device(domain, dev, pasid, NULL);
+ if (ret)
+ goto out_unlock;
entry = iommu_make_pasid_array_entry(domain, handle);
@@ -3573,12 +3620,7 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
if (!group)
return -ENODEV;
-
- if (!domain->ops->set_dev_pasid)
- return -EOPNOTSUPP;
-
- if (!domain_iommu_ops_compatible(dev_iommu_ops(dev), domain) ||
- pasid == IOMMU_NO_PASID || !handle)
+ if (pasid == IOMMU_NO_PASID || !handle)
return -EINVAL;
mutex_lock(&group->mutex);
@@ -3615,6 +3657,11 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
ret = 0;
if (curr_domain != domain) {
+ ret = __iommu_domain_test_device(domain, dev, pasid,
+ curr_domain);
+ if (ret)
+ goto out_unlock;
+
ret = __iommu_set_group_pasid(domain, group,
pasid, curr_domain);
if (ret)
--
2.43.0
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Monday, October 13, 2025 8:05 AM
> @@ -714,7 +714,12 @@ struct iommu_ops {
>
> /**
> * struct iommu_domain_ops - domain specific operations
> - * @attach_dev: attach an iommu domain to a device
> + * @test_dev: Test compatibility prior to an @attach_dev or
> @set_dev_pasid call.
> + * A driver-level callback of this op should do a thorough sanity, to
> + * make sure a device is compatible with the domain. So the following
> + * @attach_dev and @set_dev_pasid functions would likely succeed
> with
> + * only one exception due to a temporary failure like out of memory.
> + * It's suggested to avoid the kernel prints in this op.
> * Return:
> * * 0 - success
> * * EINVAL - can indicate that device and domain are incompatible due
> to
> @@ -722,11 +727,15 @@ struct iommu_ops {
> * driver shouldn't log an error, since it is legitimate for a
> * caller to test reuse of existing domains. Otherwise, it may
> * still represent some other fundamental problem
> - * * ENOMEM - out of memory
> - * * ENOSPC - non-ENOMEM type of resource allocation failures
> * * EBUSY - device is attached to a domain and cannot be changed
> * * ENODEV - device specific errors, not able to be attached
> * * <others> - treated as ENODEV by the caller. Use is discouraged
> + * @attach_dev: attach an iommu domain to a device
> + * Return:
> + * * 0 - success
> + * * ENOMEM - out of memory
> + * * ENOSPC - non-ENOMEM type of resource allocation failures
> + * * <others> - Use is discouraged
It might need more work to meet this requirement. e.g. after patch4
I could still spot other errors easily in the attach path:
intel_iommu_attach_device()
iopf_for_domain_set()
intel_iommu_enable_iopf():
if (!info->pri_enabled)
return -ENODEV;
intel_iommu_attach_device()
dmar_domain_attach_device()
domain_attach_iommu():
curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
NULL, info, GFP_KERNEL);
if (curr) {
ret = xa_err(curr) ? : -EBUSY;
goto err_clear;
}
intel_iommu_attach_device()
dmar_domain_attach_device()
domain_setup_first_level()
__domain_setup_first_level()
intel_pasid_setup_first_level():
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
return -ENODEV;
}
if (pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
return -EBUSY;
}
On the other hand, how do we communicate whatever errors returned
by attach_dev in the reset_done path back to userspace? As noted above
resource allocation failures could still occur in attach_dev, but userspace
may think the requested attach in middle of a reset has succeeded as
long as it passes the test_dev check.
Does it work better to block the attaching process upon ongoing reset
and wake it up later upon reset_done to resume attach?
On Thu, Oct 30, 2025 at 08:47:18AM +0000, Tian, Kevin wrote:
> It might need more work to meet this requirement. e.g. after patch4
> I could still spot other errors easily in the attach path:
>
> intel_iommu_attach_device()
> iopf_for_domain_set()
> intel_iommu_enable_iopf():
>
> if (!info->pri_enabled)
> return -ENODEV;
Yea, I missed that.
> intel_iommu_attach_device()
> dmar_domain_attach_device()
> domain_attach_iommu():
>
> curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
> NULL, info, GFP_KERNEL);
> if (curr) {
> ret = xa_err(curr) ? : -EBUSY;
> goto err_clear;
> }
There is actually an xa_load() in this function:
curr = xa_load(&domain->iommu_array, iommu->seq_id);
if (curr) {
curr->refcnt++;
kfree(info);
return 0;
}
[...]
info->refcnt = 1;
info->did = num;
info->iommu = iommu;
curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
NULL, info, GFP_KERNEL);
if (curr) {
ret = xa_err(curr) ? : -EBUSY;
goto err_clear;
}
It seems that this xa_cmpxchg could be just xa_store()?
> intel_iommu_attach_device()
> dmar_domain_attach_device()
> domain_setup_first_level()
> __domain_setup_first_level()
> intel_pasid_setup_first_level():
Yea. There are a few others in the track also..
> pte = intel_pasid_get_entry(dev, pasid);
> if (!pte) {
> spin_unlock(&iommu->lock);
> return -ENODEV;
> }
>
> if (pasid_pte_is_present(pte)) {
> spin_unlock(&iommu->lock);
> return -EBUSY;
> }
Hmm, this is fenced by iommu->lock and can race with !attach_dev
callbacks. It might be difficult to shift these to test_dev..
> On the other hand, how do we communicate whatever errors returned
> by attach_dev in the reset_done path back to userspace? As noted above
> resource allocation failures could still occur in attach_dev, but userspace
> may think the requested attach in middle of a reset has succeeded as
> long as it passes the test_dev check.
That's a legit point. Jason pointed out that we would end up with
some inconsistency between driver and core as well, at the SMMUv3
patch. So, this test_dev doesn't seemingly solve our problem very
well..
> Does it work better to block the attaching process upon ongoing reset
> and wake it up later upon reset_done to resume attach?
Yea, I think returning -EBUSY would be the simplest solution like
we did in the previous version.
But the concern is that VF might not be aware of a PF reset, so it
can still race an attachment, which would be -EBUSY as well. Then,
if its driver doesn't retry/defer the attach, this might break it?
FWIW, I am thinking of another design based on Jason's remarks:
https://lore.kernel.org/linux-iommu/aQBopHFub8wyQh5C@Asurada-Nvidia/
So, instead of core initiating the round trip between the blocking
domain and group->domain, it forwards dev_reset_prepare/done to the
driver where it does a low-level attachment that wouldn't fail:
For SMMUv3, it's an STE update.
For intel_iommu, it seems to be the context table update?
Then, any concurrent would be allowed to carry on to go through all
the compatibility/sanity checks as usual, but it would bypass the
final step: STE or context table update.
Thanks
Nicolin
On Thu, Oct 30, 2025 at 12:43:59PM -0700, Nicolin Chen wrote: > FWIW, I am thinking of another design based on Jason's remarks: > https://lore.kernel.org/linux-iommu/aQBopHFub8wyQh5C@Asurada-Nvidia/ > > So, instead of core initiating the round trip between the blocking > domain and group->domain, it forwards dev_reset_prepare/done to the > driver where it does a low-level attachment that wouldn't fail: > For SMMUv3, it's an STE update. > For intel_iommu, it seems to be the context table update? Kevin, how bad do you think the UAPI issue is if we ignore it? Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Tuesday, November 4, 2025 2:54 AM > > On Thu, Oct 30, 2025 at 12:43:59PM -0700, Nicolin Chen wrote: > > > FWIW, I am thinking of another design based on Jason's remarks: > > https://lore.kernel.org/linux-iommu/aQBopHFub8wyQh5C@Asurada- > Nvidia/ > > > > So, instead of core initiating the round trip between the blocking > > domain and group->domain, it forwards dev_reset_prepare/done to the > > driver where it does a low-level attachment that wouldn't fail: > > For SMMUv3, it's an STE update. > > For intel_iommu, it seems to be the context table update? > > Kevin, how bad do you think the UAPI issue is if we ignore it? > yeah probably better to leave it. I didn't see a clean way and the value didn't justify the complexity. Regarding to PF reset, it's a devastating operation while the vf user is operating the vf w/o any awareness. there must be certain coordination in userspace. otherwise nobody can recover the registers. Comparing to that, solving the domain attach problem is less important...
On Wed, Nov 05, 2025 at 06:57:31AM +0000, Tian, Kevin wrote: > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Tuesday, November 4, 2025 2:54 AM > > > > On Thu, Oct 30, 2025 at 12:43:59PM -0700, Nicolin Chen wrote: > > > > > FWIW, I am thinking of another design based on Jason's remarks: > > > https://lore.kernel.org/linux-iommu/aQBopHFub8wyQh5C@Asurada- > > Nvidia/ > > > > > > So, instead of core initiating the round trip between the blocking > > > domain and group->domain, it forwards dev_reset_prepare/done to the > > > driver where it does a low-level attachment that wouldn't fail: > > > For SMMUv3, it's an STE update. > > > For intel_iommu, it seems to be the context table update? > > > > Kevin, how bad do you think the UAPI issue is if we ignore it? > > > > yeah probably better to leave it. I didn't see a clean way and the > value didn't justify the complexity. > > Regarding to PF reset, it's a devastating operation while the vf user > is operating the vf w/o any awareness. there must be certain > coordination in userspace. otherwise nobody can recover the > registers. Comparing to that, solving the domain attach problem > is less important... If I capture these correctly, we should go with a -EBUSY version: - Reject concurrent attachments during a device reset - Skip reset for devices having sibling group devices - Allow PF to stop IOMMU, ignoring VFs ? That sounds pretty much like this v4: https://lore.kernel.org/linux-iommu/0f6021b500c74db33af8118210dd7a2b2fd31b3c.1756682135.git.nicolinc@nvidia.com/ by dropping the SRIOV concern. Thanks Nicolin
On Wed, Nov 05, 2025 at 10:18:01AM -0800, Nicolin Chen wrote: > On Wed, Nov 05, 2025 at 06:57:31AM +0000, Tian, Kevin wrote: > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > Sent: Tuesday, November 4, 2025 2:54 AM > > > > > > On Thu, Oct 30, 2025 at 12:43:59PM -0700, Nicolin Chen wrote: > > > > > > > FWIW, I am thinking of another design based on Jason's remarks: > > > > https://lore.kernel.org/linux-iommu/aQBopHFub8wyQh5C@Asurada- > > > Nvidia/ > > > > > > > > So, instead of core initiating the round trip between the blocking > > > > domain and group->domain, it forwards dev_reset_prepare/done to the > > > > driver where it does a low-level attachment that wouldn't fail: > > > > For SMMUv3, it's an STE update. > > > > For intel_iommu, it seems to be the context table update? > > > > > > Kevin, how bad do you think the UAPI issue is if we ignore it? > > > > > > > yeah probably better to leave it. I didn't see a clean way and the > > value didn't justify the complexity. > > > > Regarding to PF reset, it's a devastating operation while the vf user > > is operating the vf w/o any awareness. there must be certain > > coordination in userspace. otherwise nobody can recover the > > registers. Comparing to that, solving the domain attach problem > > is less important... > > If I capture these correctly, we should go with a -EBUSY version: > - Reject concurrent attachments during a device reset > - Skip reset for devices having sibling group devices > - Allow PF to stop IOMMU, ignoring VFs > ? > > That sounds pretty much like this v4: > https://lore.kernel.org/linux-iommu/0f6021b500c74db33af8118210dd7a2b2fd31b3c.1756682135.git.nicolinc@nvidia.com/ > by dropping the SRIOV concern. It seems like the simplest answer.. I'd ignore the VFs, I think it is already really weird/dangerous to be resetting the PF while VFs have drivers bound.. Not sure there is anything we can do to make this work better. Jason
On Fri, Nov 07, 2025 at 02:54:09PM -0400, Jason Gunthorpe wrote: > On Wed, Nov 05, 2025 at 10:18:01AM -0800, Nicolin Chen wrote: > > On Wed, Nov 05, 2025 at 06:57:31AM +0000, Tian, Kevin wrote: > > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > > Sent: Tuesday, November 4, 2025 2:54 AM > > > > > > > > On Thu, Oct 30, 2025 at 12:43:59PM -0700, Nicolin Chen wrote: > > > > > > > > > FWIW, I am thinking of another design based on Jason's remarks: > > > > > https://lore.kernel.org/linux-iommu/aQBopHFub8wyQh5C@Asurada- > > > > Nvidia/ > > > > > > > > > > So, instead of core initiating the round trip between the blocking > > > > > domain and group->domain, it forwards dev_reset_prepare/done to the > > > > > driver where it does a low-level attachment that wouldn't fail: > > > > > For SMMUv3, it's an STE update. > > > > > For intel_iommu, it seems to be the context table update? > > > > > > > > Kevin, how bad do you think the UAPI issue is if we ignore it? > > > > > > > > > > yeah probably better to leave it. I didn't see a clean way and the > > > value didn't justify the complexity. > > > > > > Regarding to PF reset, it's a devastating operation while the vf user > > > is operating the vf w/o any awareness. there must be certain > > > coordination in userspace. otherwise nobody can recover the > > > registers. Comparing to that, solving the domain attach problem > > > is less important... > > > > If I capture these correctly, we should go with a -EBUSY version: > > - Reject concurrent attachments during a device reset > > - Skip reset for devices having sibling group devices > > - Allow PF to stop IOMMU, ignoring VFs > > ? > > > > That sounds pretty much like this v4: > > https://lore.kernel.org/linux-iommu/0f6021b500c74db33af8118210dd7a2b2fd31b3c.1756682135.git.nicolinc@nvidia.com/ > > by dropping the SRIOV concern. > > It seems like the simplest answer.. > > I'd ignore the VFs, I think it is already really weird/dangerous to be > resetting the PF while VFs have drivers bound.. Not sure there is > anything we can do to make this work better. Ack. I've rebased that and will do some cleanup and run sanity tests before sending. Thanks Nicolin
On Sun, Oct 12, 2025 at 05:04:59PM -0700, Nicolin Chen wrote:
> @@ -3479,38 +3545,19 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
...
> - for_each_group_device(group, device) {
> - /*
> - * Skip PASID validation for devices without PASID support
> - * (max_pasids = 0). These devices cannot issue transactions
> - * with PASID, so they don't affect group's PASID usage.
> - */
> - if ((device->dev->iommu->max_pasids > 0) &&
> - (pasid >= device->dev->iommu->max_pasids)) {
> - ret = -EINVAL;
> - goto out_unlock;
> - }
> - }
> +
> + ret = __iommu_domain_test_device(domain, dev, pasid, NULL);
I realized that here it needs to be under for_each_group_device,
as its following __iommu_set_group_pasid() calls attach_dev() on
every group_device. So, the new code should run a test_dev() on
every group_device to keep the consistency.
> @@ -3615,6 +3657,11 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
> ret = 0;
>
> if (curr_domain != domain) {
> + ret = __iommu_domain_test_device(domain, dev, pasid,
> + curr_domain);
> + if (ret)
> + goto out_unlock;
> +
> ret = __iommu_set_group_pasid(domain, group,
> pasid, curr_domain);
Needs the same fix above.
Nicolin
On Sun, Oct 12, 2025 at 05:04:59PM -0700, Nicolin Chen wrote:
> And keep them within the group->mutex, so drivers can simply move all the
> sanity and compatibility tests from their attach_dev callbacks to the new
> test_dev callbacks without concerning about a race condition.
I'm not sure about this.. For the problem we are trying to solve this
would be racy as the test would be done and the group mutex
unlocked. Then later it will be re-tested and attached.
> /**
> * struct iommu_domain_ops - domain specific operations
> - * @attach_dev: attach an iommu domain to a device
> + * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call.
> + * A driver-level callback of this op should do a thorough sanity, to
> + * make sure a device is compatible with the domain. So the following
> + * @attach_dev and @set_dev_pasid functions would likely succeed with
> + * only one exception due to a temporary failure like out of memory.
> + * It's suggested to avoid the kernel prints in this op.
This is a general rule, even normal attach should be following it or
iommufd will be spamming the log.
> * Return:
> * * 0 - success
> * * EINVAL - can indicate that device and domain are incompatible due to
> @@ -722,11 +727,15 @@ struct iommu_ops {
> * driver shouldn't log an error, since it is legitimate for a
> * caller to test reuse of existing domains. Otherwise, it may
> * still represent some other fundamental problem
> - * * ENOMEM - out of memory
> - * * ENOSPC - non-ENOMEM type of resource allocation failures
> * * EBUSY - device is attached to a domain and cannot be changed
> * * ENODEV - device specific errors, not able to be attached
> * * <others> - treated as ENODEV by the caller. Use is discouraged
> + * @attach_dev: attach an iommu domain to a device
> + * Return:
> + * * 0 - success
> + * * ENOMEM - out of memory
> + * * ENOSPC - non-ENOMEM type of resource allocation failures
> + * * <others> - Use is discouraged
> * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of
> * the device should be left in the old config in error case.
> * @map_pages: map a physically contiguous set of pages of the same size to
> @@ -751,6 +760,8 @@ struct iommu_ops {
> * @free: Release the domain after use.
> */
> struct iommu_domain_ops {
> + int (*test_dev)(struct iommu_domain *domain, struct device *dev,
> + ioasid_t pasid, struct iommu_domain *old);
Because of the starting remark I'm skeptical that old should be
included here.
The rest looks OK
Jason
On Mon, Oct 20, 2025 at 01:27:36PM -0300, Jason Gunthorpe wrote:
> On Sun, Oct 12, 2025 at 05:04:59PM -0700, Nicolin Chen wrote:
>
> > And keep them within the group->mutex, so drivers can simply move all the
> > sanity and compatibility tests from their attach_dev callbacks to the new
> > test_dev callbacks without concerning about a race condition.
>
> I'm not sure about this.. For the problem we are trying to solve this
> would be racy as the test would be done and the group mutex
> unlocked. Then later it will be re-tested and attached.
Oh right, we'll have to retest in iommu_dev_reset_done(). I missed
that.
> > @@ -751,6 +760,8 @@ struct iommu_ops {
> > * @free: Release the domain after use.
> > */
> > struct iommu_domain_ops {
> > + int (*test_dev)(struct iommu_domain *domain, struct device *dev,
> > + ioasid_t pasid, struct iommu_domain *old);
>
> Because of the starting remark I'm skeptical that old should be
> included here.
Hmm, the followings functions sanitizes "old":
- qcom_iommu_identity_attach() drivers/iommu/arm/arm-smmu/qcom_iommu.c
- iommu_sva_set_dev_pasid() in drivers/iommu/amd/pasid.c
Thanks
Nicolin
On Mon, Oct 20, 2025 at 11:51:49AM -0700, Nicolin Chen wrote:
> On Mon, Oct 20, 2025 at 01:27:36PM -0300, Jason Gunthorpe wrote:
> > On Sun, Oct 12, 2025 at 05:04:59PM -0700, Nicolin Chen wrote:
> >
> > > And keep them within the group->mutex, so drivers can simply move all the
> > > sanity and compatibility tests from their attach_dev callbacks to the new
> > > test_dev callbacks without concerning about a race condition.
> >
> > I'm not sure about this.. For the problem we are trying to solve this
> > would be racy as the test would be done and the group mutex
> > unlocked. Then later it will be re-tested and attached.
>
> Oh right, we'll have to retest in iommu_dev_reset_done(). I missed
> that.
>
> > > @@ -751,6 +760,8 @@ struct iommu_ops {
> > > * @free: Release the domain after use.
> > > */
> > > struct iommu_domain_ops {
> > > + int (*test_dev)(struct iommu_domain *domain, struct device *dev,
> > > + ioasid_t pasid, struct iommu_domain *old);
> >
> > Because of the starting remark I'm skeptical that old should be
> > included here.
>
> Hmm, the followings functions sanitizes "old":
> - qcom_iommu_identity_attach() drivers/iommu/arm/arm-smmu/qcom_iommu.c
That shouldn't be copied over to test??
if (domain == identity_domain || !domain)
return 0;
That is just optimizing away the attach if it has nothing to do
qcom_domain = to_qcom_iommu_domain(domain);
if (WARN_ON(!qcom_domain->iommu))
return -EINVAL;
That can't never happen
> - iommu_sva_set_dev_pasid() in drivers/iommu/amd/pasid.c
Its broken, you are not required by API to detach a domain before
setting a new one. Keep it in attach, hope someone fixes this driver
someday.
Jason
On Mon, Oct 27, 2025 at 08:23:10PM -0300, Jason Gunthorpe wrote:
> On Mon, Oct 20, 2025 at 11:51:49AM -0700, Nicolin Chen wrote:
> > On Mon, Oct 20, 2025 at 01:27:36PM -0300, Jason Gunthorpe wrote:
> > > On Sun, Oct 12, 2025 at 05:04:59PM -0700, Nicolin Chen wrote:
> > > > @@ -751,6 +760,8 @@ struct iommu_ops {
> > > > * @free: Release the domain after use.
> > > > */
> > > > struct iommu_domain_ops {
> > > > + int (*test_dev)(struct iommu_domain *domain, struct device *dev,
> > > > + ioasid_t pasid, struct iommu_domain *old);
> > >
> > > Because of the starting remark I'm skeptical that old should be
> > > included here.
> >
> > Hmm, the followings functions sanitizes "old":
> > - qcom_iommu_identity_attach() drivers/iommu/arm/arm-smmu/qcom_iommu.c
>
> That shouldn't be copied over to test??
>
> if (domain == identity_domain || !domain)
> return 0;
>
> That is just optimizing away the attach if it has nothing to do
>
> qcom_domain = to_qcom_iommu_domain(domain);
> if (WARN_ON(!qcom_domain->iommu))
> return -EINVAL;
>
> That can't never happen
>
> > - iommu_sva_set_dev_pasid() in drivers/iommu/amd/pasid.c
>
> Its broken, you are not required by API to detach a domain before
> setting a new one. Keep it in attach, hope someone fixes this driver
> someday.
OK. I am leaving them alone. And no more old passed to test_dev.
Thanks
Nicolin
On Sun, 2025-10-12 at 17:04 -0700, Nicolin Chen wrote:
> Add a new test_dev domain op for driver to test the compatibility between
> a domain and a device at the driver level, before calling into the actual
> attachment/replacement of a domain. Support pasid for set_dev_pasid call.
>
> Move existing core-level compatibility tests to a helper function. Invoke
> it prior to:
> * __iommu_attach_device() or its wrapper __iommu_device_set_domain()
> * __iommu_set_group_pasid()
Should this list also include iommu_deferred_attach()? The code does
include it.
>
> And keep them within the group->mutex, so drivers can simply move all the
> sanity and compatibility tests from their attach_dev callbacks to the new
> test_dev callbacks without concerning about a race condition.
>
> This may be a public API someday for VFIO/IOMMUFD to run a list of attach
> tests without doing any actual attachment, which may result in a list of
> failed tests. So encourage drivers to avoid printks to prevent kernel log
> spam.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
> include/linux/iommu.h | 17 +++++--
> drivers/iommu/iommu.c | 111 ++++++++++++++++++++++++++++++------------
> 2 files changed, 93 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 801b2bd9e8d49..2ec99502dc29c 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -714,7 +714,12 @@ struct iommu_ops {
>
> /**
> * struct iommu_domain_ops - domain specific operations
> - * @attach_dev: attach an iommu domain to a device
> + * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call.
> + * A driver-level callback of this op should do a thorough sanity, to
You're missing the word "check" above.
> + * make sure a device is compatible with the domain. So the following
> + * @attach_dev and @set_dev_pasid functions would likely succeed with
> + * only one exception due to a temporary failure like out of memory.
Nit: "… only one exception …" / "… like out of memory …" this sounds a
bit odd to me because on the one hand it's one exception but then also
a group (temporary failures).
Maybe better:
"… would likely succeed with only the exception of temporary failures
like out of memory."?
> + * It's suggested to avoid the kernel prints in this op.
> * Return:
> * * 0 - success
> * * EINVAL - can indicate that device and domain are incompatible due to
> @@ -722,11 +727,15 @@ struct iommu_ops {
> * driver shouldn't log an error, since it is legitimate for a
> * caller to test reuse of existing domains. Otherwise, it may
> * still represent some other fundamental problem
> - * * ENOMEM - out of memory
> - * * ENOSPC - non-ENOMEM type of resource allocation failures
> * * EBUSY - device is attached to a domain and cannot be changed
> * * ENODEV - device specific errors, not able to be attached
> * * <others> - treated as ENODEV by the caller. Use is discouraged
> + * @attach_dev: attach an iommu domain to a device
> + * Return:
> + * * 0 - success
> + * * ENOMEM - out of memory
> + * * ENOSPC - non-ENOMEM type of resource allocation failures
> + * * <others> - Use is discouraged
> * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of
> * the device should be left in the old config in error case.
> * @map_pages: map a physically contiguous set of pages of the same size to
> @@ -751,6 +760,8 @@ struct iommu_ops {
> * @free: Release the domain after use.
> */
> struct iommu_domain_ops {
> + int (*test_dev)(struct iommu_domain *domain, struct device *dev,
> + ioasid_t pasid, struct iommu_domain *old);
> int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
> struct iommu_domain *old);
> int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
>
--- snip ---
> @@ -3615,6 +3657,11 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
> ret = 0;
>
> if (curr_domain != domain) {
> + ret = __iommu_domain_test_device(domain, dev, pasid,
> + curr_domain);
> + if (ret)
> + goto out_unlock;
> +
> ret = __iommu_set_group_pasid(domain, group,
> pasid, curr_domain);
> if (ret)
Apart from the comment and commit description nits mentioned above this
looks good to me.
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Hi Niklas,
On Mon, Oct 13, 2025 at 11:53:55AM +0200, Niklas Schnelle wrote:
> On Sun, 2025-10-12 at 17:04 -0700, Nicolin Chen wrote:
> > Add a new test_dev domain op for driver to test the compatibility between
> > a domain and a device at the driver level, before calling into the actual
> > attachment/replacement of a domain. Support pasid for set_dev_pasid call.
> >
> > Move existing core-level compatibility tests to a helper function. Invoke
> > it prior to:
> > * __iommu_attach_device() or its wrapper __iommu_device_set_domain()
> > * __iommu_set_group_pasid()
>
> Should this list also include iommu_deferred_attach()? The code does
> include it.
iommu_deferred_attach() invokes __iommu_attach_device(), so it is
already included in the list :)
> > /**
> > * struct iommu_domain_ops - domain specific operations
> > - * @attach_dev: attach an iommu domain to a device
> > + * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call.
> > + * A driver-level callback of this op should do a thorough sanity, to
>
> You're missing the word "check" above.
Ack.
> > + * make sure a device is compatible with the domain. So the following
> > + * @attach_dev and @set_dev_pasid functions would likely succeed with
> > + * only one exception due to a temporary failure like out of memory.
>
> Nit: "… only one exception …" / "… like out of memory …" this sounds a
> bit odd to me because on the one hand it's one exception but then also
> a group (temporary failures).
>
> Maybe better:
> "… would likely succeed with only the exception of temporary failures
> like out of memory."?
Sure. I can do that. Fixing both parts, it would be:
* @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call.
* A driver callback of this op should do a thorough sanity check, to
* make sure a device is compatible with the domain, so the following
* @attach_dev and @set_dev_pasid functions would likely succeed with
* only the exception of temporary failures like out of memory.
> --- snip ---
> > @@ -3615,6 +3657,11 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
> > ret = 0;
> >
> > if (curr_domain != domain) {
> > + ret = __iommu_domain_test_device(domain, dev, pasid,
> > + curr_domain);
> > + if (ret)
> > + goto out_unlock;
> > +
> > ret = __iommu_set_group_pasid(domain, group,
> > pasid, curr_domain);
> > if (ret)
>
> Apart from the comment and commit description nits mentioned above this
> looks good to me.
>
> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Thanks for the review!
Nicolin
On Mon, 2025-10-13 at 10:22 -0700, Nicolin Chen wrote: > Hi Niklas, > > On Mon, Oct 13, 2025 at 11:53:55AM +0200, Niklas Schnelle wrote: > > On Sun, 2025-10-12 at 17:04 -0700, Nicolin Chen wrote: > > > Add a new test_dev domain op for driver to test the compatibility between > > > a domain and a device at the driver level, before calling into the actual > > > attachment/replacement of a domain. Support pasid for set_dev_pasid call. > > > > > > Move existing core-level compatibility tests to a helper function. Invoke > > > it prior to: > > > * __iommu_attach_device() or its wrapper __iommu_device_set_domain() > > > * __iommu_set_group_pasid() > > > > Should this list also include iommu_deferred_attach()? The code does > > include it. > > iommu_deferred_attach() invokes __iommu_attach_device(), so it is > already included in the list :) Ok makes sense, though it does list __iommu_device_set_domain() separately. Either way is fine for me. > > > > /** > > > * struct iommu_domain_ops - domain specific operations > > > - * @attach_dev: attach an iommu domain to a device > > > + * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call. > > > + * A driver-level callback of this op should do a thorough sanity, to > > > > You're missing the word "check" above. > > Ack. > > > > + * make sure a device is compatible with the domain. So the following > > > + * @attach_dev and @set_dev_pasid functions would likely succeed with > > > + * only one exception due to a temporary failure like out of memory. > > > > Nit: "… only one exception …" / "… like out of memory …" this sounds a > > bit odd to me because on the one hand it's one exception but then also > > a group (temporary failures). > > > > Maybe better: > > "… would likely succeed with only the exception of temporary failures > > like out of memory."? > > Sure. I can do that. Fixing both parts, it would be: > > * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call. > * A driver callback of this op should do a thorough sanity check, to > * make sure a device is compatible with the domain, so the following > * @attach_dev and @set_dev_pasid functions would likely succeed with > * only the exception of temporary failures like out of memory. > Sounds good, thanks!
© 2016 - 2025 Red Hat, Inc.