iommu: Introduce and roll out test_dev domain op

[PATCH v1 02/20] iommu: Introduce a test_dev domain op and an internal helper

Posted by Nicolin Chen 3 months, 4 weeks ago

Add a new test_dev domain op for driver to test the compatibility between
a domain and a device at the driver level, before calling into the actual
attachment/replacement of a domain. Support pasid for set_dev_pasid call.

Move existing core-level compatibility tests to a helper function. Invoke
it prior to:
 * __iommu_attach_device() or its wrapper __iommu_device_set_domain()
 * __iommu_set_group_pasid()

And keep them within the group->mutex, so drivers can simply move all the
sanity and compatibility tests from their attach_dev callbacks to the new
test_dev callbacks without concerning about a race condition.

This may be a public API someday for VFIO/IOMMUFD to run a list of attach
tests without doing any actual attachment, which may result in a list of
failed tests. So encourage drivers to avoid printks to prevent kernel log
spam.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 include/linux/iommu.h |  17 +++++--
 drivers/iommu/iommu.c | 111 ++++++++++++++++++++++++++++++------------
 2 files changed, 93 insertions(+), 35 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 801b2bd9e8d49..2ec99502dc29c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -714,7 +714,12 @@ struct iommu_ops {
 
 /**
  * struct iommu_domain_ops - domain specific operations
- * @attach_dev: attach an iommu domain to a device
+ * @test_dev: Test compatibility prior to an @attach_dev or @set_dev_pasid call.
+ *            A driver-level callback of this op should do a thorough sanity, to
+ *            make sure a device is compatible with the domain. So the following
+ *            @attach_dev and @set_dev_pasid functions would likely succeed with
+ *            only one exception due to a temporary failure like out of memory.
+ *            It's suggested to avoid the kernel prints in this op.
  *  Return:
  * * 0		- success
  * * EINVAL	- can indicate that device and domain are incompatible due to
@@ -722,11 +727,15 @@ struct iommu_ops {
  *		  driver shouldn't log an error, since it is legitimate for a
  *		  caller to test reuse of existing domains. Otherwise, it may
  *		  still represent some other fundamental problem
- * * ENOMEM	- out of memory
- * * ENOSPC	- non-ENOMEM type of resource allocation failures
  * * EBUSY	- device is attached to a domain and cannot be changed
  * * ENODEV	- device specific errors, not able to be attached
  * * <others>	- treated as ENODEV by the caller. Use is discouraged
+ * @attach_dev: attach an iommu domain to a device
+ *  Return:
+ * * 0		- success
+ * * ENOMEM	- out of memory
+ * * ENOSPC	- non-ENOMEM type of resource allocation failures
+ * * <others>	- Use is discouraged
  * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of
  *                 the device should be left in the old config in error case.
  * @map_pages: map a physically contiguous set of pages of the same size to
@@ -751,6 +760,8 @@ struct iommu_ops {
  * @free: Release the domain after use.
  */
 struct iommu_domain_ops {
+	int (*test_dev)(struct iommu_domain *domain, struct device *dev,
+			ioasid_t pasid, struct iommu_domain *old);
 	int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
 			  struct iommu_domain *old);
 	int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 170e522b5bda4..95f0e2898b6b5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -99,6 +99,9 @@ static int bus_iommu_probe(const struct bus_type *bus);
 static int iommu_bus_notifier(struct notifier_block *nb,
 			      unsigned long action, void *data);
 static void iommu_release_device(struct device *dev);
+static int __iommu_domain_test_device(struct iommu_domain *domain,
+				      struct device *dev, ioasid_t pasid,
+				      struct iommu_domain *old);
 static int __iommu_attach_device(struct iommu_domain *domain,
 				 struct device *dev, struct iommu_domain *old);
 static int __iommu_attach_group(struct iommu_domain *domain,
@@ -642,6 +645,10 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
 	if (group->default_domain)
 		iommu_create_device_direct_mappings(group->default_domain, dev);
 	if (group->domain) {
+		ret = __iommu_domain_test_device(group->domain, dev,
+						 IOMMU_NO_PASID, NULL);
+		if (ret)
+			goto err_remove_gdev;
 		ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
 						0);
 		if (ret)
@@ -2185,6 +2192,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
 
 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
 {
+	int ret;
+
 	/*
 	 * This is called on the dma mapping fast path so avoid locking. This is
 	 * racy, but we have an expectation that the driver will setup its DMAs
@@ -2195,6 +2204,10 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
 
 	guard(mutex)(&dev->iommu_group->mutex);
 
+	ret = __iommu_domain_test_device(domain, dev, IOMMU_NO_PASID, NULL);
+	if (ret)
+		return ret;
+
 	return __iommu_attach_device(domain, dev, NULL);
 }
 
@@ -2262,6 +2275,53 @@ static bool domain_iommu_ops_compatible(const struct iommu_ops *ops,
 	return false;
 }
 
+/*
+ * Test if a future attach for a domain to the device will always fail. This may
+ * indicate that the device and the domain are incompatible in some way. Actual
+ * attach may still fail for some temporary failure like out of memory.
+ *
+ * If pasid != IOMMU_NO_PASID, it is meant to test a future set_dev_pasid call.
+ */
+static int __iommu_domain_test_device(struct iommu_domain *domain,
+				      struct device *dev, ioasid_t pasid,
+				      struct iommu_domain *old)
+{
+	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	struct iommu_group *group = dev->iommu_group;
+
+	lockdep_assert_held(&group->mutex);
+
+	if (!domain_iommu_ops_compatible(ops, domain))
+		return -EINVAL;
+
+	if (pasid != IOMMU_NO_PASID) {
+		struct group_device *gdev;
+
+		if (!domain->ops->set_dev_pasid || !ops->blocked_domain ||
+		    !ops->blocked_domain->ops->set_dev_pasid)
+			return -EOPNOTSUPP;
+		/*
+		 * Skip PASID validation for devices without PASID support
+		 * (max_pasids = 0). These devices cannot issue transactions
+		 * with PASID, so they don't affect group's PASID usage.
+		 */
+		for_each_group_device(group, gdev) {
+			if (gdev->dev->iommu->max_pasids > 0 &&
+			    pasid >= gdev->dev->iommu->max_pasids)
+				return -EINVAL;
+		}
+	}
+
+	/*
+	 * Domains that don't implement a test_dev callback function must have a
+	 * simple domain attach behavior. The sanity above should be enough.
+	 */
+	if (!domain->ops->test_dev)
+		return 0;
+
+	return domain->ops->test_dev(domain, dev, pasid, old);
+}
+
 static int __iommu_attach_group(struct iommu_domain *domain,
 				struct iommu_group *group)
 {
@@ -2272,8 +2332,7 @@ static int __iommu_attach_group(struct iommu_domain *domain,
 		return -EBUSY;
 
 	dev = iommu_group_first_dev(group);
-	if (!dev_has_iommu(dev) ||
-	    !domain_iommu_ops_compatible(dev_iommu_ops(dev), domain))
+	if (!dev_has_iommu(dev))
 		return -EINVAL;
 
 	return __iommu_group_set_domain(group, domain);
@@ -2381,6 +2440,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
 	if (WARN_ON(!new_domain))
 		return -EINVAL;
 
+	for_each_group_device(group, gdev) {
+		ret = __iommu_domain_test_device(new_domain, gdev->dev,
+						 IOMMU_NO_PASID, group->domain);
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * Changing the domain is done by calling attach_dev() on the new
 	 * domain. This switch does not have to be atomic and DMA can be
@@ -3479,38 +3545,19 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
 {
 	/* Caller must be a probed driver on dev */
 	struct iommu_group *group = dev->iommu_group;
-	struct group_device *device;
-	const struct iommu_ops *ops;
 	void *entry;
 	int ret;
 
 	if (!group)
 		return -ENODEV;
-
-	ops = dev_iommu_ops(dev);
-
-	if (!domain->ops->set_dev_pasid ||
-	    !ops->blocked_domain ||
-	    !ops->blocked_domain->ops->set_dev_pasid)
-		return -EOPNOTSUPP;
-
-	if (!domain_iommu_ops_compatible(ops, domain) ||
-	    pasid == IOMMU_NO_PASID)
+	if (pasid == IOMMU_NO_PASID)
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
-	for_each_group_device(group, device) {
-		/*
-		 * Skip PASID validation for devices without PASID support
-		 * (max_pasids = 0). These devices cannot issue transactions
-		 * with PASID, so they don't affect group's PASID usage.
-		 */
-		if ((device->dev->iommu->max_pasids > 0) &&
-		    (pasid >= device->dev->iommu->max_pasids)) {
-			ret = -EINVAL;
-			goto out_unlock;
-		}
-	}
+
+	ret = __iommu_domain_test_device(domain, dev, pasid, NULL);
+	if (ret)
+		goto out_unlock;
 
 	entry = iommu_make_pasid_array_entry(domain, handle);
 
@@ -3573,12 +3620,7 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
 
 	if (!group)
 		return -ENODEV;
-
-	if (!domain->ops->set_dev_pasid)
-		return -EOPNOTSUPP;
-
-	if (!domain_iommu_ops_compatible(dev_iommu_ops(dev), domain) ||
-	    pasid == IOMMU_NO_PASID || !handle)
+	if (pasid == IOMMU_NO_PASID || !handle)
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
@@ -3615,6 +3657,11 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
 	ret = 0;
 
 	if (curr_domain != domain) {
+		ret = __iommu_domain_test_device(domain, dev, pasid,
+						 curr_domain);
+		if (ret)
+			goto out_unlock;
+
 		ret = __iommu_set_group_pasid(domain, group,
 					      pasid, curr_domain);
 		if (ret)
-- 
2.43.0

RE: [PATCH v1 02/20] iommu: Introduce a test_dev domain op and an internal helper

Posted by Tian, Kevin 3 months, 1 week ago

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Monday, October 13, 2025 8:05 AM
> @@ -714,7 +714,12 @@ struct iommu_ops {
> 
>  /**
>   * struct iommu_domain_ops - domain specific operations
> - * @attach_dev: attach an iommu domain to a device
> + * @test_dev: Test compatibility prior to an @attach_dev or
> @set_dev_pasid call.
> + *            A driver-level callback of this op should do a thorough sanity, to
> + *            make sure a device is compatible with the domain. So the following
> + *            @attach_dev and @set_dev_pasid functions would likely succeed
> with
> + *            only one exception due to a temporary failure like out of memory.
> + *            It's suggested to avoid the kernel prints in this op.
>   *  Return:
>   * * 0		- success
>   * * EINVAL	- can indicate that device and domain are incompatible due
> to
> @@ -722,11 +727,15 @@ struct iommu_ops {
>   *		  driver shouldn't log an error, since it is legitimate for a
>   *		  caller to test reuse of existing domains. Otherwise, it may
>   *		  still represent some other fundamental problem
> - * * ENOMEM	- out of memory
> - * * ENOSPC	- non-ENOMEM type of resource allocation failures
>   * * EBUSY	- device is attached to a domain and cannot be changed
>   * * ENODEV	- device specific errors, not able to be attached
>   * * <others>	- treated as ENODEV by the caller. Use is discouraged
> + * @attach_dev: attach an iommu domain to a device
> + *  Return:
> + * * 0		- success
> + * * ENOMEM	- out of memory
> + * * ENOSPC	- non-ENOMEM type of resource allocation failures
> + * * <others>	- Use is discouraged

It might need more work to meet this requirement. e.g. after patch4
I could still spot other errors easily in the attach path:

intel_iommu_attach_device()
  iopf_for_domain_set()
    intel_iommu_enable_iopf():

        if (!info->pri_enabled)
                return -ENODEV;

intel_iommu_attach_device()
  dmar_domain_attach_device()
    domain_attach_iommu():
      
       curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
                          NULL, info, GFP_KERNEL);
        if (curr) {
                ret = xa_err(curr) ? : -EBUSY;
                goto err_clear;
        }

intel_iommu_attach_device()
  dmar_domain_attach_device()
    domain_setup_first_level()
      __domain_setup_first_level()
        intel_pasid_setup_first_level():

        pte = intel_pasid_get_entry(dev, pasid);
        if (!pte) {
                spin_unlock(&iommu->lock);
                return -ENODEV;
        }

        if (pasid_pte_is_present(pte)) {
                spin_unlock(&iommu->lock);
                return -EBUSY;
        }

On the other hand, how do we communicate whatever errors returned
by attach_dev in the reset_done path back to userspace? As noted above
resource allocation failures could still occur in attach_dev, but userspace
may think the requested attach in middle of a reset has succeeded as
long as it passes the test_dev check.

Does it work better to block the attaching process upon ongoing reset
and wake it up later upon reset_done to resume attach?