[PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy

Xu Yilun posted 4 patches 3 months, 2 weeks ago
There is a newer version of this series
[PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
vdev can't outlive idev.

iommufd_device(idev) represents the physical device bound to iommufd,
while the iommufd_vdevice(vdev) represents the virtual instance of the
physical device in the VM. The lifecycle of the vdev should not be
longer than idev. This doesn't cause real problem on existing use cases
cause vdev doesn't impact the physical device, only provides
virtualization information. But to extend vdev for Confidential
Computing(CC), there are needs to do secure configuration for the vdev,
e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
destroy, or the external driver(VFIO) functionality may be impact.

Building the association between idev & vdev requires the two objects
pointing each other, but not referencing each other. This requires
proper locking. This is done by reviving some of Nicolin's patch [1].

There are 3 cases on idev destroy:

  1. vdev is already destroyed by userspace. No extra handling needed.
  2. vdev is still alive. Use iommufd_object_tombstone_user() to
     destroy vdev and tombstone the vdev ID.
  3. vdev is being destroyed by userspace. The vdev ID is already freed,
     but vdev destroy handler is not complete. The idev destroy handler
     should wait for vdev destroy completion.

[1]: https://lore.kernel.org/all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/

Original-by: Nicolin Chen <nicolinc@nvidia.com>
Original-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 drivers/iommu/iommufd/device.c          | 43 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h | 11 +++++++
 drivers/iommu/iommufd/main.c            |  1 +
 drivers/iommu/iommufd/viommu.c          | 33 +++++++++++++++++--
 4 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 86244403b532..908a94a27bab 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -137,11 +137,54 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
 	}
 }
 
+static void iommufd_device_remove_vdev(struct iommufd_device *idev)
+{
+	bool vdev_removing = false;
+
+	mutex_lock(&idev->igroup->lock);
+	if (idev->vdev) {
+		struct iommufd_vdevice *vdev;
+
+		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
+		if (IS_ERR(vdev)) {
+			/* vdev is removed from xarray, but is not destroyed/freed */
+			vdev_removing = true;
+			goto unlock;
+		}
+
+		/* Should never happen */
+		if (WARN_ON(vdev != idev->vdev)) {
+			idev->vdev = NULL;
+			iommufd_put_object(idev->ictx, &vdev->obj);
+			goto unlock;
+		}
+
+		/*
+		 * vdev cannot be destroyed after refcount_inc, safe to release
+		 * idev->igroup->lock and use idev->vdev afterward.
+		 */
+		refcount_inc(&idev->vdev->obj.users);
+		iommufd_put_object(idev->ictx, &idev->vdev->obj);
+	}
+unlock:
+	mutex_unlock(&idev->igroup->lock);
+
+	if (vdev_removing) {
+		if (!wait_event_timeout(idev->ictx->destroy_wait,
+					!idev->vdev,
+					msecs_to_jiffies(60000)))
+			pr_crit("Time out waiting for iommufd vdevice removed\n");
+	} else if (idev->vdev) {
+		iommufd_object_tombstone_user(idev->ictx, &idev->vdev->obj);
+	}
+}
+
 void iommufd_device_destroy(struct iommufd_object *obj)
 {
 	struct iommufd_device *idev =
 		container_of(obj, struct iommufd_device, obj);
 
+	iommufd_device_remove_vdev(idev);
 	iommu_device_release_dma_owner(idev->dev);
 	iommufd_put_group(idev->igroup);
 	if (!iommufd_selftest_is_mock_dev(idev->dev))
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index fbc9ef78d81f..f58aa4439c53 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -446,6 +446,7 @@ struct iommufd_device {
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;
+	struct iommufd_vdevice *vdev;
 };
 
 static inline struct iommufd_device *
@@ -621,6 +622,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
+void iommufd_vdevice_abort(struct iommufd_object *obj);
 
 struct iommufd_vdevice {
 	struct iommufd_object obj;
@@ -628,8 +630,17 @@ struct iommufd_vdevice {
 	struct iommufd_viommu *viommu;
 	struct device *dev;
 	u64 id; /* per-vIOMMU virtual ID */
+	struct iommufd_device *idev;
 };
 
+static inline struct iommufd_vdevice *
+iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
+{
+	return container_of(iommufd_get_object(ictx, id,
+					       IOMMUFD_OBJ_VDEVICE),
+			    struct iommufd_vdevice, obj);
+}
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 5fd75aba068b..3f955a123095 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -531,6 +531,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	},
 	[IOMMUFD_OBJ_VDEVICE] = {
 		.destroy = iommufd_vdevice_destroy,
+		.abort = iommufd_vdevice_abort,
 	},
 	[IOMMUFD_OBJ_VEVENTQ] = {
 		.destroy = iommufd_veventq_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 4577b88c8560..9b062e651ea5 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -84,16 +84,31 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	return rc;
 }
 
-void iommufd_vdevice_destroy(struct iommufd_object *obj)
+void iommufd_vdevice_abort(struct iommufd_object *obj)
 {
 	struct iommufd_vdevice *vdev =
 		container_of(obj, struct iommufd_vdevice, obj);
 	struct iommufd_viommu *viommu = vdev->viommu;
+	struct iommufd_device *idev = vdev->idev;
+
+	lockdep_assert_held(&idev->igroup->lock);
 
 	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
 	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
 	refcount_dec(&viommu->obj.users);
 	put_device(vdev->dev);
+	idev->vdev = NULL;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *vdev =
+		container_of(obj, struct iommufd_vdevice, obj);
+
+	mutex_lock(&vdev->idev->igroup->lock);
+	iommufd_vdevice_abort(obj);
+	mutex_unlock(&vdev->idev->igroup->lock);
+	wake_up_interruptible_all(&vdev->ictx->destroy_wait);
 }
 
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -124,18 +139,28 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
+	mutex_lock(&idev->igroup->lock);
+	if (idev->vdev) {
+		rc = -EEXIST;
+		goto out_unlock_igroup;
+	}
+
 	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
 	if (IS_ERR(vdev)) {
 		rc = PTR_ERR(vdev);
-		goto out_put_idev;
+		goto out_unlock_igroup;
 	}
 
+	/* vdev can't outlive idev, vdev->idev is always valid, need no refcnt */
+	vdev->idev = idev;
 	vdev->ictx = ucmd->ictx;
 	vdev->id = virt_id;
 	vdev->dev = idev->dev;
 	get_device(idev->dev);
 	vdev->viommu = viommu;
 	refcount_inc(&viommu->obj.users);
+	/* idev->vdev is protected by idev->igroup->lock, need no refcnt */
+	idev->vdev = vdev;
 
 	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
 	if (curr) {
@@ -148,10 +173,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	if (rc)
 		goto out_abort;
 	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
-	goto out_put_idev;
+	goto out_unlock_igroup;
 
 out_abort:
 	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_unlock_igroup:
+	mutex_unlock(&idev->igroup->lock);
 out_put_idev:
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
-- 
2.25.1
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Aneesh Kumar K.V 3 months, 2 weeks ago
Xu Yilun <yilun.xu@linux.intel.com> writes:

> Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
> vdev can't outlive idev.
>
> iommufd_device(idev) represents the physical device bound to iommufd,
> while the iommufd_vdevice(vdev) represents the virtual instance of the
> physical device in the VM. The lifecycle of the vdev should not be
> longer than idev. This doesn't cause real problem on existing use cases
> cause vdev doesn't impact the physical device, only provides
> virtualization information. But to extend vdev for Confidential
> Computing(CC), there are needs to do secure configuration for the vdev,
> e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> destroy, or the external driver(VFIO) functionality may be impact.
>
> Building the association between idev & vdev requires the two objects
> pointing each other, but not referencing each other. This requires
> proper locking. This is done by reviving some of Nicolin's patch [1].
>
> There are 3 cases on idev destroy:
>
>   1. vdev is already destroyed by userspace. No extra handling needed.
>   2. vdev is still alive. Use iommufd_object_tombstone_user() to
>      destroy vdev and tombstone the vdev ID.
>   3. vdev is being destroyed by userspace. The vdev ID is already freed,
>      but vdev destroy handler is not complete. The idev destroy handler
>      should wait for vdev destroy completion.
>
> [1]: https://lore.kernel.org/all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/
>
> Original-by: Nicolin Chen <nicolinc@nvidia.com>
> Original-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>

This is the latest version I have. But as Jason suggested we can
possibly switch to short term users to avoid parallel destroy returning
EBUSY. I am using a mutex lock to block parallel vdevice destroy.


diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 86244403b532..0bee1b3eadc6 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -221,6 +221,8 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 	refcount_inc(&idev->obj.users);
 	/* igroup refcount moves into iommufd_device */
 	idev->igroup = igroup;
+	idev->vdev   = NULL;
+	mutex_init(&idev->lock);
 
 	/*
 	 * If the caller fails after this success it must call
@@ -286,6 +288,23 @@ void iommufd_device_unbind(struct iommufd_device *idev)
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, "IOMMUFD");
 
+void iommufd_device_tombstone_vdevice(struct iommufd_device *idev)
+{
+	guard(mutex)(&idev->lock);
+
+	if (idev->vdev) {
+		struct iommufd_vdevice *vdev = idev->vdev;
+		/*
+		 * We want to wait for shortterm users, so we need
+		 * to pass the obj which requires an elevated refcount.
+		 */
+		refcount_inc(&vdev->obj.users);
+		iommufd_object_remove(idev->ictx, &vdev->obj, vdev->obj.id,
+				      REMOVE_OBJ_TOMBSTONE | REMOVE_WAIT_SHORTTERM);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_tombstone_vdevice, "IOMMUFD");
+
 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
 {
 	return idev->ictx;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 9ccc83341f32..960f79c31145 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -187,8 +187,10 @@ void iommufd_object_finalize(struct iommufd_ctx *ictx,
 			     struct iommufd_object *obj);
 
 enum {
-	REMOVE_WAIT_SHORTTERM = 1,
+	REMOVE_WAIT_SHORTTERM = BIT(0),
+	REMOVE_OBJ_TOMBSTONE = BIT(1),
 };
+
 int iommufd_object_remove(struct iommufd_ctx *ictx,
 			  struct iommufd_object *to_destroy, u32 id,
 			  unsigned int flags);
@@ -199,7 +201,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
  * will be holding one.
  */
 static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx,
-					       struct iommufd_object *obj)
+						 struct iommufd_object *obj)
 {
 	int ret;
 
@@ -425,6 +427,10 @@ struct iommufd_device {
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;
+	/* to protect the following members*/
+	struct mutex lock;
+	/* if there is a vdevice mapping the idev */
+	struct iommufd_vdevice *vdev;
 };
 
 static inline struct iommufd_device *
@@ -606,6 +612,7 @@ struct iommufd_vdevice {
 	struct iommufd_ctx *ictx;
 	struct iommufd_viommu *viommu;
 	struct device *dev;
+	struct iommufd_device *idev;
 	u64 id; /* per-vIOMMU virtual ID */
 };
 
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 3df468f64e7d..fd82140e6320 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -81,14 +81,16 @@ void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
 struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
 					  enum iommufd_object_type type)
 {
+	XA_STATE(xas, &ictx->objects, id);
 	struct iommufd_object *obj;
 
 	if (iommufd_should_fail())
 		return ERR_PTR(-ENOENT);
 
 	xa_lock(&ictx->objects);
-	obj = xa_load(&ictx->objects, id);
-	if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
+	obj = xas_load(&xas);
+	if (!obj || xa_is_zero(obj) ||
+	    (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
 	    !iommufd_lock_obj(obj))
 		obj = ERR_PTR(-ENOENT);
 	xa_unlock(&ictx->objects);
@@ -167,7 +169,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 		goto err_xa;
 	}
 
-	xas_store(&xas, NULL);
+	xas_store(&xas, (flags & REMOVE_OBJ_TOMBSTONE) ? XA_ZERO_ENTRY : NULL);
 	if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
 		ictx->vfio_ioas = NULL;
 	xa_unlock(&ictx->objects);
@@ -199,9 +201,30 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 
 static int iommufd_destroy(struct iommufd_ucmd *ucmd)
 {
+	int ret;
 	struct iommu_destroy *cmd = ucmd->cmd;
+	struct iommufd_object *obj;
+	struct iommufd_device *idev = NULL;
+
+	obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY);
+	/* Destroying vdevice requires an idevice lock */
+	if (!IS_ERR(obj) && obj->type == IOMMUFD_OBJ_VDEVICE) {
+		struct iommufd_vdevice *vdev =
+			container_of(obj, struct iommufd_vdevice, obj);
+		/*
+		 * since vdev have an refcount on idev, this is safe.
+		 */
+		idev = vdev->idev;
+		mutex_lock(&idev->lock);
+		/* drop the additonal reference taken above */
+		iommufd_put_object(ucmd->ictx, obj);
+	}
+
+	ret = iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
+	if (idev)
+		mutex_unlock(&idev->lock);
 
-	return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
+	return ret;
 }
 
 static int iommufd_fops_open(struct inode *inode, struct file *filp)
@@ -233,6 +256,16 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/*
+ * We don't need additional locks because the iommufd_fops_release() function is
+ * only triggered when the iommufd descriptor is released. At that point, no
+ * other iommufd-based ioctl operations can be running concurrently.
+ *
+ * The destruction of the vdevice via idevice unbind is also safe:
+ * iommufd_fops_release() can only be called after the idevice has been unbound.
+ * The idevice bind operation takes a reference to the iommufd descriptor,
+ * preventing early release.
+ */
 static int iommufd_fops_release(struct inode *inode, struct file *filp)
 {
 	struct iommufd_ctx *ictx = filp->private_data;
@@ -251,16 +284,22 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	 */
 	while (!xa_empty(&ictx->objects)) {
 		unsigned int destroyed = 0;
-		unsigned long index;
+		XA_STATE(xas, &ictx->objects, 0);
+
+		xas_for_each(&xas, obj, ULONG_MAX) {
+
+			if (!xa_is_zero(obj)) {
+				if (!refcount_dec_if_one(&obj->users))
+					continue;
+
+				iommufd_object_ops[obj->type].destroy(obj);
+				kfree(obj);
+			}
 
-		xa_for_each(&ictx->objects, index, obj) {
-			if (!refcount_dec_if_one(&obj->users))
-				continue;
 			destroyed++;
-			xa_erase(&ictx->objects, index);
-			iommufd_object_ops[obj->type].destroy(obj);
-			kfree(obj);
+			xas_store(&xas, NULL);
 		}
+
 		/* Bug related to users refcount */
 		if (WARN_ON(!destroyed))
 			break;
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 01df2b985f02..b3a5f505c7ed 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -89,10 +89,18 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj)
 	struct iommufd_vdevice *vdev =
 		container_of(obj, struct iommufd_vdevice, obj);
 	struct iommufd_viommu *viommu = vdev->viommu;
+	struct iommufd_device *idev = vdev->idev;
+
+	/*
+	 * since we have an refcount on idev, it can't be freed.
+	 */
+	lockdep_assert_held(&idev->lock);
 
 	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
 	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
 	refcount_dec(&viommu->obj.users);
+	idev->vdev = NULL;
+	refcount_dec(&idev->obj.users);
 	put_device(vdev->dev);
 }
 
@@ -124,10 +132,15 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
+	mutex_lock(&idev->lock);
+	if (idev->vdev) {
+		rc = -EINVAL;
+		goto out_put_idev_unlock;
+	}
 	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
 	if (IS_ERR(vdev)) {
 		rc = PTR_ERR(vdev);
-		goto out_put_idev;
+		goto out_put_idev_unlock;
 	}
 
 	vdev->id = virt_id;
@@ -147,10 +160,17 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	if (rc)
 		goto out_abort;
 	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
-	goto out_put_idev;
+	/* don't allow idev free without vdev free */
+	refcount_inc(&idev->obj.users);
+	vdev->idev = idev;
+	/* vdev lifecycle now managed by idev */
+	idev->vdev = vdev;
+	goto out_put_idev_unlock;
 
 out_abort:
 	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_put_idev_unlock:
+	mutex_unlock(&idev->lock);
 out_put_idev:
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 6328c3a05bcd..0bf4f8b7f8d2 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -694,6 +694,12 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 #if IS_ENABLED(CONFIG_EEH)
 	eeh_dev_release(vdev->pdev);
 #endif
+
+	/* destroy vdevice which involves tsm unbind before we disable pci disable */
+	if (core_vdev->iommufd_device)
+		iommufd_device_tombstone_vdevice(core_vdev->iommufd_device);
+
+	/* tsm unbind should happen before this */
 	vfio_pci_core_disable(vdev);
 
 	mutex_lock(&vdev->igate);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 34b6e6ca4bfa..8de3d903bfc0 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -63,6 +63,7 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid);
 
 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev);
 u32 iommufd_device_to_id(struct iommufd_device *idev);
+void iommufd_device_tombstone_vdevice(struct iommufd_device *idev);
 
 struct iommufd_access_ops {
 	u8 needs_pin_pages : 1;
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
On Wed, Jun 25, 2025 at 12:10:28PM +0530, Aneesh Kumar K.V wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
> 
> > Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
> > vdev can't outlive idev.
> >
> > iommufd_device(idev) represents the physical device bound to iommufd,
> > while the iommufd_vdevice(vdev) represents the virtual instance of the
> > physical device in the VM. The lifecycle of the vdev should not be
> > longer than idev. This doesn't cause real problem on existing use cases
> > cause vdev doesn't impact the physical device, only provides
> > virtualization information. But to extend vdev for Confidential
> > Computing(CC), there are needs to do secure configuration for the vdev,
> > e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> > destroy, or the external driver(VFIO) functionality may be impact.
> >
> > Building the association between idev & vdev requires the two objects
> > pointing each other, but not referencing each other. This requires
> > proper locking. This is done by reviving some of Nicolin's patch [1].
> >
> > There are 3 cases on idev destroy:
> >
> >   1. vdev is already destroyed by userspace. No extra handling needed.
> >   2. vdev is still alive. Use iommufd_object_tombstone_user() to
> >      destroy vdev and tombstone the vdev ID.
> >   3. vdev is being destroyed by userspace. The vdev ID is already freed,
> >      but vdev destroy handler is not complete. The idev destroy handler
> >      should wait for vdev destroy completion.
> >
> > [1]: https://lore.kernel.org/all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/
> >
> > Original-by: Nicolin Chen <nicolinc@nvidia.com>
> > Original-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> 
> This is the latest version I have. But as Jason suggested we can
> possibly switch to short term users to avoid parallel destroy returning
> EBUSY.

We can discuss in that thread, I have the same question as Kevin.

> I am using a mutex lock to block parallel vdevice destroy.

I don't find reason to add a new lock rather than reuse the
idev->igroup->lock, which is already reviewed in Nicolin's series.

[...]

> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 3df468f64e7d..fd82140e6320 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -81,14 +81,16 @@ void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
>  struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
>  					  enum iommufd_object_type type)
>  {
> +	XA_STATE(xas, &ictx->objects, id);
>  	struct iommufd_object *obj;
>  
>  	if (iommufd_should_fail())
>  		return ERR_PTR(-ENOENT);
>  
>  	xa_lock(&ictx->objects);
> -	obj = xa_load(&ictx->objects, id);
> -	if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
> +	obj = xas_load(&xas);
> +	if (!obj || xa_is_zero(obj) ||

xas_load() & filter out zero entries, then what's the difference from
xa_load()?

> +	    (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
>  	    !iommufd_lock_obj(obj))
>  		obj = ERR_PTR(-ENOENT);
>  	xa_unlock(&ictx->objects);

[...]

>  static int iommufd_destroy(struct iommufd_ucmd *ucmd)
>  {
> +	int ret;
>  	struct iommu_destroy *cmd = ucmd->cmd;
> +	struct iommufd_object *obj;
> +	struct iommufd_device *idev = NULL;
> +
> +	obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY);
> +	/* Destroying vdevice requires an idevice lock */
> +	if (!IS_ERR(obj) && obj->type == IOMMUFD_OBJ_VDEVICE) {
> +		struct iommufd_vdevice *vdev =
> +			container_of(obj, struct iommufd_vdevice, obj);
> +		/*
> +		 * since vdev have an refcount on idev, this is safe.
> +		 */
> +		idev = vdev->idev;
> +		mutex_lock(&idev->lock);
> +		/* drop the additonal reference taken above */
> +		iommufd_put_object(ucmd->ictx, obj);
> +	}
> +
> +	ret = iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
> +	if (idev)
> +		mutex_unlock(&idev->lock);

I'm trying best not to add vdev/idev specific things to generic
code. I also don't prefer adding a idev specific lock around generic
object remove flow. That makes idev/vdev way to special. So for these
locking things, I prefer more to Nicolin's v5 and revives them.

>  
> -	return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
> +	return ret;
>  }
>  

[...]

> @@ -147,10 +160,17 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
>  	if (rc)
>  		goto out_abort;
>  	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
> -	goto out_put_idev;
> +	/* don't allow idev free without vdev free */
> +	refcount_inc(&idev->obj.users);

IIRC, it has been suggested no long term refcount to kernel created
object. Besides, you actually disallow nothing.

> +	vdev->idev = idev;
> +	/* vdev lifecycle now managed by idev */
> +	idev->vdev = vdev;
> +	goto out_put_idev_unlock;
>  
>  out_abort:
>  	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
> +out_put_idev_unlock:
> +	mutex_unlock(&idev->lock);
>  out_put_idev:
>  	iommufd_put_object(ucmd->ictx, &idev->obj);
>  out_put_viommu:
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 6328c3a05bcd..0bf4f8b7f8d2 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -694,6 +694,12 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
>  #if IS_ENABLED(CONFIG_EEH)
>  	eeh_dev_release(vdev->pdev);
>  #endif
> +
> +	/* destroy vdevice which involves tsm unbind before we disable pci disable */
> +	if (core_vdev->iommufd_device)
> +		iommufd_device_tombstone_vdevice(core_vdev->iommufd_device);

Ah.. I think all our effort to destroy vdevice on idevice destruction
is to hide the sequence details from VFIO.

> +
> +	/* tsm unbind should happen before this */
>  	vfio_pci_core_disable(vdev);

I did mention there is still issue when vdevice lifecycle problem is
solved. That VFIO for now do device disable configurations before
iommufd_device_unbind() and cause problem. But my quick idea would
be (not tested):

@@ -544,12 +544,14 @@ static void vfio_df_device_last_close(struct vfio_device_file *df)

        lockdep_assert_held(&device->dev_set->lock);

-       if (device->ops->close_device)
-               device->ops->close_device(device);
        if (iommufd)
                vfio_df_iommufd_unbind(df);
        else
                vfio_device_group_unuse_iommu(device);
+
+       if (device->ops->close_device)
+               device->ops->close_device(device);

>  
>  	mutex_lock(&vdev->igate);
> diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
> index 34b6e6ca4bfa..8de3d903bfc0 100644
> --- a/include/linux/iommufd.h
> +++ b/include/linux/iommufd.h
> @@ -63,6 +63,7 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid);
>  
>  struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev);
>  u32 iommufd_device_to_id(struct iommufd_device *idev);
> +void iommufd_device_tombstone_vdevice(struct iommufd_device *idev);
>  
>  struct iommufd_access_ops {
>  	u8 needs_pin_pages : 1;
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Mon, Jun 23, 2025 at 05:49:45PM +0800, Xu Yilun wrote:
> +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> +{
> +	bool vdev_removing = false;
> +
> +	mutex_lock(&idev->igroup->lock);
> +	if (idev->vdev) {
> +		struct iommufd_vdevice *vdev;
> +
> +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> +		if (IS_ERR(vdev)) {

This incrs obj.users which will cause a concurrent
iommufd_object_remove() to fail with -EBUSY, which we are trying to
avoid.

Also you can hit a race where the tombstone has NULL'd the entry but
the racing destroy will then load the NULL with xas_load() and hit this:

		if (WARN_ON(obj != to_destroy)) {

So, this doesn't look like it will work right to me..

You want somewhat different destroy logic:

/*
 * The caller must directly obtain a shortterm_users reference without a users
 * reference using its own locking to protect the pointer. This function always
 * puts back the shortterm_users reference.
 */
int iommufd_object_remove_tombstone(struct iommufd_ctx *ictx,
				    struct iommufd_object *to_destroy)
{
	XA_STATE(xas, &ictx->objects, to_destroy->id);
	struct iommufd_object *obj;
	int ret;

	xa_lock(&ictx->objects);
	obj = xas_load(&xas);
	if (xa_is_zero(obj) || obj == NULL) {
		/*
		 * Another thread is racing to destroy this, since we have the
		 * shortterm_users refcount the other thread has xa_unlocked()
		 * but not passed iommufd_object_dec_wait_shortterm().
		 */
		if (refcount_dec_and_test(&to_destroy->shortterm_users))
			wake_up_interruptible_all(&ictx->destroy_wait);
		ret = 0;
		goto err_xa;
	} else if (WARN_ON(obj != to_destroy)) {
		refcount_dec(&obj->shortterm_users);
		ret = -ENOENT;
		goto err_xa;
	}

	/*
	 * The object is still in the xarray, so this thread will try to destroy
	 * it. Put back the callers shortterm_users.
	 */
	refcount_dec(&obj->shortterm_users);

	if (!refcount_dec_if_one(&obj->users)) {
		ret = -EBUSY;
		goto err_xa;
	}

	/* Leave behind a tombstone to prevent re-use of this entry */
	xas_store(&xas, XA_ZERO_ENTRY);
	xa_unlock(&ictx->objects);

	/*
	 * Since users is zero any positive users_shortterm must be racing
	 * iommufd_put_object(), or we have a bug.
	 */
	ret = iommufd_object_dec_wait_shortterm(ictx, obj);
	if (WARN_ON(ret))
		return ret;

	iommufd_object_ops[obj->type].destroy(obj);
	kfree(obj);
	return 0;

err_xa:
	xa_unlock(&ictx->objects);

	/* The returned object reference count is zero */
	return ret;
}

Then you'd call it by doing something like:

static void iommufd_device_remove_vdev(struct iommufd_device *idev)
{
	struct iommufd_object *to_destroy = NULL;
	int ret;

	mutex_lock(&idev->igroup->lock);
	if (!idev->vdev) {
		mutex_unlock(&idev->igroup->lock);
		return;
	}
	if (refcount_inc_not_zero(&idev->vdev->obj.shortterm_users))
		to_destroy = &idev->vdev->obj;
	mutex_unlock(&idev->igroup->lock);

	if (to_destroy) {
		ret = iommufd_object_remove_tombstone(idev->ictx, to_destroy);
		if (WARN_ON(ret))
			return;
	}

	/*
	 * We don't know what thread is actually going to destroy the vdev, but
	 * once the vdev is destroyed the pointer is NULL'd. At this
	 * point idev->users is 0 so no other thread can set a new vdev.
	 */
	if (!wait_event_timeout(idev->ictx->destroy_wait,
				!READ_ONCE(idev->vdev),
				msecs_to_jiffies(60000)))
		pr_crit("Time out waiting for iommufd vdevice removed\n");
}

Though there is a cleaner option here, you could do:

	mutex_lock(&idev->igroup->lock);
	if (idev->vdev)
		iommufd_vdevice_abort(&idev->vdev->obj);
	mutex_unlock(&idev->igroup->lock);

And make it safe to call abort twice, eg by setting dev to NULL and
checking for that. First thread to get to the igroup lock, either via
iommufd_vdevice_destroy() or via the above will do the actual abort
synchronously without any wait_event_timeout. That seems better??

> +	/* vdev can't outlive idev, vdev->idev is always valid, need no refcnt */
> +	vdev->idev = idev;

So this means a soon as 'idev->vdev = NULL;' happens idev is an
invalid pointer. Need a WRITE_ONCE there.

I would rephrase the comment as 
 iommufd_device_destroy() waits until idev->vdev is NULL before
 freeing the idev, which only happens once the vdev is finished
 destruction. Thus we do not need refcounting on either idev->vdev or
 vdev->idev.

and group both assignments together.

>  	vdev->ictx = ucmd->ictx;
>  	vdev->id = virt_id;
>  	vdev->dev = idev->dev;
>  	get_device(idev->dev);
>  	vdev->viommu = viommu;
>  	refcount_inc(&viommu->obj.users);
> +	/* idev->vdev is protected by idev->igroup->lock, need no refcnt */
> +	idev->vdev = vdev;

This can be WRITE_ONCE too

Jason
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
On Tue, Jun 24, 2025 at 11:53:46AM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 23, 2025 at 05:49:45PM +0800, Xu Yilun wrote:
> > +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> > +{
> > +	bool vdev_removing = false;
> > +
> > +	mutex_lock(&idev->igroup->lock);
> > +	if (idev->vdev) {
> > +		struct iommufd_vdevice *vdev;
> > +
> > +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > +		if (IS_ERR(vdev)) {
> 
> This incrs obj.users which will cause a concurrent
> iommufd_object_remove() to fail with -EBUSY, which we are trying to
> avoid.

I have the same question as Kevin, leave this to that thread.

[...]

> 	/*
> 	 * We don't know what thread is actually going to destroy the vdev, but
> 	 * once the vdev is destroyed the pointer is NULL'd. At this
> 	 * point idev->users is 0 so no other thread can set a new vdev.
> 	 */
> 	if (!wait_event_timeout(idev->ictx->destroy_wait,
> 				!READ_ONCE(idev->vdev),
> 				msecs_to_jiffies(60000)))
> 		pr_crit("Time out waiting for iommufd vdevice removed\n");
> }
> 
> Though there is a cleaner option here, you could do:
> 
> 	mutex_lock(&idev->igroup->lock);
> 	if (idev->vdev)
> 		iommufd_vdevice_abort(&idev->vdev->obj);
> 	mutex_unlock(&idev->igroup->lock);
> 
> And make it safe to call abort twice, eg by setting dev to NULL and
> checking for that. First thread to get to the igroup lock, either via
> iommufd_vdevice_destroy() or via the above will do the actual abort
> synchronously without any wait_event_timeout. That seems better??

I'm good to both options, but slightly tend not to make vdevice so
special from other objects, so still prefer the wait_event option.

> 
> > +	/* vdev can't outlive idev, vdev->idev is always valid, need no refcnt */
> > +	vdev->idev = idev;
> 
> So this means a soon as 'idev->vdev = NULL;' happens idev is an
> invalid pointer. Need a WRITE_ONCE there.
> 
> I would rephrase the comment as 
>  iommufd_device_destroy() waits until idev->vdev is NULL before
>  freeing the idev, which only happens once the vdev is finished
>  destruction. Thus we do not need refcounting on either idev->vdev or
>  vdev->idev.
> 
> and group both assignments together.

Good to me.

Thanks,
Yilun
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Wed, Jun 25, 2025 at 06:06:00PM +0800, Xu Yilun wrote:
> > 	/*
> > 	 * We don't know what thread is actually going to destroy the vdev, but
> > 	 * once the vdev is destroyed the pointer is NULL'd. At this
> > 	 * point idev->users is 0 so no other thread can set a new vdev.
> > 	 */
> > 	if (!wait_event_timeout(idev->ictx->destroy_wait,
> > 				!READ_ONCE(idev->vdev),
> > 				msecs_to_jiffies(60000)))
> > 		pr_crit("Time out waiting for iommufd vdevice removed\n");
> > }
> > 
> > Though there is a cleaner option here, you could do:
> > 
> > 	mutex_lock(&idev->igroup->lock);
> > 	if (idev->vdev)
> > 		iommufd_vdevice_abort(&idev->vdev->obj);
> > 	mutex_unlock(&idev->igroup->lock);
> > 
> > And make it safe to call abort twice, eg by setting dev to NULL and
> > checking for that. First thread to get to the igroup lock, either via
> > iommufd_vdevice_destroy() or via the above will do the actual abort
> > synchronously without any wait_event_timeout. That seems better??
> 
> I'm good to both options, but slightly tend not to make vdevice so
> special from other objects, so still prefer the wait_event option.

The wait_event is a ugly hack though, even in its existing code. The
above version is better because it doesn't have any failure mode and
doesn't introduce any unlocked use of the idev->vdev which is easier
to reason about, no READ_ONCE/WRITE_ONCE/etc

It sounds like you should largely leave the existing other parts the
same as this v2, though can you try reorganize it to look a little
more like the version I shared?

Jason
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
On Wed, Jun 25, 2025 at 09:38:32AM -0300, Jason Gunthorpe wrote:
> On Wed, Jun 25, 2025 at 06:06:00PM +0800, Xu Yilun wrote:
> > > 	/*
> > > 	 * We don't know what thread is actually going to destroy the vdev, but
> > > 	 * once the vdev is destroyed the pointer is NULL'd. At this
> > > 	 * point idev->users is 0 so no other thread can set a new vdev.
> > > 	 */
> > > 	if (!wait_event_timeout(idev->ictx->destroy_wait,
> > > 				!READ_ONCE(idev->vdev),
> > > 				msecs_to_jiffies(60000)))
> > > 		pr_crit("Time out waiting for iommufd vdevice removed\n");
> > > }
> > > 
> > > Though there is a cleaner option here, you could do:
> > > 
> > > 	mutex_lock(&idev->igroup->lock);
> > > 	if (idev->vdev)
> > > 		iommufd_vdevice_abort(&idev->vdev->obj);
> > > 	mutex_unlock(&idev->igroup->lock);
> > > 
> > > And make it safe to call abort twice, eg by setting dev to NULL and
> > > checking for that. First thread to get to the igroup lock, either via
> > > iommufd_vdevice_destroy() or via the above will do the actual abort
> > > synchronously without any wait_event_timeout. That seems better??
> > 
> > I'm good to both options, but slightly tend not to make vdevice so
> > special from other objects, so still prefer the wait_event option.
> 
> The wait_event is a ugly hack though, even in its existing code. The
> above version is better because it doesn't have any failure mode and
> doesn't introduce any unlocked use of the idev->vdev which is easier
> to reason about, no READ_ONCE/WRITE_ONCE/etc
> 
> It sounds like you should largely leave the existing other parts the
> same as this v2, though can you try reorganize it to look a little
> more like the version I shared?

Sure. But may I confirm that your only want reentrant
iommufd_vdevice_abort() but not your iommufd_object_remove_tombstone()
changes?

To me, grab a shortterm_users but not a user is a new operation model. I
hesitate to add it when the existing refcount_inc(&obj->user) works for
this case.

Thanks,
Yilun

> 
> Jason
>
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Thu, Jun 26, 2025 at 11:31:06AM +0800, Xu Yilun wrote:
> > The wait_event is a ugly hack though, even in its existing code. The
> > above version is better because it doesn't have any failure mode and
> > doesn't introduce any unlocked use of the idev->vdev which is easier
> > to reason about, no READ_ONCE/WRITE_ONCE/etc
> > 
> > It sounds like you should largely leave the existing other parts the
> > same as this v2, though can you try reorganize it to look a little
> > more like the version I shared?
> 
> Sure. But may I confirm that your only want reentrant
> iommufd_vdevice_abort() but not your iommufd_object_remove_tombstone()
> changes?

I think take a look at how I organized the control flow in the patch I
sent and try to use some of those ideas, it was a bit simpler

> To me, grab a shortterm_users but not a user is a new operation model. I
> hesitate to add it when the existing refcount_inc(&obj->user) works for
> this case.

Yes, I am convinced you should not do this. Just hold the users only
and use the normal destroy with the XA_ZERO_ENTRY change

Along with the locked abort idea.

Jason
RE: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 2 weeks ago
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, June 24, 2025 10:54 PM
> 
> On Mon, Jun 23, 2025 at 05:49:45PM +0800, Xu Yilun wrote:
> > +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> > +{
> > +	bool vdev_removing = false;
> > +
> > +	mutex_lock(&idev->igroup->lock);
> > +	if (idev->vdev) {
> > +		struct iommufd_vdevice *vdev;
> > +
> > +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > +		if (IS_ERR(vdev)) {
> 
> This incrs obj.users which will cause a concurrent
> iommufd_object_remove() to fail with -EBUSY, which we are trying to
> avoid.

concurrent remove means a user-initiated IOMMU_DESTROY, for which 
failing with -EBUSY is expected as it doesn't wait for shortterm?

> 
> Also you can hit a race where the tombstone has NULL'd the entry but
> the racing destroy will then load the NULL with xas_load() and hit this:
> 
> 		if (WARN_ON(obj != to_destroy)) {

IOMMU_DESTROY doesn't provide to_destroy.

or are you concerned about the racing between two kernel-initiated
destroy paths? at least for vdev story this doesn't sound to be the
case...
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Tue, Jun 24, 2025 at 11:57:31PM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, June 24, 2025 10:54 PM
> > 
> > On Mon, Jun 23, 2025 at 05:49:45PM +0800, Xu Yilun wrote:
> > > +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> > > +{
> > > +	bool vdev_removing = false;
> > > +
> > > +	mutex_lock(&idev->igroup->lock);
> > > +	if (idev->vdev) {
> > > +		struct iommufd_vdevice *vdev;
> > > +
> > > +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > > +		if (IS_ERR(vdev)) {
> > 
> > This incrs obj.users which will cause a concurrent
> > iommufd_object_remove() to fail with -EBUSY, which we are trying to
> > avoid.
> 
> concurrent remove means a user-initiated IOMMU_DESTROY, for which 
> failing with -EBUSY is expected as it doesn't wait for shortterm?

Yes a user IOMMU_DESTROY of the vdevice should not have a transient
EBUSY failure. Avoiding that is the purpose of the shorterm_users
mechanism.

> > Also you can hit a race where the tombstone has NULL'd the entry but
> > the racing destroy will then load the NULL with xas_load() and hit this:
> > 
> > 		if (WARN_ON(obj != to_destroy)) {
> 
> IOMMU_DESTROY doesn't provide to_destroy.

Right, but IOMMU_DESTROY thread could have already gone past the
xa_store(NULL) and then the kernel destroy thread could reach the
above WARN as it does use to_destroy.

Jason
RE: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 2 weeks ago
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, June 25, 2025 9:36 AM
> 
> On Tue, Jun 24, 2025 at 11:57:31PM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, June 24, 2025 10:54 PM
> > >
> > > On Mon, Jun 23, 2025 at 05:49:45PM +0800, Xu Yilun wrote:
> > > > +static void iommufd_device_remove_vdev(struct iommufd_device
> *idev)
> > > > +{
> > > > +	bool vdev_removing = false;
> > > > +
> > > > +	mutex_lock(&idev->igroup->lock);
> > > > +	if (idev->vdev) {
> > > > +		struct iommufd_vdevice *vdev;
> > > > +
> > > > +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > > > +		if (IS_ERR(vdev)) {
> > >
> > > This incrs obj.users which will cause a concurrent
> > > iommufd_object_remove() to fail with -EBUSY, which we are trying to
> > > avoid.
> >
> > concurrent remove means a user-initiated IOMMU_DESTROY, for which
> > failing with -EBUSY is expected as it doesn't wait for shortterm?
> 
> Yes a user IOMMU_DESTROY of the vdevice should not have a transient
> EBUSY failure. Avoiding that is the purpose of the shorterm_users
> mechanism.

hmm my understanding is the opposite.

currently iommufd_destroy() doesn't set REMOVE_WAIT_SHORTTERM:

static int iommufd_destroy(struct iommufd_ucmd *ucmd)
{
	struct iommu_destroy *cmd = ucmd->cmd;

	return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0);
}

so it's natural for IOMMU_DESTROY to hit transient EBUSY when a parallel
ioctl is being executed on the destroyed object:

	if (!refcount_dec_if_one(&obj->users)) {
		ret = -EBUSY;
		goto err_xa;
	}

idevice unbind is just a similar (but indirect) transient race to 
IOMMU_DESTROY.

waiting shorterm_users is more for kernel destroy.

> 
> > > Also you can hit a race where the tombstone has NULL'd the entry but
> > > the racing destroy will then load the NULL with xas_load() and hit this:
> > >
> > > 		if (WARN_ON(obj != to_destroy)) {
> >
> > IOMMU_DESTROY doesn't provide to_destroy.
> 
> Right, but IOMMU_DESTROY thread could have already gone past the
> xa_store(NULL) and then the kernel destroy thread could reach the
> above WARN as it does use to_destroy.
> 

If IOMMU_DESTROY have already gone past xa_store(NULL), there are
two scenarios:

1) vdevice has been completely destroyed with idev->vdev=NULL.

In such case iommufd_device_remove_vdev() is nop.

2) vdevice destroy has not been completed with idev->vdev still valid

In such case iommufd_get_vdevice() fails with vdev_removing set.

Then iommufd_device_remove_vdev() will wait on idev->vdev to
be NULL instead of calling iommufd_object_tombstone_user().

so the said race won't happen. 😊
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Wed, Jun 25, 2025 at 02:11:40AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, June 25, 2025 9:36 AM
> > 
> > On Tue, Jun 24, 2025 at 11:57:31PM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Tuesday, June 24, 2025 10:54 PM
> > > >
> > > > On Mon, Jun 23, 2025 at 05:49:45PM +0800, Xu Yilun wrote:
> > > > > +static void iommufd_device_remove_vdev(struct iommufd_device
> > *idev)
> > > > > +{
> > > > > +	bool vdev_removing = false;
> > > > > +
> > > > > +	mutex_lock(&idev->igroup->lock);
> > > > > +	if (idev->vdev) {
> > > > > +		struct iommufd_vdevice *vdev;
> > > > > +
> > > > > +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > > > > +		if (IS_ERR(vdev)) {
> > > >
> > > > This incrs obj.users which will cause a concurrent
> > > > iommufd_object_remove() to fail with -EBUSY, which we are trying to
> > > > avoid.
> > >
> > > concurrent remove means a user-initiated IOMMU_DESTROY, for which
> > > failing with -EBUSY is expected as it doesn't wait for shortterm?
> > 
> > Yes a user IOMMU_DESTROY of the vdevice should not have a transient
> > EBUSY failure. Avoiding that is the purpose of the shorterm_users
> > mechanism.
> 
> hmm my understanding is the opposite.
> 
> currently iommufd_destroy() doesn't set REMOVE_WAIT_SHORTTERM:

Oh yes I got them mixed up.

> waiting shorterm_users is more for kernel destroy.

Yes, it is to make kernel destroy not fail

> Then iommufd_device_remove_vdev() will wait on idev->vdev to
> be NULL instead of calling iommufd_object_tombstone_user().

Ah, because the original version incrs users, not just
shortterm_users.

Jason
RE: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 2 weeks ago
> From: Xu Yilun <yilun.xu@linux.intel.com>
> Sent: Monday, June 23, 2025 5:50 PM
> 
> +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> +{
> +	bool vdev_removing = false;
> +
> +	mutex_lock(&idev->igroup->lock);
> +	if (idev->vdev) {
> +		struct iommufd_vdevice *vdev;
> +
> +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> +		if (IS_ERR(vdev)) {
> +			/* vdev is removed from xarray, but is not
> destroyed/freed */
> +			vdev_removing = true;
> +			goto unlock;
> +		}
> +
> +		/* Should never happen */
> +		if (WARN_ON(vdev != idev->vdev)) {
> +			idev->vdev = NULL;
> +			iommufd_put_object(idev->ictx, &vdev->obj);
> +			goto unlock;
> +		}
> +
> +		/*
> +		 * vdev cannot be destroyed after refcount_inc, safe to
> release

"vdev cannot be destroyed by userspace"

> +		 * idev->igroup->lock and use idev->vdev afterward.
> +		 */
> +		refcount_inc(&idev->vdev->obj.users);
> +		iommufd_put_object(idev->ictx, &idev->vdev->obj);

s/idev->vdev/vdev/

> @@ -124,18 +139,28 @@ int iommufd_vdevice_alloc_ioctl(struct
> iommufd_ucmd *ucmd)
>  		goto out_put_idev;
>  	}
> 
> +	mutex_lock(&idev->igroup->lock);
> +	if (idev->vdev) {
> +		rc = -EEXIST;
> +		goto out_unlock_igroup;
> +	}
> +
>  	vdev = iommufd_object_alloc(ucmd->ictx, vdev,
> IOMMUFD_OBJ_VDEVICE);
>  	if (IS_ERR(vdev)) {
>  		rc = PTR_ERR(vdev);
> -		goto out_put_idev;
> +		goto out_unlock_igroup;
>  	}
> 
> +	/* vdev can't outlive idev, vdev->idev is always valid, need no refcnt
> */
> +	vdev->idev = idev;
>  	vdev->ictx = ucmd->ictx;
>  	vdev->id = virt_id;
>  	vdev->dev = idev->dev;
>  	get_device(idev->dev);

this is not necessary now, as idevice already holds a reference to device
and now vdevice cannot outlive idevice.
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
On Tue, Jun 24, 2025 at 08:22:11AM +0000, Tian, Kevin wrote:
> > From: Xu Yilun <yilun.xu@linux.intel.com>
> > Sent: Monday, June 23, 2025 5:50 PM
> > 
> > +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> > +{
> > +	bool vdev_removing = false;
> > +
> > +	mutex_lock(&idev->igroup->lock);
> > +	if (idev->vdev) {
> > +		struct iommufd_vdevice *vdev;
> > +
> > +		vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > +		if (IS_ERR(vdev)) {
> > +			/* vdev is removed from xarray, but is not
> > destroyed/freed */
> > +			vdev_removing = true;
> > +			goto unlock;
> > +		}
> > +
> > +		/* Should never happen */
> > +		if (WARN_ON(vdev != idev->vdev)) {
> > +			idev->vdev = NULL;
> > +			iommufd_put_object(idev->ictx, &vdev->obj);
> > +			goto unlock;
> > +		}
> > +
> > +		/*
> > +		 * vdev cannot be destroyed after refcount_inc, safe to
> > release
> 
> "vdev cannot be destroyed by userspace"
> 
> > +		 * idev->igroup->lock and use idev->vdev afterward.
> > +		 */
> > +		refcount_inc(&idev->vdev->obj.users);
> > +		iommufd_put_object(idev->ictx, &idev->vdev->obj);
> 
> s/idev->vdev/vdev/

Will adopt these fixes.

> 
> > @@ -124,18 +139,28 @@ int iommufd_vdevice_alloc_ioctl(struct
> > iommufd_ucmd *ucmd)
> >  		goto out_put_idev;
> >  	}
> > 
> > +	mutex_lock(&idev->igroup->lock);
> > +	if (idev->vdev) {
> > +		rc = -EEXIST;
> > +		goto out_unlock_igroup;
> > +	}
> > +
> >  	vdev = iommufd_object_alloc(ucmd->ictx, vdev,
> > IOMMUFD_OBJ_VDEVICE);
> >  	if (IS_ERR(vdev)) {
> >  		rc = PTR_ERR(vdev);
> > -		goto out_put_idev;
> > +		goto out_unlock_igroup;
> >  	}
> > 
> > +	/* vdev can't outlive idev, vdev->idev is always valid, need no refcnt
> > */
> > +	vdev->idev = idev;
> >  	vdev->ictx = ucmd->ictx;
> >  	vdev->id = virt_id;
> >  	vdev->dev = idev->dev;
> >  	get_device(idev->dev);
> 
> this is not necessary now, as idevice already holds a reference to device
> and now vdevice cannot outlive idevice.

I agree.

Besides, Jason suggests use vdev->dev for iommufd_vdevice_abort() safe
reentrancy. I think we could change to use vdev->viommu.

@@ -93,10 +93,17 @@ void iommufd_vdevice_abort(struct iommufd_object *obj)

        lockdep_assert_held(&idev->igroup->lock);

+       /*
+        * iommufd_vdevice_abort() could be reentrant, by
+        * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once.
+        */
+       if (!viommu)
+               return;
+
        /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
        xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
        refcount_dec(&viommu->obj.users);
-       put_device(vdev->dev);
+       vdev->viommu = NULL;
        idev->vdev = NULL;

Thanks,
Yilun
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
> +void iommufd_vdevice_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_vdevice *vdev =
> +		container_of(obj, struct iommufd_vdevice, obj);
> +
> +	mutex_lock(&vdev->idev->igroup->lock);
> +	iommufd_vdevice_abort(obj);
> +	mutex_unlock(&vdev->idev->igroup->lock);
> +	wake_up_interruptible_all(&vdev->ictx->destroy_wait);

Should change to

        wake_up_interruptible_all(&vdev->viommu->ictx->destroy_wait);

since vdev->ictx will be deleted.

Thanks,
Yilun
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Baolu Lu 3 months, 2 weeks ago
On 6/23/25 17:49, Xu Yilun wrote:
> Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
> vdev can't outlive idev.
> 
> iommufd_device(idev) represents the physical device bound to iommufd,
> while the iommufd_vdevice(vdev) represents the virtual instance of the
> physical device in the VM. The lifecycle of the vdev should not be
> longer than idev. This doesn't cause real problem on existing use cases
> cause vdev doesn't impact the physical device, only provides
> virtualization information. But to extend vdev for Confidential
> Computing(CC), there are needs to do secure configuration for the vdev,
> e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> destroy, or the external driver(VFIO) functionality may be impact.
> 
> Building the association between idev & vdev requires the two objects
> pointing each other, but not referencing each other.

Does this mean each idevice can have at most a single vdevice? Is it
possible that different PASIDs of a physical device are assigned to
userspace for different purposes, such that there is a need for multiple
vdevices per idevice?

Thanks,
baolu
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 2 weeks ago
On Tue, Jun 24, 2025 at 11:32:01AM +0800, Baolu Lu wrote:
> On 6/23/25 17:49, Xu Yilun wrote:
> > Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
> > vdev can't outlive idev.
> > 
> > iommufd_device(idev) represents the physical device bound to iommufd,
> > while the iommufd_vdevice(vdev) represents the virtual instance of the
> > physical device in the VM. The lifecycle of the vdev should not be
> > longer than idev. This doesn't cause real problem on existing use cases
> > cause vdev doesn't impact the physical device, only provides
> > virtualization information. But to extend vdev for Confidential
> > Computing(CC), there are needs to do secure configuration for the vdev,
> > e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> > destroy, or the external driver(VFIO) functionality may be impact.
> > 
> > Building the association between idev & vdev requires the two objects
> > pointing each other, but not referencing each other.
> 
> Does this mean each idevice can have at most a single vdevice? Is it

Yes, I got this idea from here.

https://lore.kernel.org/all/20250604132403.GJ5028@nvidia.com/

> possible that different PASIDs of a physical device are assigned to
> userspace for different purposes, such that there is a need for multiple
> vdevices per idevice?

Mm.. I don't have a clear idea how SIOV assignment work with iommufd,
may come back later.

Thanks,
Yilun
RE: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 2 weeks ago
> From: Xu Yilun <yilun.xu@linux.intel.com>
> Sent: Tuesday, June 24, 2025 4:12 PM
> 
> On Tue, Jun 24, 2025 at 11:32:01AM +0800, Baolu Lu wrote:
> > On 6/23/25 17:49, Xu Yilun wrote:
> > > Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so
> that
> > > vdev can't outlive idev.
> > >
> > > iommufd_device(idev) represents the physical device bound to iommufd,
> > > while the iommufd_vdevice(vdev) represents the virtual instance of the
> > > physical device in the VM. The lifecycle of the vdev should not be
> > > longer than idev. This doesn't cause real problem on existing use cases
> > > cause vdev doesn't impact the physical device, only provides
> > > virtualization information. But to extend vdev for Confidential
> > > Computing(CC), there are needs to do secure configuration for the vdev,
> > > e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> > > destroy, or the external driver(VFIO) functionality may be impact.
> > >
> > > Building the association between idev & vdev requires the two objects
> > > pointing each other, but not referencing each other.
> >
> > Does this mean each idevice can have at most a single vdevice? Is it
> 
> Yes, I got this idea from here.
> 
> https://lore.kernel.org/all/20250604132403.GJ5028@nvidia.com/
> 
> > possible that different PASIDs of a physical device are assigned to
> > userspace for different purposes, such that there is a need for multiple
> > vdevices per idevice?
> 
> Mm.. I don't have a clear idea how SIOV assignment work with iommufd,
> may come back later.
> 

Let's put SIOV out of scope here. It's not supported yet. there are
other obstacles to be figured out (e.g. igroup etc.) when it comes to
the table. 
RE: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 2 weeks ago
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Tuesday, June 24, 2025 11:32 AM
> 
> On 6/23/25 17:49, Xu Yilun wrote:
> > Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
> > vdev can't outlive idev.
> >
> > iommufd_device(idev) represents the physical device bound to iommufd,
> > while the iommufd_vdevice(vdev) represents the virtual instance of the
> > physical device in the VM. The lifecycle of the vdev should not be
> > longer than idev. This doesn't cause real problem on existing use cases
> > cause vdev doesn't impact the physical device, only provides
> > virtualization information. But to extend vdev for Confidential
> > Computing(CC), there are needs to do secure configuration for the vdev,
> > e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> > destroy, or the external driver(VFIO) functionality may be impact.
> >
> > Building the association between idev & vdev requires the two objects
> > pointing each other, but not referencing each other.
> 
> Does this mean each idevice can have at most a single vdevice? Is it
> possible that different PASIDs of a physical device are assigned to
> userspace for different purposes, such that there is a need for multiple
> vdevices per idevice?
> 

PASID is a resource of physical device. If it's reported to a VM then
it becomes the resource of virtual device. 1:1 association makes
sense here.
Re: [PATCH v2 3/4] iommufd: Destroy vdevice on idevice destroy
Posted by Baolu Lu 3 months, 2 weeks ago
On 6/24/25 16:12, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Tuesday, June 24, 2025 11:32 AM
>>
>> On 6/23/25 17:49, Xu Yilun wrote:
>>> Destroy iommufd_vdevice(vdev) on iommufd_idevice(idev) destroy so that
>>> vdev can't outlive idev.
>>>
>>> iommufd_device(idev) represents the physical device bound to iommufd,
>>> while the iommufd_vdevice(vdev) represents the virtual instance of the
>>> physical device in the VM. The lifecycle of the vdev should not be
>>> longer than idev. This doesn't cause real problem on existing use cases
>>> cause vdev doesn't impact the physical device, only provides
>>> virtualization information. But to extend vdev for Confidential
>>> Computing(CC), there are needs to do secure configuration for the vdev,
>>> e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
>>> destroy, or the external driver(VFIO) functionality may be impact.
>>>
>>> Building the association between idev & vdev requires the two objects
>>> pointing each other, but not referencing each other.
>>
>> Does this mean each idevice can have at most a single vdevice? Is it
>> possible that different PASIDs of a physical device are assigned to
>> userspace for different purposes, such that there is a need for multiple
>> vdevices per idevice?
>>
> 
> PASID is a resource of physical device. If it's reported to a VM then
> it becomes the resource of virtual device. 1:1 association makes
> sense here.

Okay, make sense.