[PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy

Xu Yilun posted 5 patches 3 months, 1 week ago
There is a newer version of this series
[PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 1 week ago
Destroy iommufd_vdevice (vdev) on iommufd_idevice (idev) destroy so
that vdev can't outlive idev.

iommufd_device (idev) represents the physical device bound to iommufd,
while the iommufd_vdevice (vdev) represents the virtual instance of the
physical device in the VM. The lifecycle of the vdev should not be
longer than idev. This doesn't cause real problem on existing use cases
cause vdev doesn't impact the physical device, only provides
virtualization information. But to extend vdev for Confidential
Computing (CC), there are needs to do secure configuration for the vdev,
e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
destroy, or the external driver (VFIO) functionality may be impact.

Building the association between idev & vdev requires the two objects
pointing each other, but not referencing each other. This requires
proper locking. This is done by reviving some of Nicolin's patch [1].

There are 3 cases on idev destroy:

  1. vdev is already destroyed by userspace. No extra handling needed.
  2. vdev is still alive. Use iommufd_object_tombstone_user() to
     destroy vdev and tombstone the vdev ID.
  3. vdev is being destroyed by userspace. The vdev ID is already freed,
     but vdev destroy handler is not completed. Destroy the vdev
     immediately. To solve the racing with userspace destruction, make
     iommufd_vdevice_abort() reentrant.

[1]: https://lore.kernel.org/all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/

Originally-by: Nicolin Chen <nicolinc@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Co-developed-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 drivers/iommu/iommufd/device.c          | 42 +++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h | 11 +++++++
 drivers/iommu/iommufd/main.c            |  1 +
 drivers/iommu/iommufd/viommu.c          | 44 +++++++++++++++++++++++--
 4 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 86244403b532..0937d4989185 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -137,11 +137,53 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
 	}
 }
 
+static void iommufd_device_remove_vdev(struct iommufd_device *idev)
+{
+	struct iommufd_vdevice *vdev;
+
+	mutex_lock(&idev->igroup->lock);
+	/* vdev has been completely destroyed by userspace */
+	if (!idev->vdev)
+		goto out_unlock;
+
+	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
+	if (IS_ERR(vdev)) {
+		/*
+		 * vdev is removed from xarray by userspace, but is not
+		 * destroyed/freed. Since iommufd_vdevice_abort() is reentrant,
+		 * safe to destroy vdev here.
+		 */
+		iommufd_vdevice_abort(&idev->vdev->obj);
+		goto out_unlock;
+	}
+
+	/* Should never happen */
+	if (WARN_ON(vdev != idev->vdev)) {
+		iommufd_put_object(idev->ictx, &vdev->obj);
+		goto out_unlock;
+	}
+
+	/*
+	 * vdev is still alive. Hold a users refcount to prevent racing with
+	 * userspace destruction, then use iommufd_object_tombstone_user() to
+	 * destroy it and leave a tombstone.
+	 */
+	refcount_inc(&vdev->obj.users);
+	iommufd_put_object(idev->ictx, &vdev->obj);
+	mutex_unlock(&idev->igroup->lock);
+	iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
+	return;
+
+out_unlock:
+	mutex_unlock(&idev->igroup->lock);
+}
+
 void iommufd_device_destroy(struct iommufd_object *obj)
 {
 	struct iommufd_device *idev =
 		container_of(obj, struct iommufd_device, obj);
 
+	iommufd_device_remove_vdev(idev);
 	iommu_device_release_dma_owner(idev->dev);
 	iommufd_put_group(idev->igroup);
 	if (!iommufd_selftest_is_mock_dev(idev->dev))
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index fbc9ef78d81f..f58aa4439c53 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -446,6 +446,7 @@ struct iommufd_device {
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;
+	struct iommufd_vdevice *vdev;
 };
 
 static inline struct iommufd_device *
@@ -621,6 +622,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
+void iommufd_vdevice_abort(struct iommufd_object *obj);
 
 struct iommufd_vdevice {
 	struct iommufd_object obj;
@@ -628,8 +630,17 @@ struct iommufd_vdevice {
 	struct iommufd_viommu *viommu;
 	struct device *dev;
 	u64 id; /* per-vIOMMU virtual ID */
+	struct iommufd_device *idev;
 };
 
+static inline struct iommufd_vdevice *
+iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
+{
+	return container_of(iommufd_get_object(ictx, id,
+					       IOMMUFD_OBJ_VDEVICE),
+			    struct iommufd_vdevice, obj);
+}
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 620923669b42..64731b4fdbdf 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -529,6 +529,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	},
 	[IOMMUFD_OBJ_VDEVICE] = {
 		.destroy = iommufd_vdevice_destroy,
+		.abort = iommufd_vdevice_abort,
 	},
 	[IOMMUFD_OBJ_VEVENTQ] = {
 		.destroy = iommufd_veventq_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 01df2b985f02..632d1d7b8fd8 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -84,16 +84,38 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	return rc;
 }
 
-void iommufd_vdevice_destroy(struct iommufd_object *obj)
+void iommufd_vdevice_abort(struct iommufd_object *obj)
 {
 	struct iommufd_vdevice *vdev =
 		container_of(obj, struct iommufd_vdevice, obj);
 	struct iommufd_viommu *viommu = vdev->viommu;
+	struct iommufd_device *idev = vdev->idev;
+
+	lockdep_assert_held(&idev->igroup->lock);
+
+	/*
+	 * iommufd_vdevice_abort() could be reentrant, by
+	 * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once.
+	 */
+	if (!viommu)
+		return;
 
 	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
 	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
 	refcount_dec(&viommu->obj.users);
 	put_device(vdev->dev);
+	vdev->viommu = NULL;
+	idev->vdev = NULL;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *vdev =
+		container_of(obj, struct iommufd_vdevice, obj);
+
+	mutex_lock(&vdev->idev->igroup->lock);
+	iommufd_vdevice_abort(obj);
+	mutex_unlock(&vdev->idev->igroup->lock);
 }
 
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -124,10 +146,16 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
+	mutex_lock(&idev->igroup->lock);
+	if (idev->vdev) {
+		rc = -EEXIST;
+		goto out_unlock_igroup;
+	}
+
 	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
 	if (IS_ERR(vdev)) {
 		rc = PTR_ERR(vdev);
-		goto out_put_idev;
+		goto out_unlock_igroup;
 	}
 
 	vdev->id = virt_id;
@@ -135,6 +163,14 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	get_device(idev->dev);
 	vdev->viommu = viommu;
 	refcount_inc(&viommu->obj.users);
+	/*
+	 * iommufd_device_destroy() waits until idev->vdev is NULL before
+	 * freeing the idev, which only happens once the vdev is finished
+	 * destruction. Thus we do not need refcounting on either idev->vdev or
+	 * vdev->idev.
+	 */
+	vdev->idev = idev;
+	idev->vdev = vdev;
 
 	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
 	if (curr) {
@@ -147,10 +183,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	if (rc)
 		goto out_abort;
 	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
-	goto out_put_idev;
+	goto out_unlock_igroup;
 
 out_abort:
 	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_unlock_igroup:
+	mutex_unlock(&idev->igroup->lock);
 out_put_idev:
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
-- 
2.25.1
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Nicolin Chen 3 months, 1 week ago
On Fri, Jun 27, 2025 at 11:38:06AM +0800, Xu Yilun wrote:
> +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> +{
> +	struct iommufd_vdevice *vdev;
> +
> +	mutex_lock(&idev->igroup->lock);
> +	/* vdev has been completely destroyed by userspace */
> +	if (!idev->vdev)
> +		goto out_unlock;
> +
> +	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> +	if (IS_ERR(vdev)) {
> +		/*
> +		 * vdev is removed from xarray by userspace, but is not
> +		 * destroyed/freed. Since iommufd_vdevice_abort() is reentrant,
> +		 * safe to destroy vdev here.
> +		 */
> +		iommufd_vdevice_abort(&idev->vdev->obj);
> +		goto out_unlock;

This is the case #3, i.e. a racing vdev destory, in the commit log?

I think it is worth clarifying that there is a concurrent destroy:
		/*
		 * An ongoing vdev destroy ioctl has removed the vdev from the
		 * object xarray but has not finished iommufd_vdevice_destroy()
		 * yet, as it is holding the same mutex. Destroy the vdev here,
		 * i.e. the iommufd_vdevice_destroy() will be a NOP once it is
		 * unlocked.
		 */

> @@ -147,10 +183,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
>  	if (rc)
>  		goto out_abort;
>  	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
> -	goto out_put_idev;
> +	goto out_unlock_igroup;
>  
>  out_abort:
>  	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
> +out_unlock_igroup:
> +	mutex_unlock(&idev->igroup->lock);

Looks like we will have to partially revert the _ucmd allocator,
in this function:
https://lore.kernel.org/all/107b24a3b791091bb09c92ffb0081c56c413b26d.1749882255.git.nicolinc@nvidia.com/

Please try fixing the conflicts on top of Jason's for-next tree.

Thanks
Nicolin
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months ago
On Mon, Jun 30, 2025 at 12:34:03PM -0700, Nicolin Chen wrote:
> On Fri, Jun 27, 2025 at 11:38:06AM +0800, Xu Yilun wrote:
> > +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> > +{
> > +	struct iommufd_vdevice *vdev;
> > +
> > +	mutex_lock(&idev->igroup->lock);
> > +	/* vdev has been completely destroyed by userspace */
> > +	if (!idev->vdev)
> > +		goto out_unlock;
> > +
> > +	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > +	if (IS_ERR(vdev)) {
> > +		/*
> > +		 * vdev is removed from xarray by userspace, but is not
> > +		 * destroyed/freed. Since iommufd_vdevice_abort() is reentrant,
> > +		 * safe to destroy vdev here.
> > +		 */
> > +		iommufd_vdevice_abort(&idev->vdev->obj);
> > +		goto out_unlock;
> 
> This is the case #3, i.e. a racing vdev destory, in the commit log?
> 
> I think it is worth clarifying that there is a concurrent destroy:
> 		/*
> 		 * An ongoing vdev destroy ioctl has removed the vdev from the
> 		 * object xarray but has not finished iommufd_vdevice_destroy()
> 		 * yet, as it is holding the same mutex.

Applied this part.

>		 * Destroy the vdev here,
> 		 * i.e. the iommufd_vdevice_destroy() will be a NOP once it is
> 		 * unlocked.
> 		 */
> 
> > @@ -147,10 +183,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
> >  	if (rc)
> >  		goto out_abort;
> >  	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
> > -	goto out_put_idev;
> > +	goto out_unlock_igroup;
> >  
> >  out_abort:
> >  	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
> > +out_unlock_igroup:
> > +	mutex_unlock(&idev->igroup->lock);
> 
> Looks like we will have to partially revert the _ucmd allocator,
> in this function:
> https://lore.kernel.org/all/107b24a3b791091bb09c92ffb0081c56c413b26d.1749882255.git.nicolinc@nvidia.com/
> 
> Please try fixing the conflicts on top of Jason's for-next tree.

Yes, will rebase for next version.

Thanks,
Yilun

> 
> Thanks
> Nicolin
RE: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 1 week ago
> From: Xu Yilun <yilun.xu@linux.intel.com>
> Sent: Friday, June 27, 2025 11:38 AM
> 
> +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> +{
> +	struct iommufd_vdevice *vdev;
> +
> +	mutex_lock(&idev->igroup->lock);
> +	/* vdev has been completely destroyed by userspace */
> +	if (!idev->vdev)
> +		goto out_unlock;
> +
> +	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> +	if (IS_ERR(vdev)) {
> +		/*
> +		 * vdev is removed from xarray by userspace, but is not
> +		 * destroyed/freed. Since iommufd_vdevice_abort() is
> reentrant,
> +		 * safe to destroy vdev here.
> +		 */
> +		iommufd_vdevice_abort(&idev->vdev->obj);
> +		goto out_unlock;
> +	}

let's add a comment that vdev is still freed in iommufd_destroy()
in this situation.

> -void iommufd_vdevice_destroy(struct iommufd_object *obj)
> +void iommufd_vdevice_abort(struct iommufd_object *obj)
>  {
>  	struct iommufd_vdevice *vdev =
>  		container_of(obj, struct iommufd_vdevice, obj);
>  	struct iommufd_viommu *viommu = vdev->viommu;
> +	struct iommufd_device *idev = vdev->idev;
> +
> +	lockdep_assert_held(&idev->igroup->lock);
> +
> +	/*
> +	 * iommufd_vdevice_abort() could be reentrant, by
> +	 * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only
> once.
> +	 */
> +	if (!viommu)
> +		return;

Just check idev->vdev, to be consistent with the other path.
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 1 week ago
On Mon, Jun 30, 2025 at 06:27:51AM +0000, Tian, Kevin wrote:
> > From: Xu Yilun <yilun.xu@linux.intel.com>
> > Sent: Friday, June 27, 2025 11:38 AM
> > 
> > +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> > +{
> > +	struct iommufd_vdevice *vdev;
> > +
> > +	mutex_lock(&idev->igroup->lock);
> > +	/* vdev has been completely destroyed by userspace */
> > +	if (!idev->vdev)
> > +		goto out_unlock;
> > +
> > +	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> > +	if (IS_ERR(vdev)) {
> > +		/*
> > +		 * vdev is removed from xarray by userspace, but is not
> > +		 * destroyed/freed. Since iommufd_vdevice_abort() is
> > reentrant,
> > +		 * safe to destroy vdev here.
> > +		 */
> > +		iommufd_vdevice_abort(&idev->vdev->obj);
> > +		goto out_unlock;
> > +	}
> 
> let's add a comment that vdev is still freed in iommufd_destroy()
> in this situation.

Yes.

> 
> > -void iommufd_vdevice_destroy(struct iommufd_object *obj)
> > +void iommufd_vdevice_abort(struct iommufd_object *obj)
> >  {
> >  	struct iommufd_vdevice *vdev =
> >  		container_of(obj, struct iommufd_vdevice, obj);
> >  	struct iommufd_viommu *viommu = vdev->viommu;
> > +	struct iommufd_device *idev = vdev->idev;
> > +
> > +	lockdep_assert_held(&idev->igroup->lock);
> > +
> > +	/*
> > +	 * iommufd_vdevice_abort() could be reentrant, by
> > +	 * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only
> > once.
> > +	 */
> > +	if (!viommu)
> > +		return;
> 
> Just check idev->vdev, to be consistent with the other path.

I think there is problem here. From your comments above, vdev could be
aborted/destroyed by iommufd_destroy() again *after* idev is freed.
That means in iommufd_vdevice_abort/destroy(), usage of vdev->idev or
idev->vdev or vdev->idev->igroup->lock may be invalid.

I need to reconsider this, seems we need a dedicated vdev lock to
synchronize concurrent vdev abort/destroy.

Thanks,
Yilun
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 1 week ago
On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote:

> I need to reconsider this, seems we need a dedicated vdev lock to
> synchronize concurrent vdev abort/destroy.

It is not possible to be concurrent

destroy is only called once after it is no longer possible to call
abort.

Jason
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 1 week ago
On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote:
> 
> > I need to reconsider this, seems we need a dedicated vdev lock to
> > synchronize concurrent vdev abort/destroy.
> 
> It is not possible to be concurrent
> 
> destroy is only called once after it is no longer possible to call
> abort.

I'm almost about to drop the "abort twice" idea. [1]

[1]: https://lore.kernel.org/linux-iommu/20250625123832.GF167785@nvidia.com/

See from the flow below,

  T1. iommufd_device_unbind(idev)
	iommufd_device_destroy(obj)
		mutex_lock(&idev->igroup->lock)
 		iommufd_vdevice_abort(idev->vdev.obj)
		mutex_unlock(&idev->igroup->lock)
	kfree(obj)

  T2. iommufd_destroy(vdev_id)
	iommufd_vdevice_destroy(obj)
		mutex_lock(&vdev->idev->igroup->lock)
		iommufd_vdevice_abort(obj);
		mutex_unlock(&vdev->idev->igroup->lock)
	kfree(obj)

iommufd_vdevice_destroy() will access idev->igroup->lock, but it is
possible the idev is already freed at that time:

                                            iommufd_destroy(vdev_id)
                                            iommufd_vdevice_destroy(obj)
  iommufd_device_unbind(idev)
  iommufd_device_destroy(obj)
  mutex_lock(&idev->igroup->lock)
                                            mutex_lock(&vdev->idev->igroup->lock) (wait)
  iommufd_vdevice_abort(idev->vdev.obj)
  mutex_unlock(&idev->igroup->lock)
  kfree(obj)
                                            mutex_lock(&vdev->idev->igroup->lock) (PANIC)
                                            iommufd_vdevice_abort(obj)
                                            ...

We also can't introduce some vdev side lock instead of idev side lock,
vdev could also be freed right after iommufd_vdevice_destroy().

I think the only simple way is to let idev destruction wait for vdev
destruction, that go back to the wait_event idea ...

Thanks,
Yilun
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 1 week ago
On Tue, Jul 01, 2025 at 05:19:05PM +0800, Xu Yilun wrote:
> On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote:
> > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote:
> > 
> > > I need to reconsider this, seems we need a dedicated vdev lock to
> > > synchronize concurrent vdev abort/destroy.
> > 
> > It is not possible to be concurrent
> > 
> > destroy is only called once after it is no longer possible to call
> > abort.
> 
> I'm almost about to drop the "abort twice" idea. [1]
> 
> [1]: https://lore.kernel.org/linux-iommu/20250625123832.GF167785@nvidia.com/
> 
> See from the flow below,
> 
>   T1. iommufd_device_unbind(idev)
> 	iommufd_device_destroy(obj)
> 		mutex_lock(&idev->igroup->lock)
>  		iommufd_vdevice_abort(idev->vdev.obj)
> 		mutex_unlock(&idev->igroup->lock)
> 	kfree(obj)
> 
>   T2. iommufd_destroy(vdev_id)
> 	iommufd_vdevice_destroy(obj)
> 		mutex_lock(&vdev->idev->igroup->lock)
> 		iommufd_vdevice_abort(obj);
> 		mutex_unlock(&vdev->idev->igroup->lock)
> 	kfree(obj)
> 
> iommufd_vdevice_destroy() will access idev->igroup->lock, but it is
> possible the idev is already freed at that time:
> 
>                                             iommufd_destroy(vdev_id)
>                                             iommufd_vdevice_destroy(obj)
>   iommufd_device_unbind(idev)
>   iommufd_device_destroy(obj)
>   mutex_lock(&idev->igroup->lock)
>                                             mutex_lock(&vdev->idev->igroup->lock) (wait)
>   iommufd_vdevice_abort(idev->vdev.obj)
>   mutex_unlock(&idev->igroup->lock)
>   kfree(obj)
>                                             mutex_lock(&vdev->idev->igroup->lock) (PANIC)
>                                             iommufd_vdevice_abort(obj)
>                                             ...

Yes, you can't touch idev inside the destroy function at all, under
any version. idev is only valid if you have a refcount on vdev.

But why are you touching this lock? Arrange things so abort doesn't
touch the idev??

Jason
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months, 1 week ago
On Tue, Jul 01, 2025 at 09:13:15AM -0300, Jason Gunthorpe wrote:
> On Tue, Jul 01, 2025 at 05:19:05PM +0800, Xu Yilun wrote:
> > On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote:
> > > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote:
> > > 
> > > > I need to reconsider this, seems we need a dedicated vdev lock to
> > > > synchronize concurrent vdev abort/destroy.
> > > 
> > > It is not possible to be concurrent
> > > 
> > > destroy is only called once after it is no longer possible to call
> > > abort.
> > 
> > I'm almost about to drop the "abort twice" idea. [1]
> > 
> > [1]: https://lore.kernel.org/linux-iommu/20250625123832.GF167785@nvidia.com/
> > 
> > See from the flow below,
> > 
> >   T1. iommufd_device_unbind(idev)
> > 	iommufd_device_destroy(obj)
> > 		mutex_lock(&idev->igroup->lock)
> >  		iommufd_vdevice_abort(idev->vdev.obj)
> > 		mutex_unlock(&idev->igroup->lock)
> > 	kfree(obj)
> > 
> >   T2. iommufd_destroy(vdev_id)
> > 	iommufd_vdevice_destroy(obj)
> > 		mutex_lock(&vdev->idev->igroup->lock)
> > 		iommufd_vdevice_abort(obj);
> > 		mutex_unlock(&vdev->idev->igroup->lock)
> > 	kfree(obj)
> > 
> > iommufd_vdevice_destroy() will access idev->igroup->lock, but it is
> > possible the idev is already freed at that time:
> > 
> >                                             iommufd_destroy(vdev_id)
> >                                             iommufd_vdevice_destroy(obj)
> >   iommufd_device_unbind(idev)
> >   iommufd_device_destroy(obj)
> >   mutex_lock(&idev->igroup->lock)
> >                                             mutex_lock(&vdev->idev->igroup->lock) (wait)
> >   iommufd_vdevice_abort(idev->vdev.obj)
> >   mutex_unlock(&idev->igroup->lock)
> >   kfree(obj)
> >                                             mutex_lock(&vdev->idev->igroup->lock) (PANIC)
> >                                             iommufd_vdevice_abort(obj)
> >                                             ...
> 
> Yes, you can't touch idev inside the destroy function at all, under
> any version. idev is only valid if you have a refcount on vdev.
> 
> But why are you touching this lock? Arrange things so abort doesn't
> touch the idev??

idev has a pointer idev->vdev to track the vdev's lifecycle.
idev->igroup->lock protects the pointer. At the end of
iommufd_vdevice_destroy() this pointer should be NULLed so that idev
knows vdev is really destroyed.

I haven't found a safer way for vdev to sync up its validness with idev
w/o touching idev.

I was thinking of using vdev->idev and some vdev lock for tracking
instead. Then iommufd_vdevice_abort() doesn't touch idev. But it is
still the same, just switch to put idev in risk:


                                               iommufd_destroy(vdev_id)
                                               iommufd_vdevice_destroy(obj)
  iommufd_device_unbind(idev)
  iommufd_device_destroy(obj)
                                               mutex_lock(&vdev->some_lock)
  mutex_lock(&idev->vdev->some_lock) (wait)
                                               iommufd_vdevice_abort(obj)
                                               mutex_unlock(&vdev->some_lock)
                                               kfree(obj)
  mutex_lock(&idev->vdev->some_lock) (PANIC)
  iommufd_vdevice_abort(idev->vdev.obj)
  ...

Thanks,
Yilun
RE: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 1 week ago
> From: Xu Yilun <yilun.xu@linux.intel.com>
> Sent: Wednesday, July 2, 2025 10:24 AM
> 
> On Tue, Jul 01, 2025 at 09:13:15AM -0300, Jason Gunthorpe wrote:
> > On Tue, Jul 01, 2025 at 05:19:05PM +0800, Xu Yilun wrote:
> > > On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote:
> > > > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote:
> > > >
> > > > > I need to reconsider this, seems we need a dedicated vdev lock to
> > > > > synchronize concurrent vdev abort/destroy.
> > > >
> > > > It is not possible to be concurrent
> > > >
> > > > destroy is only called once after it is no longer possible to call
> > > > abort.
> > >
> > > I'm almost about to drop the "abort twice" idea. [1]
> > >
> > > [1]: https://lore.kernel.org/linux-
> iommu/20250625123832.GF167785@nvidia.com/
> > >
> > > See from the flow below,
> > >
> > >   T1. iommufd_device_unbind(idev)
> > > 	iommufd_device_destroy(obj)
> > > 		mutex_lock(&idev->igroup->lock)
> > >  		iommufd_vdevice_abort(idev->vdev.obj)
> > > 		mutex_unlock(&idev->igroup->lock)
> > > 	kfree(obj)
> > >
> > >   T2. iommufd_destroy(vdev_id)
> > > 	iommufd_vdevice_destroy(obj)
> > > 		mutex_lock(&vdev->idev->igroup->lock)
> > > 		iommufd_vdevice_abort(obj);
> > > 		mutex_unlock(&vdev->idev->igroup->lock)
> > > 	kfree(obj)
> > >
> > > iommufd_vdevice_destroy() will access idev->igroup->lock, but it is
> > > possible the idev is already freed at that time:
> > >
> > >                                             iommufd_destroy(vdev_id)
> > >                                             iommufd_vdevice_destroy(obj)
> > >   iommufd_device_unbind(idev)
> > >   iommufd_device_destroy(obj)
> > >   mutex_lock(&idev->igroup->lock)
> > >                                             mutex_lock(&vdev->idev->igroup->lock) (wait)
> > >   iommufd_vdevice_abort(idev->vdev.obj)
> > >   mutex_unlock(&idev->igroup->lock)
> > >   kfree(obj)
> > >                                             mutex_lock(&vdev->idev->igroup->lock) (PANIC)
> > >                                             iommufd_vdevice_abort(obj)
> > >                                             ...
> >
> > Yes, you can't touch idev inside the destroy function at all, under
> > any version. idev is only valid if you have a refcount on vdev.
> >
> > But why are you touching this lock? Arrange things so abort doesn't
> > touch the idev??
> 
> idev has a pointer idev->vdev to track the vdev's lifecycle.
> idev->igroup->lock protects the pointer. At the end of
> iommufd_vdevice_destroy() this pointer should be NULLed so that idev
> knows vdev is really destroyed.
> 
> I haven't found a safer way for vdev to sync up its validness with idev
> w/o touching idev.
> 
> I was thinking of using vdev->idev and some vdev lock for tracking
> instead. Then iommufd_vdevice_abort() doesn't touch idev. But it is
> still the same, just switch to put idev in risk:
> 
> 
>                                                iommufd_destroy(vdev_id)
>                                                iommufd_vdevice_destroy(obj)
>   iommufd_device_unbind(idev)
>   iommufd_device_destroy(obj)
>                                                mutex_lock(&vdev->some_lock)
>   mutex_lock(&idev->vdev->some_lock) (wait)

panic could happen here, between acquiring idev->vdev and
mutex(vdev->some_lock) as vdev might be destroyed/freed 
in-between.

>                                                iommufd_vdevice_abort(obj)
>                                                mutex_unlock(&vdev->some_lock)
>                                                kfree(obj)
>   mutex_lock(&idev->vdev->some_lock) (PANIC)
>   iommufd_vdevice_abort(idev->vdev.obj)
>   ...
> 

I cannot find a safe way either, except using certain global lock.

but comparing to that I'd prefer to the original wait approach...
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 1 week ago
On Wed, Jul 02, 2025 at 09:13:50AM +0000, Tian, Kevin wrote:
> > > Yes, you can't touch idev inside the destroy function at all, under
> > > any version. idev is only valid if you have a refcount on vdev.
> > >
> > > But why are you touching this lock? Arrange things so abort doesn't
> > > touch the idev??
> > 
> > idev has a pointer idev->vdev to track the vdev's lifecycle.
> > idev->igroup->lock protects the pointer. At the end of
> > iommufd_vdevice_destroy() this pointer should be NULLed so that idev
> > knows vdev is really destroyed.

Well, that is destroy, not abort, but OK, there is an issue with
destroy.

> but comparing to that I'd prefer to the original wait approach...

Okay, but lets try to keep the wait hidden inside the refcounting..

The issue here is we don't hold a refcount on idev while working with
idev. Let's fix that and then things should work properly?

Maybe something like this:

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 4e781aa9fc6329..9174fa7c972b80 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -178,12 +178,20 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev)
 	mutex_unlock(&idev->igroup->lock);
 }
 
+void iommufd_device_pre_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_device *idev =
+		container_of(obj, struct iommufd_device, obj);
+
+	/* Release the short term users on this */
+	iommufd_device_remove_vdev(idev);
+}
+
 void iommufd_device_destroy(struct iommufd_object *obj)
 {
 	struct iommufd_device *idev =
 		container_of(obj, struct iommufd_device, obj);
 
-	iommufd_device_remove_vdev(idev);
 	iommu_device_release_dma_owner(idev->dev);
 	iommufd_put_group(idev->igroup);
 	if (!iommufd_selftest_is_mock_dev(idev->dev))
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index b2e8e106c16158..387c630fdabfbd 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -151,6 +151,9 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
 	if (refcount_dec_and_test(&to_destroy->shortterm_users))
 		return 0;
 
+	if (iommufd_object_ops[to_destroy->type].pre_destroy)
+		iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy);
+
 	if (wait_event_timeout(ictx->destroy_wait,
 			       refcount_read(&to_destroy->shortterm_users) == 0,
 			       msecs_to_jiffies(60000)))
@@ -567,6 +570,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 		.destroy = iommufd_access_destroy_object,
 	},
 	[IOMMUFD_OBJ_DEVICE] = {
+		.pre_destroy = iommufd_device_pre_destroy,
 		.destroy = iommufd_device_destroy,
 	},
 	[IOMMUFD_OBJ_FAULT] = {
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 9451a311745f7b..cbf99daa7dc25d 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -135,6 +135,7 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj)
 	mutex_lock(&vdev->idev->igroup->lock);
 	iommufd_vdevice_abort(obj);
 	mutex_unlock(&vdev->idev->igroup->lock);
+	iommufd_put_object(vdev->viommu->ictx, &vdev->idev->obj);
 }
 
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -180,13 +181,19 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	vdev->id = virt_id;
 	vdev->viommu = viommu;
 	refcount_inc(&viommu->obj.users);
+
+	/*
+	 * A reference is held on the idev so long as we have a pointer.
+	 * iommufd_device_pre_destroy() will revoke it before the real
+	 * destruction.
+	 */
+	vdev->idev = idev;
+
 	/*
 	 * iommufd_device_destroy() waits until idev->vdev is NULL before
 	 * freeing the idev, which only happens once the vdev is finished
-	 * destruction. Thus we do not need refcounting on either idev->vdev or
-	 * vdev->idev.
+	 * destruction.
 	 */
-	vdev->idev = idev;
 	idev->vdev = vdev;
 
 	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
@@ -207,7 +214,8 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 out_unlock_igroup:
 	mutex_unlock(&idev->igroup->lock);
 out_put_idev:
-	iommufd_put_object(ucmd->ictx, &idev->obj);
+	if (rc)
+		iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
 	iommufd_put_object(ucmd->ictx, &viommu->obj);
 	return rc;
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months ago
On Wed, Jul 02, 2025 at 09:40:42AM -0300, Jason Gunthorpe wrote:
> On Wed, Jul 02, 2025 at 09:13:50AM +0000, Tian, Kevin wrote:
> > > > Yes, you can't touch idev inside the destroy function at all, under
> > > > any version. idev is only valid if you have a refcount on vdev.
> > > >
> > > > But why are you touching this lock? Arrange things so abort doesn't
> > > > touch the idev??
> > > 
> > > idev has a pointer idev->vdev to track the vdev's lifecycle.
> > > idev->igroup->lock protects the pointer. At the end of
> > > iommufd_vdevice_destroy() this pointer should be NULLed so that idev
> > > knows vdev is really destroyed.
> 
> Well, that is destroy, not abort, but OK, there is an issue with
> destroy.
> 
> > but comparing to that I'd prefer to the original wait approach...
> 
> Okay, but lets try to keep the wait hidden inside the refcounting..
> 
> The issue here is we don't hold a refcount on idev while working with
> idev. Let's fix that and then things should work properly?
> 
> Maybe something like this:
> 
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index 4e781aa9fc6329..9174fa7c972b80 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -178,12 +178,20 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev)
>  	mutex_unlock(&idev->igroup->lock);
>  }
>  
> +void iommufd_device_pre_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_device *idev =
> +		container_of(obj, struct iommufd_device, obj);
> +
> +	/* Release the short term users on this */
> +	iommufd_device_remove_vdev(idev);
> +}
> +
>  void iommufd_device_destroy(struct iommufd_object *obj)
>  {
>  	struct iommufd_device *idev =
>  		container_of(obj, struct iommufd_device, obj);
>  
> -	iommufd_device_remove_vdev(idev);
>  	iommu_device_release_dma_owner(idev->dev);
>  	iommufd_put_group(idev->igroup);
>  	if (!iommufd_selftest_is_mock_dev(idev->dev))
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index b2e8e106c16158..387c630fdabfbd 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -151,6 +151,9 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
>  	if (refcount_dec_and_test(&to_destroy->shortterm_users))
>  		return 0;
>  
> +	if (iommufd_object_ops[to_destroy->type].pre_destroy)
> +		iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy);
> +
>  	if (wait_event_timeout(ictx->destroy_wait,
>  			       refcount_read(&to_destroy->shortterm_users) ==i 0,

I still see concerns here. The pre_destroy() and wait_event is before the
idev's users count decreasing to 0, which means a new user could get in.
The worse thing is the new user could hold the shortterm_users until
pre_destroy(), but there isn't a second pre_destroy()...

I think extending the life of shortterm_users at runtime makes things
complex. My idea is keep the original definition of shortterm_users at
runtime, only repurpose it as a wait completion after users refcount == 0.
I.e. idev increase its own shortterm_users, waiting for vdev to decrease
it.

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 9876bf70980a..49b787bc0688 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -147,8 +147,15 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev)
                goto out_unlock;

        vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
-       if (IS_ERR(vdev))
+       if (IS_ERR(vdev)) {
+               /*
+               * vdev is removed from xarray by userspace, but is not
+               * destroyed. We should wait for vdev real destruction before
+               * idev destruction.
+               */
+               iommufd_object_destroy_wait_prepare(&idev->obj);
                goto out_unlock;
+       }

        /* Should never happen */
        if (WARN_ON(vdev != idev->vdev)) {
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 08f3c66366ea..6620006df8d1 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -180,6 +180,47 @@ static inline void iommufd_put_object(struct iommufd_ctx *ictx,
                wake_up_interruptible_all(&ictx->destroy_wait);
 }

+/*
+ * The iommufd_object_destroy_wait_xx() are used to synchronize the last minute
+ * destructions of interrelated objects. E.g. the vdev's destroy() should
+ * always be called before idev's destroy().
+ *
+ * iommufd_object_destroy_wait_prepare() should only be called by
+ * later-destroyer in its pre_destroy() ops, where the users & shortterm_users
+ * refcounts are all zeroed. I.e. no new user is possible. Use "users == 0 &&
+ * shortterm_users == 1" as the "destroy wait" state, as this combination
+ * is not used in runtime usecase and it also prevents new users.
+ *
+ * The later-destroyer should use their own locking to ensure the
+ * first-destroyer still exists when calling this function.
+ */
+static inline void
+iommufd_object_destroy_wait_prepare(struct iommufd_object *to_destroy)
+{
+       if (WARN_ON(refcount_read(&to_destroy->users) != 0 ||
+                   refcount_read(&to_destroy->shortterm_users) != 0))
+               return;
+
+       refcount_set(&to_destroy->shortterm_users, 1);
+}
+
+/*
+ * iommufd_object_destroy_wait_complete() should be called by first-destroyer
+ * in its destroy() ops. The first-destroyer should not access the
+ * later-destroyer after calling this function.
+ */
+static inline void
+iommufd_object_destroy_wait_complete(struct iommufd_ctx *ictx,
+                                    struct iommufd_object *to_destroy)
+{
+       if (refcount_read(&to_destroy->users) != 0 ||
+           refcount_read(&to_destroy->shortterm_users) != 1)
+               return;
+
+       refcount_dec(&to_destroy->shortterm_users);
+       wake_up_interruptible_all(&ictx->destroy_wait);
+}
+
 void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj);
 void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
                                      struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index fbccfbd8fb03..148218293f55 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -96,15 +96,37 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
        return obj;
 }

+static int iommufd_object_pre_destroy_wait(struct iommufd_ctx *ictx,
+                                          struct iommufd_object *to_destroy)
+{
+       if (!iommufd_object_ops[to_destroy->type].pre_destroy)
+               return 0;
+
+       /*
+        * pre_destroy() may put the object in destroy wait state (users == 0
+        * && shortterm_users == 1). Another thread completes the waiting by
+        * decreasing the shortterm_users back to 0. The helpers
+        * iommufd_object_destroy_wait_prepare() and
+        * iommufd_object_destroy_wait_complete() should be used for this
+        * purpose.
+        */
+       iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy);
+       if (wait_event_timeout(ictx->destroy_wait,
+                              refcount_read(&to_destroy->shortterm_users) ==
+                                       0,
+                              msecs_to_jiffies(60000)))
+               return 0;
+
+       pr_crit("Time out waiting for iommufd object to become free\n");
+       return -EBUSY;
+}
+
 static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
                                             struct iommufd_object *to_destroy)
 {
        if (refcount_dec_and_test(&to_destroy->shortterm_users))
                return 0;

-       if (iommufd_object_ops[to_destroy->type].pre_destroy)
-               iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy);
-
        if (wait_event_timeout(ictx->destroy_wait,
                                refcount_read(&to_destroy->shortterm_users) ==
                                        0,
@@ -184,6 +206,10 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
                ret = iommufd_object_dec_wait_shortterm(ictx, obj);
                if (WARN_ON(ret))
                        return ret;
+       } else {
+               ret = iommufd_object_pre_destroy_wait(ictx, obj);
+               if (WARN_ON(ret))
+                       return ret;
        }

        iommufd_object_ops[obj->type].destroy(obj);
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index d49e8f0c8bb3..a7feefd5e472 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -103,11 +103,12 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj)
 {
        struct iommufd_vdevice *vdev =
                container_of(obj, struct iommufd_vdevice, obj);
+       struct iommufd_device *idev = vdev->idev;

-       mutex_lock(&vdev->idev->igroup->lock);
+       mutex_lock(&idev->igroup->lock);
        iommufd_vdevice_abort(obj);
-       mutex_unlock(&vdev->idev->igroup->lock);
-       iommufd_put_object(vdev->viommu->ictx, &vdev->idev->obj);
+       mutex_unlock(&idev->igroup->lock);
+       iommufd_object_destroy_wait_complete(idev->ictx, &idev->obj);
 }

 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -185,8 +186,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 out_unlock_igroup:
        mutex_unlock(&idev->igroup->lock);
 out_put_idev:
-       if (rc)
-               iommufd_put_object(ucmd->ictx, &idev->obj);
+       iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
        iommufd_put_object(ucmd->ictx, &viommu->obj);
        return rc;
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months ago
On Mon, Jul 07, 2025 at 06:58:45PM +0800, Xu Yilun wrote:

> I still see concerns here. The pre_destroy() and wait_event is before the
> idev's users count decreasing to 0, which means a new user could get
> in.

Hum, yes it seems we could get a race with another vdevice being
created during idevice destruction, but I think this could be fixed in
a straightforward way with a flag 'being destroyed' in the idev under
the lock if it is the only issue

> The worse thing is the new user could hold the shortterm_users until
> pre_destroy(), but there isn't a second pre_destroy()...

This is OK, the point of pre_destroy is to remove the reference
cycle. So it needs to prevent new vdevs from establishing references
and remove the reference from any existing vdev.

refcounts outside of this cycle are handled by the existing
mechanisms - pre_destroy is only about the refcount loop.

> I think extending the life of shortterm_users at runtime makes things
> complex. 

Well, it keeps the refcounting working the way it is supposed
to. Holding a pointer without a reference is nasty and is creating
alot of these weird problems and strange flows. Hold the refcount
along with the pointer and things follow the normal patterns.

There is nothing inherently wrong with shortterm users for this
besides its name. Think of it is as 'wait for users', and we clearly
want to wait for the vdev user to put back its idev reference.

I think there are too many gyrations with refcounts in this version
below. It is hard to reason about if zero refcounts are re-elevated to
non-zero.

>         vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> -       if (IS_ERR(vdev))
> +       if (IS_ERR(vdev)) {
> +               /*
> +               * vdev is removed from xarray by userspace, but is not
> +               * destroyed. We should wait for vdev real destruction before
> +               * idev destruction.
> +               */
> +               iommufd_object_destroy_wait_prepare(&idev->obj);
>                 goto out_unlock;

This is now really weird, we are mangling the refcounts and the
reasoning why it is safe with the concurrent vdev thread is tricky..

> +/*
> + * iommufd_object_destroy_wait_complete() should be called by first-destroyer
> + * in its destroy() ops. The first-destroyer should not access the
> + * later-destroyer after calling this function.
> + */
> +static inline void
> +iommufd_object_destroy_wait_complete(struct iommufd_ctx *ictx,
> +                                    struct iommufd_object *to_destroy)
> +{
> +       if (refcount_read(&to_destroy->users) != 0 ||
> +           refcount_read(&to_destroy->shortterm_users) != 1)
> +               return;
> +
> +       refcount_dec(&to_destroy->shortterm_users);
> +       wake_up_interruptible_all(&ictx->destroy_wait);
> +}

This looks weird too, why is refcount_read safe? That is usually not
how this kind of logic is written.

I'm not sure this is OK, it is not an obvious known-good pattern for
lifetime management.

Jason
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months ago
On Mon, Jul 07, 2025 at 09:25:02AM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 07, 2025 at 06:58:45PM +0800, Xu Yilun wrote:
> 
> > I still see concerns here. The pre_destroy() and wait_event is before the
> > idev's users count decreasing to 0, which means a new user could get
> > in.
> 
> Hum, yes it seems we could get a race with another vdevice being
> created during idevice destruction, but I think this could be fixed in
> a straightforward way with a flag 'being destroyed' in the idev under
> the lock if it is the only issue
> 
> > The worse thing is the new user could hold the shortterm_users until
> > pre_destroy(), but there isn't a second pre_destroy()...
> 
> This is OK, the point of pre_destroy is to remove the reference
> cycle. So it needs to prevent new vdevs from establishing references
> and remove the reference from any existing vdev.
> 
> refcounts outside of this cycle are handled by the existing
> mechanisms - pre_destroy is only about the refcount loop.

That works for me. I didn't prefer a being destroyed flag cause it
seems not align with the current refcounting mechanism. But I see it
makes sense cause idev does deterministic destruction.

The only concern is idev cannot actually prevent new reference by
itself, need vdev to honor the flag. But just keep simple, don't
over-engineering.

Thanks,
Yilun

> 
> > I think extending the life of shortterm_users at runtime makes things
> > complex. 
> 
> Well, it keeps the refcounting working the way it is supposed
> to. Holding a pointer without a reference is nasty and is creating
> alot of these weird problems and strange flows. Hold the refcount
> along with the pointer and things follow the normal patterns.
> 
> There is nothing inherently wrong with shortterm users for this
> besides its name. Think of it is as 'wait for users', and we clearly
> want to wait for the vdev user to put back its idev reference.
>
RE: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Tian, Kevin 3 months, 1 week ago
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, July 2, 2025 8:41 PM
> 
> On Wed, Jul 02, 2025 at 09:13:50AM +0000, Tian, Kevin wrote:
> > > > Yes, you can't touch idev inside the destroy function at all, under
> > > > any version. idev is only valid if you have a refcount on vdev.
> > > >
> > > > But why are you touching this lock? Arrange things so abort doesn't
> > > > touch the idev??
> > >
> > > idev has a pointer idev->vdev to track the vdev's lifecycle.
> > > idev->igroup->lock protects the pointer. At the end of
> > > iommufd_vdevice_destroy() this pointer should be NULLed so that idev
> > > knows vdev is really destroyed.
> 
> Well, that is destroy, not abort, but OK, there is an issue with
> destroy.
> 
> > but comparing to that I'd prefer to the original wait approach...
> 
> Okay, but lets try to keep the wait hidden inside the refcounting..
> 
> The issue here is we don't hold a refcount on idev while working with
> idev. Let's fix that and then things should work properly?
> 
> Maybe something like this:
> 
> diff --git a/drivers/iommu/iommufd/device.c
> b/drivers/iommu/iommufd/device.c
> index 4e781aa9fc6329..9174fa7c972b80 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -178,12 +178,20 @@ static void iommufd_device_remove_vdev(struct
> iommufd_device *idev)
>  	mutex_unlock(&idev->igroup->lock);
>  }
> 
> +void iommufd_device_pre_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_device *idev =
> +		container_of(obj, struct iommufd_device, obj);
> +
> +	/* Release the short term users on this */
> +	iommufd_device_remove_vdev(idev);
> +}
> +
>  void iommufd_device_destroy(struct iommufd_object *obj)
>  {
>  	struct iommufd_device *idev =
>  		container_of(obj, struct iommufd_device, obj);
> 
> -	iommufd_device_remove_vdev(idev);
>  	iommu_device_release_dma_owner(idev->dev);
>  	iommufd_put_group(idev->igroup);
>  	if (!iommufd_selftest_is_mock_dev(idev->dev))
> diff --git a/drivers/iommu/iommufd/main.c
> b/drivers/iommu/iommufd/main.c
> index b2e8e106c16158..387c630fdabfbd 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -151,6 +151,9 @@ static int
> iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
>  	if (refcount_dec_and_test(&to_destroy->shortterm_users))
>  		return 0;
> 
> +	if (iommufd_object_ops[to_destroy->type].pre_destroy)
> +		iommufd_object_ops[to_destroy-
> >type].pre_destroy(to_destroy);
> +
>  	if (wait_event_timeout(ictx->destroy_wait,
>  			       refcount_read(&to_destroy->shortterm_users)
> == 0,
>  			       msecs_to_jiffies(60000)))
> @@ -567,6 +570,7 @@ static const struct iommufd_object_ops
> iommufd_object_ops[] = {
>  		.destroy = iommufd_access_destroy_object,
>  	},
>  	[IOMMUFD_OBJ_DEVICE] = {
> +		.pre_destroy = iommufd_device_pre_destroy,
>  		.destroy = iommufd_device_destroy,
>  	},
>  	[IOMMUFD_OBJ_FAULT] = {
> diff --git a/drivers/iommu/iommufd/viommu.c
> b/drivers/iommu/iommufd/viommu.c
> index 9451a311745f7b..cbf99daa7dc25d 100644
> --- a/drivers/iommu/iommufd/viommu.c
> +++ b/drivers/iommu/iommufd/viommu.c
> @@ -135,6 +135,7 @@ void iommufd_vdevice_destroy(struct
> iommufd_object *obj)
>  	mutex_lock(&vdev->idev->igroup->lock);
>  	iommufd_vdevice_abort(obj);
>  	mutex_unlock(&vdev->idev->igroup->lock);
> +	iommufd_put_object(vdev->viommu->ictx, &vdev->idev->obj);
>  }
> 
>  int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
> @@ -180,13 +181,19 @@ int iommufd_vdevice_alloc_ioctl(struct
> iommufd_ucmd *ucmd)
>  	vdev->id = virt_id;
>  	vdev->viommu = viommu;
>  	refcount_inc(&viommu->obj.users);
> +
> +	/*
> +	 * A reference is held on the idev so long as we have a pointer.
> +	 * iommufd_device_pre_destroy() will revoke it before the real
> +	 * destruction.
> +	 */
> +	vdev->idev = idev;
> +
>  	/*
>  	 * iommufd_device_destroy() waits until idev->vdev is NULL before
>  	 * freeing the idev, which only happens once the vdev is finished
> -	 * destruction. Thus we do not need refcounting on either idev->vdev
> or
> -	 * vdev->idev.
> +	 * destruction.
>  	 */
> -	vdev->idev = idev;
>  	idev->vdev = vdev;
> 
>  	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev,
> GFP_KERNEL);
> @@ -207,7 +214,8 @@ int iommufd_vdevice_alloc_ioctl(struct
> iommufd_ucmd *ucmd)
>  out_unlock_igroup:
>  	mutex_unlock(&idev->igroup->lock);
>  out_put_idev:
> -	iommufd_put_object(ucmd->ictx, &idev->obj);
> +	if (rc)
> +		iommufd_put_object(ucmd->ictx, &idev->obj);

but this sounds like a misuse of shortterm_users which is not intended
to be held long beyond ioctl...
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Jason Gunthorpe 3 months, 1 week ago
On Thu, Jul 03, 2025 at 04:32:26AM +0000, Tian, Kevin wrote:

> but this sounds like a misuse of shortterm_users which is not intended
> to be held long beyond ioctl...

With the addition of the pre_destroy it purpose gets changed to
'beyond the ioctl or past pre_destroy'

Jason
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Baolu Lu 3 months, 1 week ago
On 6/27/25 11:38, Xu Yilun wrote:
> Destroy iommufd_vdevice (vdev) on iommufd_idevice (idev) destroy so
> that vdev can't outlive idev.
> 
> iommufd_device (idev) represents the physical device bound to iommufd,
> while the iommufd_vdevice (vdev) represents the virtual instance of the
> physical device in the VM. The lifecycle of the vdev should not be
> longer than idev. This doesn't cause real problem on existing use cases
> cause vdev doesn't impact the physical device, only provides
> virtualization information. But to extend vdev for Confidential
> Computing (CC), there are needs to do secure configuration for the vdev,
> e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
> destroy, or the external driver (VFIO) functionality may be impact.
> 
> Building the association between idev & vdev requires the two objects
> pointing each other, but not referencing each other. This requires
> proper locking. This is done by reviving some of Nicolin's patch [1].
> 
> There are 3 cases on idev destroy:
> 
>    1. vdev is already destroyed by userspace. No extra handling needed.
>    2. vdev is still alive. Use iommufd_object_tombstone_user() to
>       destroy vdev and tombstone the vdev ID.
>    3. vdev is being destroyed by userspace. The vdev ID is already freed,
>       but vdev destroy handler is not completed. Destroy the vdev
>       immediately. To solve the racing with userspace destruction, make
>       iommufd_vdevice_abort() reentrant.
> 
> [1]:https://lore.kernel.org/ 
> all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/
> 
> Originally-by: Nicolin Chen<nicolinc@nvidia.com>
> Suggested-by: Jason Gunthorpe<jgg@nvidia.com>
> Co-developed-by: Aneesh Kumar K.V (Arm)<aneesh.kumar@kernel.org>
> Signed-off-by: Aneesh Kumar K.V (Arm)<aneesh.kumar@kernel.org>
> Signed-off-by: Xu Yilun<yilun.xu@linux.intel.com>
> ---
>   drivers/iommu/iommufd/device.c          | 42 +++++++++++++++++++++++
>   drivers/iommu/iommufd/iommufd_private.h | 11 +++++++
>   drivers/iommu/iommufd/main.c            |  1 +
>   drivers/iommu/iommufd/viommu.c          | 44 +++++++++++++++++++++++--
>   4 files changed, 95 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index 86244403b532..0937d4989185 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -137,11 +137,53 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
>   	}
>   }
>   
> +static void iommufd_device_remove_vdev(struct iommufd_device *idev)
> +{
> +	struct iommufd_vdevice *vdev;
> +
> +	mutex_lock(&idev->igroup->lock);
> +	/* vdev has been completely destroyed by userspace */
> +	if (!idev->vdev)
> +		goto out_unlock;
> +
> +	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
> +	if (IS_ERR(vdev)) {
> +		/*
> +		 * vdev is removed from xarray by userspace, but is not
> +		 * destroyed/freed. Since iommufd_vdevice_abort() is reentrant,
> +		 * safe to destroy vdev here.
> +		 */
> +		iommufd_vdevice_abort(&idev->vdev->obj);
> +		goto out_unlock;
> +	}
> +
> +	/* Should never happen */
> +	if (WARN_ON(vdev != idev->vdev)) {
> +		iommufd_put_object(idev->ictx, &vdev->obj);
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * vdev is still alive. Hold a users refcount to prevent racing with
> +	 * userspace destruction, then use iommufd_object_tombstone_user() to
> +	 * destroy it and leave a tombstone.
> +	 */
> +	refcount_inc(&vdev->obj.users);
> +	iommufd_put_object(idev->ictx, &vdev->obj);
> +	mutex_unlock(&idev->igroup->lock);
> +	iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
> +	return;
> +
> +out_unlock:
> +	mutex_unlock(&idev->igroup->lock);
> +}
> +
>   void iommufd_device_destroy(struct iommufd_object *obj)
>   {
>   	struct iommufd_device *idev =
>   		container_of(obj, struct iommufd_device, obj);
>   
> +	iommufd_device_remove_vdev(idev);
>   	iommu_device_release_dma_owner(idev->dev);
>   	iommufd_put_group(idev->igroup);
>   	if (!iommufd_selftest_is_mock_dev(idev->dev))
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index fbc9ef78d81f..f58aa4439c53 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -446,6 +446,7 @@ struct iommufd_device {
>   	/* always the physical device */
>   	struct device *dev;
>   	bool enforce_cache_coherency;
> +	struct iommufd_vdevice *vdev;
>   };
>   
>   static inline struct iommufd_device *
> @@ -621,6 +622,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
>   void iommufd_viommu_destroy(struct iommufd_object *obj);
>   int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
>   void iommufd_vdevice_destroy(struct iommufd_object *obj);
> +void iommufd_vdevice_abort(struct iommufd_object *obj);
>   
>   struct iommufd_vdevice {
>   	struct iommufd_object obj;
> @@ -628,8 +630,17 @@ struct iommufd_vdevice {
>   	struct iommufd_viommu *viommu;
>   	struct device *dev;
>   	u64 id; /* per-vIOMMU virtual ID */
> +	struct iommufd_device *idev;
>   };
>   
> +static inline struct iommufd_vdevice *
> +iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
> +{
> +	return container_of(iommufd_get_object(ictx, id,
> +					       IOMMUFD_OBJ_VDEVICE),
> +			    struct iommufd_vdevice, obj);
> +}
> +
>   #ifdef CONFIG_IOMMUFD_TEST
>   int iommufd_test(struct iommufd_ucmd *ucmd);
>   void iommufd_selftest_destroy(struct iommufd_object *obj);
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 620923669b42..64731b4fdbdf 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -529,6 +529,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
>   	},
>   	[IOMMUFD_OBJ_VDEVICE] = {
>   		.destroy = iommufd_vdevice_destroy,
> +		.abort = iommufd_vdevice_abort,
>   	},
>   	[IOMMUFD_OBJ_VEVENTQ] = {
>   		.destroy = iommufd_veventq_destroy,
> diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
> index 01df2b985f02..632d1d7b8fd8 100644
> --- a/drivers/iommu/iommufd/viommu.c
> +++ b/drivers/iommu/iommufd/viommu.c
> @@ -84,16 +84,38 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
>   	return rc;
>   }
>   
> -void iommufd_vdevice_destroy(struct iommufd_object *obj)
> +void iommufd_vdevice_abort(struct iommufd_object *obj)
>   {
>   	struct iommufd_vdevice *vdev =
>   		container_of(obj, struct iommufd_vdevice, obj);
>   	struct iommufd_viommu *viommu = vdev->viommu;
> +	struct iommufd_device *idev = vdev->idev;
> +
> +	lockdep_assert_held(&idev->igroup->lock);
> +
> +	/*
> +	 * iommufd_vdevice_abort() could be reentrant, by
> +	 * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once.
> +	 */
> +	if (!viommu)
> +		return;
>   
>   	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
>   	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
>   	refcount_dec(&viommu->obj.users);
>   	put_device(vdev->dev);
> +	vdev->viommu = NULL;
> +	idev->vdev = NULL;

I feel it makes more sense to reorder the operations like this:

	vdev->viommu = NULL;
	vdev->idev = NULL;
	idev->vdev = NULL;
	put_device(vdev->dev);

put_device(vdev->dev) could potentially trigger other code paths that
might attempt to reference vdev or its associated pointers. Therefore,
it's safer to null all the relevant reference pointers before calling
put_device().

Others look good to me,

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>

Thanks,
baolu
Re: [PATCH v3 2/5] iommufd: Destroy vdevice on idevice destroy
Posted by Xu Yilun 3 months ago
> > -void iommufd_vdevice_destroy(struct iommufd_object *obj)
> > +void iommufd_vdevice_abort(struct iommufd_object *obj)
> >   {
> >   	struct iommufd_vdevice *vdev =
> >   		container_of(obj, struct iommufd_vdevice, obj);
> >   	struct iommufd_viommu *viommu = vdev->viommu;
> > +	struct iommufd_device *idev = vdev->idev;
> > +
> > +	lockdep_assert_held(&idev->igroup->lock);
> > +
> > +	/*
> > +	 * iommufd_vdevice_abort() could be reentrant, by
> > +	 * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once.
> > +	 */
> > +	if (!viommu)
> > +		return;
> >   	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
> >   	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
> >   	refcount_dec(&viommu->obj.users);
> >   	put_device(vdev->dev);
> > +	vdev->viommu = NULL;
> > +	idev->vdev = NULL;
> 
> I feel it makes more sense to reorder the operations like this:
> 
> 	vdev->viommu = NULL;
> 	vdev->idev = NULL;
> 	idev->vdev = NULL;
> 	put_device(vdev->dev);
> 
> put_device(vdev->dev) could potentially trigger other code paths that
> might attempt to reference vdev or its associated pointers. Therefore,
> it's safer to null all the relevant reference pointers before calling
> put_device().

Yep. due to other changes, now I keep:

        xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
        refcount_dec(&viommu->obj.users);
+       idev->vdev = NULL;
        put_device(vdev->dev);

Anyway, vdev->dev would be completely dropped in next patch, so it
doesn't matter much.

Thanks,
Yilun

> 
> Others look good to me,
> 
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> 
> Thanks,
> baolu