Destroy iommufd_vdevice (vdev) on iommufd_idevice (idev) destroy so
that vdev can't outlive idev.
iommufd_device (idev) represents the physical device bound to iommufd,
while the iommufd_vdevice (vdev) represents the virtual instance of the
physical device in the VM. The lifecycle of the vdev should not be
longer than idev. This doesn't cause a real problem in existing use cases
because vdev doesn't impact the physical device, only provides
virtualization information. But to extend vdev for Confidential
Computing (CC), there are needs to do secure configuration for the vdev,
e.g. TSM Bind/Unbind. These configurations should be rolled back on idev
destroy, or the external driver (VFIO) functionality may be impacted.
Building the association between idev & vdev requires the two objects
pointing to each other, but not referencing each other. This requires
proper locking. This is done by reviving some of Nicolin's patch [1].
There are 3 cases on idev destroy:
1. vdev is already destroyed by userspace. No extra handling needed.
2. vdev is still alive. Use iommufd_object_tombstone_user() to
destroy vdev and tombstone the vdev ID.
3. vdev is being destroyed by userspace. The vdev ID is already freed,
but vdev destroy handler is not completed. Destroy the vdev
immediately. To resolve the race with userspace destruction, make
iommufd_vdevice_abort() reentrant.
[1]: https://lore.kernel.org/all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/
Originally-by: Nicolin Chen <nicolinc@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Co-developed-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
drivers/iommu/iommufd/device.c | 42 +++++++++++++++++++++++
drivers/iommu/iommufd/iommufd_private.h | 11 +++++++
drivers/iommu/iommufd/main.c | 1 +
drivers/iommu/iommufd/viommu.c | 44 +++++++++++++++++++++++--
4 files changed, 95 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 86244403b532..0937d4989185 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -137,11 +137,53 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
}
}
+static void iommufd_device_remove_vdev(struct iommufd_device *idev)
+{
+ struct iommufd_vdevice *vdev;
+
+ mutex_lock(&idev->igroup->lock);
+ /* vdev has been completely destroyed by userspace */
+ if (!idev->vdev)
+ goto out_unlock;
+
+ vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
+ if (IS_ERR(vdev)) {
+ /*
+ * vdev is removed from xarray by userspace, but is not
+ * destroyed/freed. Since iommufd_vdevice_abort() is reentrant,
+ * safe to destroy vdev here.
+ */
+ iommufd_vdevice_abort(&idev->vdev->obj);
+ goto out_unlock;
+ }
+
+ /* Should never happen */
+ if (WARN_ON(vdev != idev->vdev)) {
+ iommufd_put_object(idev->ictx, &vdev->obj);
+ goto out_unlock;
+ }
+
+ /*
+ * vdev is still alive. Hold a users refcount to prevent racing with
+ * userspace destruction, then use iommufd_object_tombstone_user() to
+ * destroy it and leave a tombstone.
+ */
+ refcount_inc(&vdev->obj.users);
+ iommufd_put_object(idev->ictx, &vdev->obj);
+ mutex_unlock(&idev->igroup->lock);
+ iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
+ return;
+
+out_unlock:
+ mutex_unlock(&idev->igroup->lock);
+}
+
void iommufd_device_destroy(struct iommufd_object *obj)
{
struct iommufd_device *idev =
container_of(obj, struct iommufd_device, obj);
+ iommufd_device_remove_vdev(idev);
iommu_device_release_dma_owner(idev->dev);
iommufd_put_group(idev->igroup);
if (!iommufd_selftest_is_mock_dev(idev->dev))
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index fbc9ef78d81f..f58aa4439c53 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -446,6 +446,7 @@ struct iommufd_device {
/* always the physical device */
struct device *dev;
bool enforce_cache_coherency;
+ struct iommufd_vdevice *vdev;
};
static inline struct iommufd_device *
@@ -621,6 +622,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
void iommufd_viommu_destroy(struct iommufd_object *obj);
int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
void iommufd_vdevice_destroy(struct iommufd_object *obj);
+void iommufd_vdevice_abort(struct iommufd_object *obj);
struct iommufd_vdevice {
struct iommufd_object obj;
@@ -628,8 +630,17 @@ struct iommufd_vdevice {
struct iommufd_viommu *viommu;
struct device *dev;
u64 id; /* per-vIOMMU virtual ID */
+ struct iommufd_device *idev;
};
+static inline struct iommufd_vdevice *
+iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
+{
+ return container_of(iommufd_get_object(ictx, id,
+ IOMMUFD_OBJ_VDEVICE),
+ struct iommufd_vdevice, obj);
+}
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 620923669b42..64731b4fdbdf 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -529,6 +529,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
},
[IOMMUFD_OBJ_VDEVICE] = {
.destroy = iommufd_vdevice_destroy,
+ .abort = iommufd_vdevice_abort,
},
[IOMMUFD_OBJ_VEVENTQ] = {
.destroy = iommufd_veventq_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 01df2b985f02..632d1d7b8fd8 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -84,16 +84,38 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
return rc;
}
-void iommufd_vdevice_destroy(struct iommufd_object *obj)
+void iommufd_vdevice_abort(struct iommufd_object *obj)
{
struct iommufd_vdevice *vdev =
container_of(obj, struct iommufd_vdevice, obj);
struct iommufd_viommu *viommu = vdev->viommu;
+ struct iommufd_device *idev = vdev->idev;
+
+ lockdep_assert_held(&idev->igroup->lock);
+
+ /*
+ * iommufd_vdevice_abort() could be reentrant, by
+ * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once.
+ */
+ if (!viommu)
+ return;
/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
refcount_dec(&viommu->obj.users);
put_device(vdev->dev);
+ vdev->viommu = NULL;
+ idev->vdev = NULL;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_vdevice *vdev =
+ container_of(obj, struct iommufd_vdevice, obj);
+
+ mutex_lock(&vdev->idev->igroup->lock);
+ iommufd_vdevice_abort(obj);
+ mutex_unlock(&vdev->idev->igroup->lock);
}
int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -124,10 +146,16 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
goto out_put_idev;
}
+ mutex_lock(&idev->igroup->lock);
+ if (idev->vdev) {
+ rc = -EEXIST;
+ goto out_unlock_igroup;
+ }
+
vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
if (IS_ERR(vdev)) {
rc = PTR_ERR(vdev);
- goto out_put_idev;
+ goto out_unlock_igroup;
}
vdev->id = virt_id;
@@ -135,6 +163,14 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
get_device(idev->dev);
vdev->viommu = viommu;
refcount_inc(&viommu->obj.users);
+ /*
+ * iommufd_device_destroy() waits until idev->vdev is NULL before
+ * freeing the idev, which only happens once the vdev is finished
+ * destruction. Thus we do not need refcounting on either idev->vdev or
+ * vdev->idev.
+ */
+ vdev->idev = idev;
+ idev->vdev = vdev;
curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
if (curr) {
@@ -147,10 +183,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
if (rc)
goto out_abort;
iommufd_object_finalize(ucmd->ictx, &vdev->obj);
- goto out_put_idev;
+ goto out_unlock_igroup;
out_abort:
iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_unlock_igroup:
+ mutex_unlock(&idev->igroup->lock);
out_put_idev:
iommufd_put_object(ucmd->ictx, &idev->obj);
out_put_viommu:
--
2.25.1
On Fri, Jun 27, 2025 at 11:38:06AM +0800, Xu Yilun wrote: > +static void iommufd_device_remove_vdev(struct iommufd_device *idev) > +{ > + struct iommufd_vdevice *vdev; > + > + mutex_lock(&idev->igroup->lock); > + /* vdev has been completely destroyed by userspace */ > + if (!idev->vdev) > + goto out_unlock; > + > + vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); > + if (IS_ERR(vdev)) { > + /* > + * vdev is removed from xarray by userspace, but is not > + * destroyed/freed. Since iommufd_vdevice_abort() is reentrant, > + * safe to destroy vdev here. > + */ > + iommufd_vdevice_abort(&idev->vdev->obj); > + goto out_unlock; This is the case #3, i.e. a racing vdev destory, in the commit log? I think it is worth clarifying that there is a concurrent destroy: /* * An ongoing vdev destroy ioctl has removed the vdev from the * object xarray but has not finished iommufd_vdevice_destroy() * yet, as it is holding the same mutex. Destroy the vdev here, * i.e. the iommufd_vdevice_destroy() will be a NOP once it is * unlocked. */ > @@ -147,10 +183,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) > if (rc) > goto out_abort; > iommufd_object_finalize(ucmd->ictx, &vdev->obj); > - goto out_put_idev; > + goto out_unlock_igroup; > > out_abort: > iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); > +out_unlock_igroup: > + mutex_unlock(&idev->igroup->lock); Looks like we will have to partially revert the _ucmd allocator, in this function: https://lore.kernel.org/all/107b24a3b791091bb09c92ffb0081c56c413b26d.1749882255.git.nicolinc@nvidia.com/ Please try fixing the conflicts on top of Jason's for-next tree. Thanks Nicolin
On Mon, Jun 30, 2025 at 12:34:03PM -0700, Nicolin Chen wrote: > On Fri, Jun 27, 2025 at 11:38:06AM +0800, Xu Yilun wrote: > > +static void iommufd_device_remove_vdev(struct iommufd_device *idev) > > +{ > > + struct iommufd_vdevice *vdev; > > + > > + mutex_lock(&idev->igroup->lock); > > + /* vdev has been completely destroyed by userspace */ > > + if (!idev->vdev) > > + goto out_unlock; > > + > > + vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); > > + if (IS_ERR(vdev)) { > > + /* > > + * vdev is removed from xarray by userspace, but is not > > + * destroyed/freed. Since iommufd_vdevice_abort() is reentrant, > > + * safe to destroy vdev here. > > + */ > > + iommufd_vdevice_abort(&idev->vdev->obj); > > + goto out_unlock; > > This is the case #3, i.e. a racing vdev destory, in the commit log? > > I think it is worth clarifying that there is a concurrent destroy: > /* > * An ongoing vdev destroy ioctl has removed the vdev from the > * object xarray but has not finished iommufd_vdevice_destroy() > * yet, as it is holding the same mutex. Applied this part. > * Destroy the vdev here, > * i.e. the iommufd_vdevice_destroy() will be a NOP once it is > * unlocked. > */ > > > @@ -147,10 +183,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) > > if (rc) > > goto out_abort; > > iommufd_object_finalize(ucmd->ictx, &vdev->obj); > > - goto out_put_idev; > > + goto out_unlock_igroup; > > > > out_abort: > > iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); > > +out_unlock_igroup: > > + mutex_unlock(&idev->igroup->lock); > > Looks like we will have to partially revert the _ucmd allocator, > in this function: > https://lore.kernel.org/all/107b24a3b791091bb09c92ffb0081c56c413b26d.1749882255.git.nicolinc@nvidia.com/ > > Please try fixing the conflicts on top of Jason's for-next tree. Yes, will rebase for next version. Thanks, Yilun > > Thanks > Nicolin
> From: Xu Yilun <yilun.xu@linux.intel.com> > Sent: Friday, June 27, 2025 11:38 AM > > +static void iommufd_device_remove_vdev(struct iommufd_device *idev) > +{ > + struct iommufd_vdevice *vdev; > + > + mutex_lock(&idev->igroup->lock); > + /* vdev has been completely destroyed by userspace */ > + if (!idev->vdev) > + goto out_unlock; > + > + vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); > + if (IS_ERR(vdev)) { > + /* > + * vdev is removed from xarray by userspace, but is not > + * destroyed/freed. Since iommufd_vdevice_abort() is > reentrant, > + * safe to destroy vdev here. > + */ > + iommufd_vdevice_abort(&idev->vdev->obj); > + goto out_unlock; > + } let's add a comment that vdev is still freed in iommufd_destroy() in this situation. > -void iommufd_vdevice_destroy(struct iommufd_object *obj) > +void iommufd_vdevice_abort(struct iommufd_object *obj) > { > struct iommufd_vdevice *vdev = > container_of(obj, struct iommufd_vdevice, obj); > struct iommufd_viommu *viommu = vdev->viommu; > + struct iommufd_device *idev = vdev->idev; > + > + lockdep_assert_held(&idev->igroup->lock); > + > + /* > + * iommufd_vdevice_abort() could be reentrant, by > + * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only > once. > + */ > + if (!viommu) > + return; Just check idev->vdev, to be consistent with the other path.
On Mon, Jun 30, 2025 at 06:27:51AM +0000, Tian, Kevin wrote: > > From: Xu Yilun <yilun.xu@linux.intel.com> > > Sent: Friday, June 27, 2025 11:38 AM > > > > +static void iommufd_device_remove_vdev(struct iommufd_device *idev) > > +{ > > + struct iommufd_vdevice *vdev; > > + > > + mutex_lock(&idev->igroup->lock); > > + /* vdev has been completely destroyed by userspace */ > > + if (!idev->vdev) > > + goto out_unlock; > > + > > + vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); > > + if (IS_ERR(vdev)) { > > + /* > > + * vdev is removed from xarray by userspace, but is not > > + * destroyed/freed. Since iommufd_vdevice_abort() is > > reentrant, > > + * safe to destroy vdev here. > > + */ > > + iommufd_vdevice_abort(&idev->vdev->obj); > > + goto out_unlock; > > + } > > let's add a comment that vdev is still freed in iommufd_destroy() > in this situation. Yes. > > > -void iommufd_vdevice_destroy(struct iommufd_object *obj) > > +void iommufd_vdevice_abort(struct iommufd_object *obj) > > { > > struct iommufd_vdevice *vdev = > > container_of(obj, struct iommufd_vdevice, obj); > > struct iommufd_viommu *viommu = vdev->viommu; > > + struct iommufd_device *idev = vdev->idev; > > + > > + lockdep_assert_held(&idev->igroup->lock); > > + > > + /* > > + * iommufd_vdevice_abort() could be reentrant, by > > + * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only > > once. > > + */ > > + if (!viommu) > > + return; > > Just check idev->vdev, to be consistent with the other path. I think there is problem here. From your comments above, vdev could be aborted/destroyed by iommufd_destroy() again *after* idev is freed. That means in iommufd_vdevice_abort/destroy(), usage of vdev->idev or idev->vdev or vdev->idev->igroup->lock may be invalid. I need to reconsider this, seems we need a dedicated vdev lock to synchronize concurrent vdev abort/destroy. Thanks, Yilun
On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote: > I need to reconsider this, seems we need a dedicated vdev lock to > synchronize concurrent vdev abort/destroy. It is not possible to be concurrent destroy is only called once after it is no longer possible to call abort. Jason
On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote: > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote: > > > I need to reconsider this, seems we need a dedicated vdev lock to > > synchronize concurrent vdev abort/destroy. > > It is not possible to be concurrent > > destroy is only called once after it is no longer possible to call > abort. I'm almost about to drop the "abort twice" idea. [1] [1]: https://lore.kernel.org/linux-iommu/20250625123832.GF167785@nvidia.com/ See from the flow below, T1. iommufd_device_unbind(idev) iommufd_device_destroy(obj) mutex_lock(&idev->igroup->lock) iommufd_vdevice_abort(idev->vdev.obj) mutex_unlock(&idev->igroup->lock) kfree(obj) T2. iommufd_destroy(vdev_id) iommufd_vdevice_destroy(obj) mutex_lock(&vdev->idev->igroup->lock) iommufd_vdevice_abort(obj); mutex_unlock(&vdev->idev->igroup->lock) kfree(obj) iommufd_vdevice_destroy() will access idev->igroup->lock, but it is possible the idev is already freed at that time: iommufd_destroy(vdev_id) iommufd_vdevice_destroy(obj) iommufd_device_unbind(idev) iommufd_device_destroy(obj) mutex_lock(&idev->igroup->lock) mutex_lock(&vdev->idev->igroup->lock) (wait) iommufd_vdevice_abort(idev->vdev.obj) mutex_unlock(&idev->igroup->lock) kfree(obj) mutex_lock(&vdev->idev->igroup->lock) (PANIC) iommufd_vdevice_abort(obj) ... We also can't introduce some vdev side lock instead of idev side lock, vdev could also be freed right after iommufd_vdevice_destroy(). I think the only simple way is to let idev destruction wait for vdev destruction, that go back to the wait_event idea ... Thanks, Yilun
On Tue, Jul 01, 2025 at 05:19:05PM +0800, Xu Yilun wrote: > On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote: > > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote: > > > > > I need to reconsider this, seems we need a dedicated vdev lock to > > > synchronize concurrent vdev abort/destroy. > > > > It is not possible to be concurrent > > > > destroy is only called once after it is no longer possible to call > > abort. > > I'm almost about to drop the "abort twice" idea. [1] > > [1]: https://lore.kernel.org/linux-iommu/20250625123832.GF167785@nvidia.com/ > > See from the flow below, > > T1. iommufd_device_unbind(idev) > iommufd_device_destroy(obj) > mutex_lock(&idev->igroup->lock) > iommufd_vdevice_abort(idev->vdev.obj) > mutex_unlock(&idev->igroup->lock) > kfree(obj) > > T2. iommufd_destroy(vdev_id) > iommufd_vdevice_destroy(obj) > mutex_lock(&vdev->idev->igroup->lock) > iommufd_vdevice_abort(obj); > mutex_unlock(&vdev->idev->igroup->lock) > kfree(obj) > > iommufd_vdevice_destroy() will access idev->igroup->lock, but it is > possible the idev is already freed at that time: > > iommufd_destroy(vdev_id) > iommufd_vdevice_destroy(obj) > iommufd_device_unbind(idev) > iommufd_device_destroy(obj) > mutex_lock(&idev->igroup->lock) > mutex_lock(&vdev->idev->igroup->lock) (wait) > iommufd_vdevice_abort(idev->vdev.obj) > mutex_unlock(&idev->igroup->lock) > kfree(obj) > mutex_lock(&vdev->idev->igroup->lock) (PANIC) > iommufd_vdevice_abort(obj) > ... Yes, you can't touch idev inside the destroy function at all, under any version. idev is only valid if you have a refcount on vdev. But why are you touching this lock? Arrange things so abort doesn't touch the idev?? Jason
On Tue, Jul 01, 2025 at 09:13:15AM -0300, Jason Gunthorpe wrote: > On Tue, Jul 01, 2025 at 05:19:05PM +0800, Xu Yilun wrote: > > On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote: > > > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote: > > > > > > > I need to reconsider this, seems we need a dedicated vdev lock to > > > > synchronize concurrent vdev abort/destroy. > > > > > > It is not possible to be concurrent > > > > > > destroy is only called once after it is no longer possible to call > > > abort. > > > > I'm almost about to drop the "abort twice" idea. [1] > > > > [1]: https://lore.kernel.org/linux-iommu/20250625123832.GF167785@nvidia.com/ > > > > See from the flow below, > > > > T1. iommufd_device_unbind(idev) > > iommufd_device_destroy(obj) > > mutex_lock(&idev->igroup->lock) > > iommufd_vdevice_abort(idev->vdev.obj) > > mutex_unlock(&idev->igroup->lock) > > kfree(obj) > > > > T2. iommufd_destroy(vdev_id) > > iommufd_vdevice_destroy(obj) > > mutex_lock(&vdev->idev->igroup->lock) > > iommufd_vdevice_abort(obj); > > mutex_unlock(&vdev->idev->igroup->lock) > > kfree(obj) > > > > iommufd_vdevice_destroy() will access idev->igroup->lock, but it is > > possible the idev is already freed at that time: > > > > iommufd_destroy(vdev_id) > > iommufd_vdevice_destroy(obj) > > iommufd_device_unbind(idev) > > iommufd_device_destroy(obj) > > mutex_lock(&idev->igroup->lock) > > mutex_lock(&vdev->idev->igroup->lock) (wait) > > iommufd_vdevice_abort(idev->vdev.obj) > > mutex_unlock(&idev->igroup->lock) > > kfree(obj) > > mutex_lock(&vdev->idev->igroup->lock) (PANIC) > > iommufd_vdevice_abort(obj) > > ... > > Yes, you can't touch idev inside the destroy function at all, under > any version. idev is only valid if you have a refcount on vdev. > > But why are you touching this lock? Arrange things so abort doesn't > touch the idev?? idev has a pointer idev->vdev to track the vdev's lifecycle. idev->igroup->lock protects the pointer. 
At the end of iommufd_vdevice_destroy() this pointer should be NULLed so that idev knows vdev is really destroyed. I haven't found a safer way for vdev to sync up its validness with idev w/o touching idev. I was thinking of using vdev->idev and some vdev lock for tracking instead. Then iommufd_vdevice_abort() doesn't touch idev. But it is still the same, just switch to put idev in risk: iommufd_destroy(vdev_id) iommufd_vdevice_destroy(obj) iommufd_device_unbind(idev) iommufd_device_destroy(obj) mutex_lock(&vdev->some_lock) mutex_lock(&idev->vdev->some_lock) (wait) iommufd_vdevice_abort(obj) mutex_unlock(&vdev->some_lock) kfree(obj) mutex_lock(&idev->vdev->some_lock) (PANIC) iommufd_vdevice_abort(idev->vdev.obj) ... Thanks, Yilun
> From: Xu Yilun <yilun.xu@linux.intel.com> > Sent: Wednesday, July 2, 2025 10:24 AM > > On Tue, Jul 01, 2025 at 09:13:15AM -0300, Jason Gunthorpe wrote: > > On Tue, Jul 01, 2025 at 05:19:05PM +0800, Xu Yilun wrote: > > > On Mon, Jun 30, 2025 at 11:50:51AM -0300, Jason Gunthorpe wrote: > > > > On Mon, Jun 30, 2025 at 06:18:50PM +0800, Xu Yilun wrote: > > > > > > > > > I need to reconsider this, seems we need a dedicated vdev lock to > > > > > synchronize concurrent vdev abort/destroy. > > > > > > > > It is not possible to be concurrent > > > > > > > > destroy is only called once after it is no longer possible to call > > > > abort. > > > > > > I'm almost about to drop the "abort twice" idea. [1] > > > > > > [1]: https://lore.kernel.org/linux- > iommu/20250625123832.GF167785@nvidia.com/ > > > > > > See from the flow below, > > > > > > T1. iommufd_device_unbind(idev) > > > iommufd_device_destroy(obj) > > > mutex_lock(&idev->igroup->lock) > > > iommufd_vdevice_abort(idev->vdev.obj) > > > mutex_unlock(&idev->igroup->lock) > > > kfree(obj) > > > > > > T2. iommufd_destroy(vdev_id) > > > iommufd_vdevice_destroy(obj) > > > mutex_lock(&vdev->idev->igroup->lock) > > > iommufd_vdevice_abort(obj); > > > mutex_unlock(&vdev->idev->igroup->lock) > > > kfree(obj) > > > > > > iommufd_vdevice_destroy() will access idev->igroup->lock, but it is > > > possible the idev is already freed at that time: > > > > > > iommufd_destroy(vdev_id) > > > iommufd_vdevice_destroy(obj) > > > iommufd_device_unbind(idev) > > > iommufd_device_destroy(obj) > > > mutex_lock(&idev->igroup->lock) > > > mutex_lock(&vdev->idev->igroup->lock) (wait) > > > iommufd_vdevice_abort(idev->vdev.obj) > > > mutex_unlock(&idev->igroup->lock) > > > kfree(obj) > > > mutex_lock(&vdev->idev->igroup->lock) (PANIC) > > > iommufd_vdevice_abort(obj) > > > ... > > > > Yes, you can't touch idev inside the destroy function at all, under > > any version. idev is only valid if you have a refcount on vdev. 
> > > > But why are you touching this lock? Arrange things so abort doesn't > > touch the idev?? > > idev has a pointer idev->vdev to track the vdev's lifecycle. > idev->igroup->lock protects the pointer. At the end of > iommufd_vdevice_destroy() this pointer should be NULLed so that idev > knows vdev is really destroyed. > > I haven't found a safer way for vdev to sync up its validness with idev > w/o touching idev. > > I was thinking of using vdev->idev and some vdev lock for tracking > instead. Then iommufd_vdevice_abort() doesn't touch idev. But it is > still the same, just switch to put idev in risk: > > > iommufd_destroy(vdev_id) > iommufd_vdevice_destroy(obj) > iommufd_device_unbind(idev) > iommufd_device_destroy(obj) > mutex_lock(&vdev->some_lock) > mutex_lock(&idev->vdev->some_lock) (wait) panic could happen here, between acquiring idev->vdev and mutex(vdev->some_lock) as vdev might be destroyed/freed in-between. > iommufd_vdevice_abort(obj) > mutex_unlock(&vdev->some_lock) > kfree(obj) > mutex_lock(&idev->vdev->some_lock) (PANIC) > iommufd_vdevice_abort(idev->vdev.obj) > ... > I cannot find a safe way either, except using certain global lock. but comparing to that I'd prefer to the original wait approach...
On Wed, Jul 02, 2025 at 09:13:50AM +0000, Tian, Kevin wrote: > > > Yes, you can't touch idev inside the destroy function at all, under > > > any version. idev is only valid if you have a refcount on vdev. > > > > > > But why are you touching this lock? Arrange things so abort doesn't > > > touch the idev?? > > > > idev has a pointer idev->vdev to track the vdev's lifecycle. > > idev->igroup->lock protects the pointer. At the end of > > iommufd_vdevice_destroy() this pointer should be NULLed so that idev > > knows vdev is really destroyed. Well, that is destroy, not abort, but OK, there is an issue with destroy. > but comparing to that I'd prefer to the original wait approach... Okay, but lets try to keep the wait hidden inside the refcounting.. The issue here is we don't hold a refcount on idev while working with idev. Let's fix that and then things should work properly? Maybe something like this: diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 4e781aa9fc6329..9174fa7c972b80 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -178,12 +178,20 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev) mutex_unlock(&idev->igroup->lock); } +void iommufd_device_pre_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + /* Release the short term users on this */ + iommufd_device_remove_vdev(idev); +} + void iommufd_device_destroy(struct iommufd_object *obj) { struct iommufd_device *idev = container_of(obj, struct iommufd_device, obj); - iommufd_device_remove_vdev(idev); iommu_device_release_dma_owner(idev->dev); iommufd_put_group(idev->igroup); if (!iommufd_selftest_is_mock_dev(idev->dev)) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index b2e8e106c16158..387c630fdabfbd 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -151,6 +151,9 @@ static int 
iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, if (refcount_dec_and_test(&to_destroy->shortterm_users)) return 0; + if (iommufd_object_ops[to_destroy->type].pre_destroy) + iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy); + if (wait_event_timeout(ictx->destroy_wait, refcount_read(&to_destroy->shortterm_users) == 0, msecs_to_jiffies(60000))) @@ -567,6 +570,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_access_destroy_object, }, [IOMMUFD_OBJ_DEVICE] = { + .pre_destroy = iommufd_device_pre_destroy, .destroy = iommufd_device_destroy, }, [IOMMUFD_OBJ_FAULT] = { diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 9451a311745f7b..cbf99daa7dc25d 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -135,6 +135,7 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj) mutex_lock(&vdev->idev->igroup->lock); iommufd_vdevice_abort(obj); mutex_unlock(&vdev->idev->igroup->lock); + iommufd_put_object(vdev->viommu->ictx, &vdev->idev->obj); } int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) @@ -180,13 +181,19 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) vdev->id = virt_id; vdev->viommu = viommu; refcount_inc(&viommu->obj.users); + + /* + * A reference is held on the idev so long as we have a pointer. + * iommufd_device_pre_destroy() will revoke it before the real + * destruction. + */ + vdev->idev = idev; + /* * iommufd_device_destroy() waits until idev->vdev is NULL before * freeing the idev, which only happens once the vdev is finished - * destruction. Thus we do not need refcounting on either idev->vdev or - * vdev->idev. + * destruction. 
*/ - vdev->idev = idev; idev->vdev = vdev; curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); @@ -207,7 +214,8 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) out_unlock_igroup: mutex_unlock(&idev->igroup->lock); out_put_idev: - iommufd_put_object(ucmd->ictx, &idev->obj); + if (rc) + iommufd_put_object(ucmd->ictx, &idev->obj); out_put_viommu: iommufd_put_object(ucmd->ictx, &viommu->obj); return rc;
On Wed, Jul 02, 2025 at 09:40:42AM -0300, Jason Gunthorpe wrote: > On Wed, Jul 02, 2025 at 09:13:50AM +0000, Tian, Kevin wrote: > > > > Yes, you can't touch idev inside the destroy function at all, under > > > > any version. idev is only valid if you have a refcount on vdev. > > > > > > > > But why are you touching this lock? Arrange things so abort doesn't > > > > touch the idev?? > > > > > > idev has a pointer idev->vdev to track the vdev's lifecycle. > > > idev->igroup->lock protects the pointer. At the end of > > > iommufd_vdevice_destroy() this pointer should be NULLed so that idev > > > knows vdev is really destroyed. > > Well, that is destroy, not abort, but OK, there is an issue with > destroy. > > > but comparing to that I'd prefer to the original wait approach... > > Okay, but lets try to keep the wait hidden inside the refcounting.. > > The issue here is we don't hold a refcount on idev while working with > idev. Let's fix that and then things should work properly? > > Maybe something like this: > > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c > index 4e781aa9fc6329..9174fa7c972b80 100644 > --- a/drivers/iommu/iommufd/device.c > +++ b/drivers/iommu/iommufd/device.c > @@ -178,12 +178,20 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev) > mutex_unlock(&idev->igroup->lock); > } > > +void iommufd_device_pre_destroy(struct iommufd_object *obj) > +{ > + struct iommufd_device *idev = > + container_of(obj, struct iommufd_device, obj); > + > + /* Release the short term users on this */ > + iommufd_device_remove_vdev(idev); > +} > + > void iommufd_device_destroy(struct iommufd_object *obj) > { > struct iommufd_device *idev = > container_of(obj, struct iommufd_device, obj); > > - iommufd_device_remove_vdev(idev); > iommu_device_release_dma_owner(idev->dev); > iommufd_put_group(idev->igroup); > if (!iommufd_selftest_is_mock_dev(idev->dev)) > diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c 
> index b2e8e106c16158..387c630fdabfbd 100644 > --- a/drivers/iommu/iommufd/main.c > +++ b/drivers/iommu/iommufd/main.c > @@ -151,6 +151,9 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, > if (refcount_dec_and_test(&to_destroy->shortterm_users)) > return 0; > > + if (iommufd_object_ops[to_destroy->type].pre_destroy) > + iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy); > + > if (wait_event_timeout(ictx->destroy_wait, > refcount_read(&to_destroy->shortterm_users) ==i 0, I still see concerns here. The pre_destroy() and wait_event is before the idev's users count decreasing to 0, which means a new user could get in. The worse thing is the new user could hold the shortterm_users until pre_destroy(), but there isn't a second pre_destroy()... I think extending the life of shortterm_users at runtime makes things complex. My idea is keep the original definition of shortterm_users at runtime, only repurpose it as a wait completion after users refcount == 0. I.e. idev increase its own shortterm_users, waiting for vdev to decrease it. diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9876bf70980a..49b787bc0688 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -147,8 +147,15 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev) goto out_unlock; vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); - if (IS_ERR(vdev)) + if (IS_ERR(vdev)) { + /* + * vdev is removed from xarray by userspace, but is not + * destroyed. We should wait for vdev real destruction before + * idev destruction. 
+ */ + iommufd_object_destroy_wait_prepare(&idev->obj); goto out_unlock; + } /* Should never happen */ if (WARN_ON(vdev != idev->vdev)) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 08f3c66366ea..6620006df8d1 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -180,6 +180,47 @@ static inline void iommufd_put_object(struct iommufd_ctx *ictx, wake_up_interruptible_all(&ictx->destroy_wait); } +/* + * The iommufd_object_destroy_wait_xx() are used to synchronize the last minute + * destructions of interrelated objects. E.g. the vdev's destroy() should + * always be called before idev's destroy(). + * + * iommufd_object_destroy_wait_prepare() should only be called by + * later-destroyer in its pre_destroy() ops, where the users & shortterm_users + * refcounts are all zeroed. I.e. no new user is possible. Use "users == 0 && + * shortterm_users == 1" as the "destroy wait" state, as this combination + * is not used in runtime usecase and it also prevents new users. + * + * The later-destroyer should use their own locking to ensure the + * first-destroyer still exists when calling this function. + */ +static inline void +iommufd_object_destroy_wait_prepare(struct iommufd_object *to_destroy) +{ + if (WARN_ON(refcount_read(&to_destroy->users) != 0 || + refcount_read(&to_destroy->shortterm_users) != 0)) + return; + + refcount_set(&to_destroy->shortterm_users, 1); +} + +/* + * iommufd_object_destroy_wait_complete() should be called by first-destroyer + * in its destroy() ops. The first-destroyer should not access the + * later-destroyer after calling this function. 
+ */ +static inline void +iommufd_object_destroy_wait_complete(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy) +{ + if (refcount_read(&to_destroy->users) != 0 || + refcount_read(&to_destroy->shortterm_users) != 1) + return; + + refcount_dec(&to_destroy->shortterm_users); + wake_up_interruptible_all(&ictx->destroy_wait); +} + void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index fbccfbd8fb03..148218293f55 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -96,15 +96,37 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, return obj; } +static int iommufd_object_pre_destroy_wait(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy) +{ + if (!iommufd_object_ops[to_destroy->type].pre_destroy) + return 0; + + /* + * pre_destroy() may put the object in destroy wait state (users == 0 + * && shortterm_users == 1). Another thread completes the waiting by + * decreasing the shortterm_users back to 0. The helpers + * iommufd_object_destroy_wait_prepare() and + * iommufd_object_destroy_wait_complete() should be used for this + * purpose. 
+ */ + iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy); + if (wait_event_timeout(ictx->destroy_wait, + refcount_read(&to_destroy->shortterm_users) == + 0, + msecs_to_jiffies(60000))) + return 0; + + pr_crit("Time out waiting for iommufd object to become free\n"); + return -EBUSY; +} + static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, struct iommufd_object *to_destroy) { if (refcount_dec_and_test(&to_destroy->shortterm_users)) return 0; - if (iommufd_object_ops[to_destroy->type].pre_destroy) - iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy); - if (wait_event_timeout(ictx->destroy_wait, refcount_read(&to_destroy->shortterm_users) == 0, @@ -184,6 +206,10 @@ int iommufd_object_remove(struct iommufd_ctx *ictx, ret = iommufd_object_dec_wait_shortterm(ictx, obj); if (WARN_ON(ret)) return ret; + } else { + ret = iommufd_object_pre_destroy_wait(ictx, obj); + if (WARN_ON(ret)) + return ret; } iommufd_object_ops[obj->type].destroy(obj); diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index d49e8f0c8bb3..a7feefd5e472 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -103,11 +103,12 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj) { struct iommufd_vdevice *vdev = container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_device *idev = vdev->idev; - mutex_lock(&vdev->idev->igroup->lock); + mutex_lock(&idev->igroup->lock); iommufd_vdevice_abort(obj); - mutex_unlock(&vdev->idev->igroup->lock); - iommufd_put_object(vdev->viommu->ictx, &vdev->idev->obj); + mutex_unlock(&idev->igroup->lock); + iommufd_object_destroy_wait_complete(idev->ictx, &idev->obj); } int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) @@ -185,8 +186,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) out_unlock_igroup: mutex_unlock(&idev->igroup->lock); out_put_idev: - if (rc) - iommufd_put_object(ucmd->ictx, &idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); 
out_put_viommu: iommufd_put_object(ucmd->ictx, &viommu->obj); return rc;
On Mon, Jul 07, 2025 at 06:58:45PM +0800, Xu Yilun wrote: > I still see concerns here. The pre_destroy() and wait_event is before the > idev's users count decreasing to 0, which means a new user could get > in. Hum, yes it seems we could get a race with another vdevice being created during idevice destruction, but I think this could be fixed in a straightforward way with a flag 'being destroyed' in the idev under the lock if it is the only issue > The worse thing is the new user could hold the shortterm_users until > pre_destroy(), but there isn't a second pre_destroy()... This is OK, the point of pre_destroy is to remove the reference cycle. So it needs to prevent new vdevs from establishing references and remove the reference from any existing vdev. refcounts outside of this cycle are handled by the existing mechanisms - pre_destroy is only about the refcount loop. > I think extending the life of shortterm_users at runtime makes things > complex. Well, it keeps the refcounting working the way it is supposed to. Holding a pointer without a reference is nasty and is creating a lot of these weird problems and strange flows. Hold the refcount along with the pointer and things follow the normal patterns. There is nothing inherently wrong with shortterm users for this besides its name. Think of it as 'wait for users', and we clearly want to wait for the vdev user to put back its idev reference. I think there are too many gyrations with refcounts in this version below. It is hard to reason about if zero refcounts are re-elevated to non-zero. > vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); > - if (IS_ERR(vdev)) > + if (IS_ERR(vdev)) { > + /* > + * vdev is removed from xarray by userspace, but is not > + * destroyed. We should wait for vdev real destruction before > + * idev destruction. 
> + */ > + iommufd_object_destroy_wait_prepare(&idev->obj); > goto out_unlock; This is now really weird, we are mangling the refcounts and the reasoning why it is safe with the concurrent vdev thread is tricky.. > +/* > + * iommufd_object_destroy_wait_complete() should be called by first-destroyer > + * in its destroy() ops. The first-destroyer should not access the > + * later-destroyer after calling this function. > + */ > +static inline void > +iommufd_object_destroy_wait_complete(struct iommufd_ctx *ictx, > + struct iommufd_object *to_destroy) > +{ > + if (refcount_read(&to_destroy->users) != 0 || > + refcount_read(&to_destroy->shortterm_users) != 1) > + return; > + > + refcount_dec(&to_destroy->shortterm_users); > + wake_up_interruptible_all(&ictx->destroy_wait); > +} This looks weird too, why is refcount_read safe? That is usually not how this kind of logic is written. I'm not sure this is OK, it is not an obvious known-good pattern for lifetime management. Jason
On Mon, Jul 07, 2025 at 09:25:02AM -0300, Jason Gunthorpe wrote: > On Mon, Jul 07, 2025 at 06:58:45PM +0800, Xu Yilun wrote: > > > I still see concerns here. The pre_destroy() and wait_event is before the > > idev's users count decreasing to 0, which means a new user could get > > in. > > Hum, yes it seems we could get a race with another vdevice being > created during idevice destruction, but I think this could be fixed in > a straightforward way with a flag 'being destroyed' in the idev under > the lock if it is the only issue > > > The worse thing is the new user could hold the shortterm_users until > > pre_destroy(), but there isn't a second pre_destroy()... > > This is OK, the point of pre_destroy is to remove the reference > cycle. So it needs to prevent new vdevs from establishing references > and remove the reference from any existing vdev. > > refcounts outside of this cycle are handled by the existing > mechanisms - pre_destroy is only about the refcount loop. That works for me. I didn't prefer a 'being destroyed' flag because it doesn't seem to align with the current refcounting mechanism. But I see it makes sense because idev does deterministic destruction. The only concern is that idev cannot actually prevent new references by itself; it needs vdev to honor the flag. But let's just keep it simple and not over-engineer. Thanks, Yilun > > > I think extending the life of shortterm_users at runtime makes things > > complex. > > Well, it keeps the refcounting working the way it is supposed > to. Holding a pointer without a reference is nasty and is creating > alot of these weird problems and strange flows. Hold the refcount > along with the pointer and things follow the normal patterns. > > There is nothing inherently wrong with shortterm users for this > besides its name. Think of it is as 'wait for users', and we clearly > want to wait for the vdev user to put back its idev reference. >
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Wednesday, July 2, 2025 8:41 PM > > On Wed, Jul 02, 2025 at 09:13:50AM +0000, Tian, Kevin wrote: > > > > Yes, you can't touch idev inside the destroy function at all, under > > > > any version. idev is only valid if you have a refcount on vdev. > > > > > > > > But why are you touching this lock? Arrange things so abort doesn't > > > > touch the idev?? > > > > > > idev has a pointer idev->vdev to track the vdev's lifecycle. > > > idev->igroup->lock protects the pointer. At the end of > > > iommufd_vdevice_destroy() this pointer should be NULLed so that idev > > > knows vdev is really destroyed. > > Well, that is destroy, not abort, but OK, there is an issue with > destroy. > > > but comparing to that I'd prefer to the original wait approach... > > Okay, but lets try to keep the wait hidden inside the refcounting.. > > The issue here is we don't hold a refcount on idev while working with > idev. Let's fix that and then things should work properly? 
> > Maybe something like this: > > diff --git a/drivers/iommu/iommufd/device.c > b/drivers/iommu/iommufd/device.c > index 4e781aa9fc6329..9174fa7c972b80 100644 > --- a/drivers/iommu/iommufd/device.c > +++ b/drivers/iommu/iommufd/device.c > @@ -178,12 +178,20 @@ static void iommufd_device_remove_vdev(struct > iommufd_device *idev) > mutex_unlock(&idev->igroup->lock); > } > > +void iommufd_device_pre_destroy(struct iommufd_object *obj) > +{ > + struct iommufd_device *idev = > + container_of(obj, struct iommufd_device, obj); > + > + /* Release the short term users on this */ > + iommufd_device_remove_vdev(idev); > +} > + > void iommufd_device_destroy(struct iommufd_object *obj) > { > struct iommufd_device *idev = > container_of(obj, struct iommufd_device, obj); > > - iommufd_device_remove_vdev(idev); > iommu_device_release_dma_owner(idev->dev); > iommufd_put_group(idev->igroup); > if (!iommufd_selftest_is_mock_dev(idev->dev)) > diff --git a/drivers/iommu/iommufd/main.c > b/drivers/iommu/iommufd/main.c > index b2e8e106c16158..387c630fdabfbd 100644 > --- a/drivers/iommu/iommufd/main.c > +++ b/drivers/iommu/iommufd/main.c > @@ -151,6 +151,9 @@ static int > iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, > if (refcount_dec_and_test(&to_destroy->shortterm_users)) > return 0; > > + if (iommufd_object_ops[to_destroy->type].pre_destroy) > + iommufd_object_ops[to_destroy- > >type].pre_destroy(to_destroy); > + > if (wait_event_timeout(ictx->destroy_wait, > refcount_read(&to_destroy->shortterm_users) > == 0, > msecs_to_jiffies(60000))) > @@ -567,6 +570,7 @@ static const struct iommufd_object_ops > iommufd_object_ops[] = { > .destroy = iommufd_access_destroy_object, > }, > [IOMMUFD_OBJ_DEVICE] = { > + .pre_destroy = iommufd_device_pre_destroy, > .destroy = iommufd_device_destroy, > }, > [IOMMUFD_OBJ_FAULT] = { > diff --git a/drivers/iommu/iommufd/viommu.c > b/drivers/iommu/iommufd/viommu.c > index 9451a311745f7b..cbf99daa7dc25d 100644 > --- 
a/drivers/iommu/iommufd/viommu.c > +++ b/drivers/iommu/iommufd/viommu.c > @@ -135,6 +135,7 @@ void iommufd_vdevice_destroy(struct > iommufd_object *obj) > mutex_lock(&vdev->idev->igroup->lock); > iommufd_vdevice_abort(obj); > mutex_unlock(&vdev->idev->igroup->lock); > + iommufd_put_object(vdev->viommu->ictx, &vdev->idev->obj); > } > > int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) > @@ -180,13 +181,19 @@ int iommufd_vdevice_alloc_ioctl(struct > iommufd_ucmd *ucmd) > vdev->id = virt_id; > vdev->viommu = viommu; > refcount_inc(&viommu->obj.users); > + > + /* > + * A reference is held on the idev so long as we have a pointer. > + * iommufd_device_pre_destroy() will revoke it before the real > + * destruction. > + */ > + vdev->idev = idev; > + > /* > * iommufd_device_destroy() waits until idev->vdev is NULL before > * freeing the idev, which only happens once the vdev is finished > - * destruction. Thus we do not need refcounting on either idev->vdev > or > - * vdev->idev. > + * destruction. > */ > - vdev->idev = idev; > idev->vdev = vdev; > > curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, > GFP_KERNEL); > @@ -207,7 +214,8 @@ int iommufd_vdevice_alloc_ioctl(struct > iommufd_ucmd *ucmd) > out_unlock_igroup: > mutex_unlock(&idev->igroup->lock); > out_put_idev: > - iommufd_put_object(ucmd->ictx, &idev->obj); > + if (rc) > + iommufd_put_object(ucmd->ictx, &idev->obj); but this sounds like a misuse of shortterm_users which is not intended to be held long beyond ioctl...
On Thu, Jul 03, 2025 at 04:32:26AM +0000, Tian, Kevin wrote: > but this sounds like a misuse of shortterm_users which is not intended > to be held long beyond ioctl... With the addition of the pre_destroy it purpose gets changed to 'beyond the ioctl or past pre_destroy' Jason
On 6/27/25 11:38, Xu Yilun wrote: > Destroy iommufd_vdevice (vdev) on iommufd_idevice (idev) destroy so > that vdev can't outlive idev. > > iommufd_device (idev) represents the physical device bound to iommufd, > while the iommufd_vdevice (vdev) represents the virtual instance of the > physical device in the VM. The lifecycle of the vdev should not be > longer than idev. This doesn't cause real problem on existing use cases > cause vdev doesn't impact the physical device, only provides > virtualization information. But to extend vdev for Confidential > Computing (CC), there are needs to do secure configuration for the vdev, > e.g. TSM Bind/Unbind. These configurations should be rolled back on idev > destroy, or the external driver (VFIO) functionality may be impact. > > Building the association between idev & vdev requires the two objects > pointing each other, but not referencing each other. This requires > proper locking. This is done by reviving some of Nicolin's patch [1]. > > There are 3 cases on idev destroy: > > 1. vdev is already destroyed by userspace. No extra handling needed. > 2. vdev is still alive. Use iommufd_object_tombstone_user() to > destroy vdev and tombstone the vdev ID. > 3. vdev is being destroyed by userspace. The vdev ID is already freed, > but vdev destroy handler is not completed. Destroy the vdev > immediately. To solve the racing with userspace destruction, make > iommufd_vdevice_abort() reentrant. 
> > [1]:https://lore.kernel.org/ > all/53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com/ > > Originally-by: Nicolin Chen<nicolinc@nvidia.com> > Suggested-by: Jason Gunthorpe<jgg@nvidia.com> > Co-developed-by: Aneesh Kumar K.V (Arm)<aneesh.kumar@kernel.org> > Signed-off-by: Aneesh Kumar K.V (Arm)<aneesh.kumar@kernel.org> > Signed-off-by: Xu Yilun<yilun.xu@linux.intel.com> > --- > drivers/iommu/iommufd/device.c | 42 +++++++++++++++++++++++ > drivers/iommu/iommufd/iommufd_private.h | 11 +++++++ > drivers/iommu/iommufd/main.c | 1 + > drivers/iommu/iommufd/viommu.c | 44 +++++++++++++++++++++++-- > 4 files changed, 95 insertions(+), 3 deletions(-) > > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c > index 86244403b532..0937d4989185 100644 > --- a/drivers/iommu/iommufd/device.c > +++ b/drivers/iommu/iommufd/device.c > @@ -137,11 +137,53 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, > } > } > > +static void iommufd_device_remove_vdev(struct iommufd_device *idev) > +{ > + struct iommufd_vdevice *vdev; > + > + mutex_lock(&idev->igroup->lock); > + /* vdev has been completely destroyed by userspace */ > + if (!idev->vdev) > + goto out_unlock; > + > + vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); > + if (IS_ERR(vdev)) { > + /* > + * vdev is removed from xarray by userspace, but is not > + * destroyed/freed. Since iommufd_vdevice_abort() is reentrant, > + * safe to destroy vdev here. > + */ > + iommufd_vdevice_abort(&idev->vdev->obj); > + goto out_unlock; > + } > + > + /* Should never happen */ > + if (WARN_ON(vdev != idev->vdev)) { > + iommufd_put_object(idev->ictx, &vdev->obj); > + goto out_unlock; > + } > + > + /* > + * vdev is still alive. Hold a users refcount to prevent racing with > + * userspace destruction, then use iommufd_object_tombstone_user() to > + * destroy it and leave a tombstone. 
> + */ > + refcount_inc(&vdev->obj.users); > + iommufd_put_object(idev->ictx, &vdev->obj); > + mutex_unlock(&idev->igroup->lock); > + iommufd_object_tombstone_user(idev->ictx, &vdev->obj); > + return; > + > +out_unlock: > + mutex_unlock(&idev->igroup->lock); > +} > + > void iommufd_device_destroy(struct iommufd_object *obj) > { > struct iommufd_device *idev = > container_of(obj, struct iommufd_device, obj); > > + iommufd_device_remove_vdev(idev); > iommu_device_release_dma_owner(idev->dev); > iommufd_put_group(idev->igroup); > if (!iommufd_selftest_is_mock_dev(idev->dev)) > diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h > index fbc9ef78d81f..f58aa4439c53 100644 > --- a/drivers/iommu/iommufd/iommufd_private.h > +++ b/drivers/iommu/iommufd/iommufd_private.h > @@ -446,6 +446,7 @@ struct iommufd_device { > /* always the physical device */ > struct device *dev; > bool enforce_cache_coherency; > + struct iommufd_vdevice *vdev; > }; > > static inline struct iommufd_device * > @@ -621,6 +622,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); > void iommufd_viommu_destroy(struct iommufd_object *obj); > int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); > void iommufd_vdevice_destroy(struct iommufd_object *obj); > +void iommufd_vdevice_abort(struct iommufd_object *obj); > > struct iommufd_vdevice { > struct iommufd_object obj; > @@ -628,8 +630,17 @@ struct iommufd_vdevice { > struct iommufd_viommu *viommu; > struct device *dev; > u64 id; /* per-vIOMMU virtual ID */ > + struct iommufd_device *idev; > }; > > +static inline struct iommufd_vdevice * > +iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id) > +{ > + return container_of(iommufd_get_object(ictx, id, > + IOMMUFD_OBJ_VDEVICE), > + struct iommufd_vdevice, obj); > +} > + > #ifdef CONFIG_IOMMUFD_TEST > int iommufd_test(struct iommufd_ucmd *ucmd); > void iommufd_selftest_destroy(struct iommufd_object *obj); > diff --git a/drivers/iommu/iommufd/main.c 
b/drivers/iommu/iommufd/main.c > index 620923669b42..64731b4fdbdf 100644 > --- a/drivers/iommu/iommufd/main.c > +++ b/drivers/iommu/iommufd/main.c > @@ -529,6 +529,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { > }, > [IOMMUFD_OBJ_VDEVICE] = { > .destroy = iommufd_vdevice_destroy, > + .abort = iommufd_vdevice_abort, > }, > [IOMMUFD_OBJ_VEVENTQ] = { > .destroy = iommufd_veventq_destroy, > diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c > index 01df2b985f02..632d1d7b8fd8 100644 > --- a/drivers/iommu/iommufd/viommu.c > +++ b/drivers/iommu/iommufd/viommu.c > @@ -84,16 +84,38 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) > return rc; > } > > -void iommufd_vdevice_destroy(struct iommufd_object *obj) > +void iommufd_vdevice_abort(struct iommufd_object *obj) > { > struct iommufd_vdevice *vdev = > container_of(obj, struct iommufd_vdevice, obj); > struct iommufd_viommu *viommu = vdev->viommu; > + struct iommufd_device *idev = vdev->idev; > + > + lockdep_assert_held(&idev->igroup->lock); > + > + /* > + * iommufd_vdevice_abort() could be reentrant, by > + * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once. > + */ > + if (!viommu) > + return; > > /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ > xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); > refcount_dec(&viommu->obj.users); > put_device(vdev->dev); > + vdev->viommu = NULL; > + idev->vdev = NULL; I feel it makes more sense to reorder the operations like this: vdev->viommu = NULL; vdev->idev = NULL; idev->vdev = NULL; put_device(vdev->dev); put_device(vdev->dev) could potentially trigger other code paths that might attempt to reference vdev or its associated pointers. Therefore, it's safer to null all the relevant reference pointers before calling put_device(). Others look good to me, Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com> Thanks, baolu
> > -void iommufd_vdevice_destroy(struct iommufd_object *obj) > > +void iommufd_vdevice_abort(struct iommufd_object *obj) > > { > > struct iommufd_vdevice *vdev = > > container_of(obj, struct iommufd_vdevice, obj); > > struct iommufd_viommu *viommu = vdev->viommu; > > + struct iommufd_device *idev = vdev->idev; > > + > > + lockdep_assert_held(&idev->igroup->lock); > > + > > + /* > > + * iommufd_vdevice_abort() could be reentrant, by > > + * iommufd_device_unbind() or by iommufd_destroy(). Cleanup only once. > > + */ > > + if (!viommu) > > + return; > > /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ > > xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); > > refcount_dec(&viommu->obj.users); > > put_device(vdev->dev); > > + vdev->viommu = NULL; > > + idev->vdev = NULL; > > I feel it makes more sense to reorder the operations like this: > > vdev->viommu = NULL; > vdev->idev = NULL; > idev->vdev = NULL; > put_device(vdev->dev); > > put_device(vdev->dev) could potentially trigger other code paths that > might attempt to reference vdev or its associated pointers. Therefore, > it's safer to null all the relevant reference pointers before calling > put_device(). Yep. due to other changes, now I keep: xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); refcount_dec(&viommu->obj.users); + idev->vdev = NULL; put_device(vdev->dev); Anyway, vdev->dev would be completely dropped in next patch, so it doesn't matter much. Thanks, Yilun > > Others look good to me, > > Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com> > > Thanks, > baolu
© 2016 - 2025 Red Hat, Inc.