Add the iommufd_object_tombstone_user() helper, which allows the caller
to destroy an iommufd object created by userspace.
This is useful on some destroy paths when the kernel caller finds that an
object which userspace should already have removed is still alive. With
this helper, the caller destroys the object but leaves the object ID
reserved (a so-called tombstone). The tombstone prevents the object ID
from being repurposed without the original user's awareness.
Since this only happens on abnormal userspace behavior, for simplicity
the tombstoned object ID is permanently leaked until
iommufd_fops_release(). I.e. the original user gets an error when
calling ioctl(IOMMU_DESTROY) on that ID.
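
To make the userspace-visible effect concrete, a minimal sketch against
the iommufd uapi (the fd and the tombstoned ID are assumed to come from
earlier setup; the exact errno is not specified by this patch):

	/* Userspace view (illustrative): IOMMU_DESTROY on a tombstoned ID fails. */
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	static void try_destroy(int iommufd, __u32 tombstoned_id)
	{
		struct iommu_destroy cmd = {
			.size = sizeof(cmd),
			.id = tombstoned_id,
		};

		/* The ID stays reserved but the object is gone, so this fails. */
		if (ioctl(iommufd, IOMMU_DESTROY, &cmd))
			perror("IOMMU_DESTROY");
	}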
The first use case would be to ensure the iommufd_vdevice can't outlive
the associated iommufd_device.
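
For illustration, a sketch of how a kernel destroy path could use the
helper; the enclosing function and the lookup helper
example_find_stale_vdevice() are hypothetical stand-ins for the real
vdevice/idevice caller, not part of this patch:

	/* Hedged illustration only; assumes drivers/iommu/iommufd context. */
	static void example_idevice_teardown(struct iommufd_device *idev)
	{
		struct iommufd_vdevice *vdev = example_find_stale_vdevice(idev);

		if (!vdev)
			return;

		/*
		 * Userspace should have destroyed the vdevice before unbinding
		 * the idevice. Destroy it on its behalf, but keep the ID
		 * reserved so a late ioctl(IOMMU_DESTROY) fails rather than
		 * hitting a recycled ID.
		 */
		iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
	}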
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
drivers/iommu/iommufd/iommufd_private.h | 23 +++++++++++++++++-
drivers/iommu/iommufd/main.c | 31 ++++++++++++++++++-------
2 files changed, 45 insertions(+), 9 deletions(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 9ccc83341f32..fbc9ef78d81f 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -187,7 +187,8 @@ void iommufd_object_finalize(struct iommufd_ctx *ictx,
 			     struct iommufd_object *obj);
 
 enum {
-	REMOVE_WAIT_SHORTTERM = 1,
+	REMOVE_WAIT_SHORTTERM = BIT(0),
+	REMOVE_OBJ_TOMBSTONE = BIT(1),
 };
 int iommufd_object_remove(struct iommufd_ctx *ictx,
 			  struct iommufd_object *to_destroy, u32 id,
@@ -213,6 +214,26 @@ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx,
 	WARN_ON(ret);
 }
 
+/*
+ * Similar to iommufd_object_destroy_user(), except that the object ID is left
+ * reserved/tombstoned.
+ */
+static inline void iommufd_object_tombstone_user(struct iommufd_ctx *ictx,
+						 struct iommufd_object *obj)
+{
+	int ret;
+
+	ret = iommufd_object_remove(ictx, obj, obj->id,
+				    REMOVE_WAIT_SHORTTERM | REMOVE_OBJ_TOMBSTONE);
+
+	/*
+	 * If there is a bug and we couldn't destroy the object then we did put
+	 * back the caller's users refcount and will eventually try to free it
+	 * again during close.
+	 */
+	WARN_ON(ret);
+}
+
 /*
  * The HWPT allocated by autodomains is used in possibly many devices and
  * is automatically destroyed when its refcount reaches zero.
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 3df468f64e7d..5fd75aba068b 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -167,7 +167,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 		goto err_xa;
 	}
 
-	xas_store(&xas, NULL);
+	xas_store(&xas, (flags & REMOVE_OBJ_TOMBSTONE) ? XA_ZERO_ENTRY : NULL);
 	if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
 		ictx->vfio_ioas = NULL;
 	xa_unlock(&ictx->objects);
@@ -238,6 +238,7 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	struct iommufd_ctx *ictx = filp->private_data;
 	struct iommufd_sw_msi_map *next;
 	struct iommufd_sw_msi_map *cur;
+	XA_STATE(xas, &ictx->objects, 0);
 	struct iommufd_object *obj;
 
 	/*
@@ -251,16 +252,30 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	 */
 	while (!xa_empty(&ictx->objects)) {
 		unsigned int destroyed = 0;
-		unsigned long index;
 
-		xa_for_each(&ictx->objects, index, obj) {
-			if (!refcount_dec_if_one(&obj->users))
-				continue;
+		xas_set(&xas, 0);
+		for (;;) {
+			rcu_read_lock();
+			obj = xas_find(&xas, ULONG_MAX);
+			rcu_read_unlock();
+
+			if (!obj)
+				break;
+
+			if (!xa_is_zero(obj)) {
+				if (!refcount_dec_if_one(&obj->users))
+					continue;
+
+				iommufd_object_ops[obj->type].destroy(obj);
+				kfree(obj);
+			}
+
 			destroyed++;
-			xa_erase(&ictx->objects, index);
-			iommufd_object_ops[obj->type].destroy(obj);
-			kfree(obj);
+			xas_lock(&xas);
+			xas_store(&xas, NULL);
+			xas_unlock(&xas);
 		}
+
 		/* Bug related to users refcount */
 		if (WARN_ON(!destroyed))
 			break;
--
2.25.1
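
As background for the review discussion below, a minimal kernel-context
sketch of the XA_ZERO_ENTRY semantics the patch relies on. xa_reserve()
is the normal-API way to store XA_ZERO_ENTRY, equivalent to the
xas_store() in the patch above:

	#include <linux/printk.h>
	#include <linux/xarray.h>

	static DEFINE_XARRAY_ALLOC(demo_xa);

	static void tombstone_semantics(void)
	{
		unsigned long index;
		void *entry;
		u32 id;

		/* Allocate an ID, as iommufd does for userspace-visible objects. */
		if (xa_alloc(&demo_xa, &id, (void *)0x4, xa_limit_31b, GFP_KERNEL))
			return;

		/* Tombstone it: xa_reserve() stores XA_ZERO_ENTRY in the slot. */
		xa_erase(&demo_xa, id);
		if (xa_reserve(&demo_xa, id, GFP_KERNEL))
			return;

		/* The normal API now sees NULL for this index ... */
		entry = xa_load(&demo_xa, id);	/* entry == NULL */

		/* ... and xa_for_each() skips the slot (xas_retry() eats it) ... */
		xa_for_each(&demo_xa, index, entry)
			pr_warn("unexpected live entry at %lu\n", index);

		/*
		 * ... but the slot stays occupied: xa_empty() is false, and
		 * xa_alloc() will not hand out this ID again.
		 */
	}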
Xu Yilun <yilun.xu@linux.intel.com> writes:

> Add the iommufd_object_tombstone_user() helper, which allows the caller
> to destroy an iommufd object created by userspace.
>
> [...]
>
> +		xas_set(&xas, 0);
> +		for (;;) {
> +			rcu_read_lock();
> +			obj = xas_find(&xas, ULONG_MAX);
> +			rcu_read_unlock();

What is the need for the rcu_read_lock()?

> +			xas_lock(&xas);
> +			xas_store(&xas, NULL);
> +			xas_unlock(&xas);

Is that xas_lock needed? We can't run an xarray update in parallel with
this, because neither an iommufd ioctl nor a vfio cdev unbind can happen
concurrently. I have this as an additional comment added to the function
in my change:

/*
 * We don't need additional locks because the iommufd_fops_release() function is
 * only triggered when the iommufd descriptor is released. At that point, no
 * other iommufd-based ioctl operations can be running concurrently.
 *
 * The destruction of the vdevice via idevice unbind is also safe:
 * iommufd_fops_release() can only be called after the idevice has been unbound.
 * The idevice bind operation takes a reference to the iommufd descriptor,
 * preventing early release.
 */

-aneesh
On Wed, Jun 25, 2025 at 11:21:15AM +0530, Aneesh Kumar K.V wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
>
> [...]
>
> > +		xas_set(&xas, 0);
> > +		for (;;) {
> > +			rcu_read_lock();
> > +			obj = xas_find(&xas, ULONG_MAX);
> > +			rcu_read_unlock();
>
> What is the need for the rcu_read_lock()?

To suppress the RCU warning in xas_find().

> > +			xas_lock(&xas);
> > +			xas_store(&xas, NULL);
> > +			xas_unlock(&xas);
>
> Is that xas_lock needed? We can't run an xarray update in parallel with
> this, because neither an iommufd ioctl nor a vfio cdev unbind can happen
> concurrently.

That's true, but the lock is also needed to suppress the warning in
xas_store().

> I have this as an additional comment added to the function in my change:
>
> /*
>  * We don't need additional locks because the iommufd_fops_release() function is
>  * only triggered when the iommufd descriptor is released. At that point, no
>  * other iommufd-based ioctl operations can be running concurrently.
>  *
>  * The destruction of the vdevice via idevice unbind is also safe:
>  * iommufd_fops_release() can only be called after the idevice has been unbound.
>  * The idevice bind operation takes a reference to the iommufd descriptor,
>  * preventing early release.
>  */

That's good. But Jason has another suggestion that there is no need to
clear tombstones on fops_release(), so we don't need these changes at
all.

Thanks,
Yilun
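A minimal sketch of the pattern being discussed, assuming a kernel with
CONFIG_PROVE_RCU; the comments give one reading of why the annotations
are needed even though teardown is single-threaded:

	#include <linux/limits.h>
	#include <linux/rcupdate.h>
	#include <linux/xarray.h>

	static DEFINE_XARRAY(demo_xa);

	static void advanced_api_warnings(void)
	{
		XA_STATE(xas, &demo_xa, 0);
		void *entry;

		/*
		 * xas_find() reads slots with rcu_dereference_check(), so
		 * without rcu_read_lock() (or the xa_lock) PROVE_RCU warns
		 * even when no concurrent updater can exist.
		 */
		rcu_read_lock();
		entry = xas_find(&xas, ULONG_MAX);
		rcu_read_unlock();

		/*
		 * xas_store() validates against the xa_lock via
		 * rcu_dereference_protected(), so the lock is taken purely
		 * to satisfy that check in a single-threaded teardown path.
		 */
		if (entry) {
			xas_lock(&xas);
			xas_store(&xas, NULL);
			xas_unlock(&xas);
		}
	}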
On Mon, Jun 23, 2025 at 05:49:43PM +0800, Xu Yilun wrote:
> @@ -251,16 +252,30 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
>  	 */
>  	while (!xa_empty(&ictx->objects)) {
>  		unsigned int destroyed = 0;
> -		unsigned long index;
>
> -		xa_for_each(&ictx->objects, index, obj) {
> -			if (!refcount_dec_if_one(&obj->users))
> -				continue;

I don't think you need all these changes..

xa_for_each will skip the XA_ZERO_ENTRIES because it won't return NULL

xa_empty() will fail, but just remove it.

@@ -288,6 +288,7 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	struct iommufd_sw_msi_map *next;
 	struct iommufd_sw_msi_map *cur;
 	struct iommufd_object *obj;
+	bool empty;
 
 	/*
 	 * The objects in the xarray form a graph of "users" counts, and we have
@@ -298,11 +299,13 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 	 * until the entire list is destroyed. If this can't progress then there
 	 * is some bug related to object refcounting.
 	 */
-	while (!xa_empty(&ictx->objects)) {
+	do {
 		unsigned int destroyed = 0;
 		unsigned long index;
 
+		empty = true;
 		xa_for_each(&ictx->objects, index, obj) {
+			empty = false;
 			if (!refcount_dec_if_one(&obj->users))
 				continue;
 			destroyed++;
@@ -313,8 +316,13 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 		/* Bug related to users refcount */
 		if (WARN_ON(!destroyed))
 			break;
-	}
-	WARN_ON(!xa_empty(&ictx->groups));
+	} while (!empty);
+
+	/*
+	 * There may be some tombstones left over from
+	 * iommufd_object_tombstone_user()
+	 */
+	xa_destroy(&ictx->groups);

Jason
On Tue, Jun 24, 2025 at 10:35:20AM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 23, 2025 at 05:49:43PM +0800, Xu Yilun wrote:
>
> [...]
>
> I don't think you need all these changes..
>
> xa_for_each will skip the XA_ZERO_ENTRIES because it won't return NULL
>
> xa_empty() will fail, but just remove it.
>
> [...]
>
> +	/*
> +	 * There may be some tombstones left over from
> +	 * iommufd_object_tombstone_user()
> +	 */
> +	xa_destroy(&ictx->groups);

I'm good with that. I was obsessed with keeping the xarray completely
clean.

Thanks,
Yilun

> Jason