The fault->mutex was meant to serialize the fault read()/write() fops and
iommufd_auto_response_faults(). It was also conveniently used to protect
fault->deliver in poll() and iommufd_fault_iopf_handler().
However, copy_from/to_user() may sleep if page faults are enabled. Thus,
they could take a long time waiting for user pages to swap in, blocking
iommufd_fault_iopf_handler() and its caller, which is typically a shared
IRQ handler of an IOMMU driver, resulting in a potential global DoS.
Instead of reusing the mutex to protect the fault->deliver list, add a
separate spinlock to do the job, so that iommufd_fault_iopf_handler()
will no longer be blocked by copy_from/to_user().
Provide two list manipulation helpers for fault->deliver:
- Extract the first iopf_group out of the fault->deliver list
- Restore an iopf_group back to the head of the fault->deliver list
Replace the list_first_entry() and list_for_each_entry_safe() calls accordingly.
Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object")
Cc: stable@vger.kernel.org
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/iommufd/iommufd_private.h | 26 +++++++++++++++
drivers/iommu/iommufd/fault.c | 43 ++++++++++++++-----------
2 files changed, 50 insertions(+), 19 deletions(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index b6d706cf2c66..d3097c857abf 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -445,12 +445,38 @@ struct iommufd_fault {
/* The lists of outstanding faults protected by below mutex. */
struct mutex mutex;
+ spinlock_t lock; /* protects the deliver list */
struct list_head deliver;
struct xarray response;
struct wait_queue_head wait_queue;
};
+/* Extract the first node out of the fault->deliver list */
+static inline struct iopf_group *
+iommufd_fault_deliver_extract(struct iommufd_fault *fault)
+{
+ struct list_head *list = &fault->deliver;
+ struct iopf_group *group = NULL;
+
+ spin_lock(&fault->lock);
+ if (!list_empty(list)) {
+ group = list_first_entry(list, struct iopf_group, node);
+ list_del(&group->node);
+ }
+ spin_unlock(&fault->lock);
+ return group;
+}
+
+/* Restore a node back to the head in fault->deliver */
+static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
+ struct iopf_group *group)
+{
+ spin_lock(&fault->lock);
+ list_add(&fault->deliver, &group->node);
+ spin_unlock(&fault->lock);
+}
+
struct iommufd_attach_handle {
struct iommu_attach_handle handle;
struct iommufd_device *idev;
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index 685510224d05..fa69240daa28 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -102,17 +102,19 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
struct iommufd_attach_handle *handle)
{
struct iommufd_fault *fault = hwpt->fault;
- struct iopf_group *group, *next;
+ struct iopf_group *group;
unsigned long index;
if (!fault)
return;
mutex_lock(&fault->mutex);
- list_for_each_entry_safe(group, next, &fault->deliver, node) {
- if (group->attach_handle != &handle->handle)
+ for (group = iommufd_fault_deliver_extract(fault); group;
+ group = iommufd_fault_deliver_extract(fault)) {
+ if (group->attach_handle != &handle->handle) {
+ iommufd_fault_deliver_restore(fault, group);
continue;
- list_del(&group->node);
+ }
iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
iopf_free_group(group);
}
@@ -212,7 +214,7 @@ int iommufd_fault_domain_replace_dev(struct iommufd_device *idev,
void iommufd_fault_destroy(struct iommufd_object *obj)
{
struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
- struct iopf_group *group, *next;
+ struct iopf_group *group;
unsigned long index;
/*
@@ -221,8 +223,8 @@ void iommufd_fault_destroy(struct iommufd_object *obj)
* accessing this pointer. Therefore, acquiring the mutex here
* is unnecessary.
*/
- list_for_each_entry_safe(group, next, &fault->deliver, node) {
- list_del(&group->node);
+ for (group = iommufd_fault_deliver_extract(fault); group;
+ group = iommufd_fault_deliver_extract(fault)) {
iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
iopf_free_group(group);
}
@@ -266,17 +268,20 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
return -ESPIPE;
mutex_lock(&fault->mutex);
- while (!list_empty(&fault->deliver) && count > done) {
- group = list_first_entry(&fault->deliver,
- struct iopf_group, node);
-
- if (group->fault_count * fault_size > count - done)
+ for (group = iommufd_fault_deliver_extract(fault); group;
+ group = iommufd_fault_deliver_extract(fault)) {
+ if (done >= count ||
+ group->fault_count * fault_size > count - done) {
+ iommufd_fault_deliver_restore(fault, group);
break;
+ }
rc = xa_alloc(&fault->response, &group->cookie, group,
xa_limit_32b, GFP_KERNEL);
- if (rc)
+ if (rc) {
+ iommufd_fault_deliver_restore(fault, group);
break;
+ }
idev = to_iommufd_handle(group->attach_handle)->idev;
list_for_each_entry(iopf, &group->faults, list) {
@@ -284,14 +289,13 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
&data, idev,
group->cookie);
if (copy_to_user(buf + done, &data, fault_size)) {
+ iommufd_fault_deliver_restore(fault, group);
xa_erase(&fault->response, group->cookie);
rc = -EFAULT;
break;
}
done += fault_size;
}
-
- list_del(&group->node);
}
mutex_unlock(&fault->mutex);
@@ -349,10 +353,10 @@ static __poll_t iommufd_fault_fops_poll(struct file *filep,
__poll_t pollflags = EPOLLOUT;
poll_wait(filep, &fault->wait_queue, wait);
- mutex_lock(&fault->mutex);
+ spin_lock(&fault->lock);
if (!list_empty(&fault->deliver))
pollflags |= EPOLLIN | EPOLLRDNORM;
- mutex_unlock(&fault->mutex);
+ spin_unlock(&fault->lock);
return pollflags;
}
@@ -394,6 +398,7 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
INIT_LIST_HEAD(&fault->deliver);
xa_init_flags(&fault->response, XA_FLAGS_ALLOC1);
mutex_init(&fault->mutex);
+ spin_lock_init(&fault->lock);
init_waitqueue_head(&fault->wait_queue);
filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
@@ -442,9 +447,9 @@ int iommufd_fault_iopf_handler(struct iopf_group *group)
hwpt = group->attach_handle->domain->fault_data;
fault = hwpt->fault;
- mutex_lock(&fault->mutex);
+ spin_lock(&fault->lock);
list_add_tail(&group->node, &fault->deliver);
- mutex_unlock(&fault->mutex);
+ spin_unlock(&fault->lock);
wake_up_interruptible(&fault->wait_queue);
--
2.43.0
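For context, the locking pattern the patch moves to can be distilled as
follows (a sketch based on the diff above; error handling and the
surrounding fops plumbing are omitted):

	/*
	 * Producer, called from the IOMMU driver's IRQ path: the
	 * critical section is short and never sleeps, so a spinlock
	 * is safe here.
	 */
	spin_lock(&fault->lock);
	list_add_tail(&group->node, &fault->deliver);
	spin_unlock(&fault->lock);
	wake_up_interruptible(&fault->wait_queue);

	/*
	 * Consumer, in the read() fop: take a group off the list under
	 * the spinlock, then do the sleepable copy_to_user() with no
	 * spinlock held; on failure, put the group back at the head of
	 * the list so it is not lost.
	 */
	group = iommufd_fault_deliver_extract(fault);
	if (group && copy_to_user(buf + done, &data, fault_size)) {
		iommufd_fault_deliver_restore(fault, group);
		rc = -EFAULT;
	}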
On 1/15/25 07:28, Nicolin Chen wrote:
> The fault->mutex was meant to serialize the fault read()/write() fops and
> iommufd_auto_response_faults(). It was also conveniently used to protect
> fault->deliver in poll() and iommufd_fault_iopf_handler().
>
> However, copy_from/to_user() may sleep if page faults are enabled. Thus,
> they could take a long time waiting for user pages to swap in, blocking
> iommufd_fault_iopf_handler() and its caller, which is typically a shared
> IRQ handler of an IOMMU driver, resulting in a potential global DoS.
>
> Instead of reusing the mutex to protect the fault->deliver list, add a
> separate spinlock to do the job, so that iommufd_fault_iopf_handler()
> will no longer be blocked by copy_from/to_user().
>
> Provide two list manipulation helpers for fault->deliver:
> - Extract the first iopf_group out of the fault->deliver list
> - Restore an iopf_group back to the head of the fault->deliver list
>
> Replace the list_first_entry() and list_for_each_entry_safe() calls accordingly.
>
> Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object")
> Cc: stable@vger.kernel.org
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
> drivers/iommu/iommufd/iommufd_private.h | 26 +++++++++++++++
> drivers/iommu/iommufd/fault.c | 43 ++++++++++++++-----------
> 2 files changed, 50 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index b6d706cf2c66..d3097c857abf 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -445,12 +445,38 @@ struct iommufd_fault {
>
> /* The lists of outstanding faults protected by below mutex. */
It's better to update the above comment as well.
> struct mutex mutex;
> + spinlock_t lock; /* protects the deliver list */
> struct list_head deliver;
> struct xarray response;
>
> struct wait_queue_head wait_queue;
> };
>
> +/* Extract the first node out of the fault->deliver list */
> +static inline struct iopf_group *
> +iommufd_fault_deliver_extract(struct iommufd_fault *fault)
> +{
> + struct list_head *list = &fault->deliver;
> + struct iopf_group *group = NULL;
> +
> + spin_lock(&fault->lock);
> + if (!list_empty(list)) {
> + group = list_first_entry(list, struct iopf_group, node);
> + list_del(&group->node);
> + }
> + spin_unlock(&fault->lock);
> + return group;
> +}
> +
> +/* Restore a node back to the head in fault->deliver */
> +static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
> + struct iopf_group *group)
> +{
> + spin_lock(&fault->lock);
> + list_add(&fault->deliver, &group->node);
This is not right. It should be
list_add(&group->node, &fault->deliver);
> + spin_unlock(&fault->lock);
> +}
Others look good to me. With above addressed,
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
On Wed, Jan 15, 2025 at 01:24:53PM +0800, Baolu Lu wrote:
> On 1/15/25 07:28, Nicolin Chen wrote:
> > @@ -445,12 +445,38 @@ struct iommufd_fault {
> > /* The lists of outstanding faults protected by below mutex. */
>
> It's better to update the above comment as well.
>
> > struct mutex mutex;
> > + spinlock_t lock; /* protects the deliver list */
> > struct list_head deliver;
> > struct xarray response;
Ack. I'll do:
- /* The lists of outstanding faults protected by below mutex. */
- struct mutex mutex;
+ spinlock_t lock; /* protects the deliver list */
struct list_head deliver;
+ struct mutex mutex; /* serializes response flows */
struct xarray response;
> > struct wait_queue_head wait_queue;
> > };
> > +/* Extract the first node out of the fault->deliver list */
> > +static inline struct iopf_group *
> > +iommufd_fault_deliver_extract(struct iommufd_fault *fault)
> > +{
> > + struct list_head *list = &fault->deliver;
> > + struct iopf_group *group = NULL;
> > +
> > + spin_lock(&fault->lock);
> > + if (!list_empty(list)) {
> > + group = list_first_entry(list, struct iopf_group, node);
> > + list_del(&group->node);
> > + }
> > + spin_unlock(&fault->lock);
> > + return group;
> > +}
> > +
> > +/* Restore a node back to the head in fault->deliver */
> > +static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
> > + struct iopf_group *group)
> > +{
> > + spin_lock(&fault->lock);
> > + list_add(&fault->deliver, &group->node);
>
> This is not right. It should be
>
> list_add(&group->node, &fault->deliver);
>
> > + spin_unlock(&fault->lock);
> > +}
Oh, right!
> Others look good to me. With above addressed,
>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Thanks!
Nicolin
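For reference, with Baolu's correction folded in, the restore helper
would read as follows (arguments in list_add(new, head) order):

	/* Restore a node back to the head of fault->deliver */
	static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
							 struct iopf_group *group)
	{
		spin_lock(&fault->lock);
		list_add(&group->node, &fault->deliver);
		spin_unlock(&fault->lock);
	}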
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, January 15, 2025 7:29 AM
>
> @@ -445,12 +445,38 @@ struct iommufd_fault {
>
> /* The lists of outstanding faults protected by below mutex. */
> struct mutex mutex;
> + spinlock_t lock; /* protects the deliver list */
> struct list_head deliver;
> struct xarray response;
Move 'mutex' together with response then?
>
> +/* Extract the first node out of the fault->deliver list */
> +static inline struct iopf_group *
> +iommufd_fault_deliver_extract(struct iommufd_fault *fault)
Probably simpler would be iommufd_fault_fetch()
> @@ -102,17 +102,19 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
> 				struct iommufd_attach_handle *handle)
> {
> struct iommufd_fault *fault = hwpt->fault;
> - struct iopf_group *group, *next;
> + struct iopf_group *group;
> unsigned long index;
>
> if (!fault)
> return;
>
> mutex_lock(&fault->mutex);
> - list_for_each_entry_safe(group, next, &fault->deliver, node) {
> - if (group->attach_handle != &handle->handle)
> + for (group = iommufd_fault_deliver_extract(fault); group;
> + group = iommufd_fault_deliver_extract(fault)) {
while ((group = iommufd_fault_fetch(fault))) {
...
}
> @@ -266,17 +268,20 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> return -ESPIPE;
>
> mutex_lock(&fault->mutex);
> - while (!list_empty(&fault->deliver) && count > done) {
> - group = list_first_entry(&fault->deliver,
> - struct iopf_group, node);
> -
> - if (group->fault_count * fault_size > count - done)
> + for (group = iommufd_fault_deliver_extract(fault); group;
> + group = iommufd_fault_deliver_extract(fault)) {
> + if (done >= count ||
> + group->fault_count * fault_size > count - done) {
> + iommufd_fault_deliver_restore(fault, group);
> break;
> + }
>
> rc = xa_alloc(&fault->response, &group->cookie, group,
> xa_limit_32b, GFP_KERNEL);
> - if (rc)
> + if (rc) {
> + iommufd_fault_deliver_restore(fault, group);
> break;
> + }
The scope of the mutex can be reduced to just protect the smaller chunk
touching fault->response.
Otherwise looks good:
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
On Wed, Jan 15, 2025 at 05:24:44AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, January 15, 2025 7:29 AM
> >
> > @@ -445,12 +445,38 @@ struct iommufd_fault {
> >
> > /* The lists of outstanding faults protected by below mutex. */
> > struct mutex mutex;
> > + spinlock_t lock; /* protects the deliver list */
> > struct list_head deliver;
> > struct xarray response;
>
> Move 'mutex' together with response then?
Ack.
> >
> > +/* Extract the first node out of the fault->deliver list */
> > +static inline struct iopf_group *
> > +iommufd_fault_deliver_extract(struct iommufd_fault *fault)
>
> Probably simpler would be iommufd_fault_fetch()
We have two lists, deliver and response, so I think that "deliver" is
necessary in the name. Yet, I can do "fetch" vs. "extract".
> > @@ -102,17 +102,19 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
> > 				struct iommufd_attach_handle *handle)
> > {
> > struct iommufd_fault *fault = hwpt->fault;
> > - struct iopf_group *group, *next;
> > + struct iopf_group *group;
> > unsigned long index;
> >
> > if (!fault)
> > return;
> >
> > mutex_lock(&fault->mutex);
> > - list_for_each_entry_safe(group, next, &fault->deliver, node) {
> > - if (group->attach_handle != &handle->handle)
> > + for (group = iommufd_fault_deliver_extract(fault); group;
> > + group = iommufd_fault_deliver_extract(fault)) {
>
> while ((group = iommufd_fault_fetch(fault))) {
> ...
> }
Ah, right... how did I not see this, lol.
>
> > @@ -266,17 +268,20 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> > return -ESPIPE;
> >
> > mutex_lock(&fault->mutex);
> > - while (!list_empty(&fault->deliver) && count > done) {
> > - group = list_first_entry(&fault->deliver,
> > - struct iopf_group, node);
> > -
> > - if (group->fault_count * fault_size > count - done)
> > + for (group = iommufd_fault_deliver_extract(fault); group;
> > + group = iommufd_fault_deliver_extract(fault)) {
> > + if (done >= count ||
> > + group->fault_count * fault_size > count - done) {
> > + iommufd_fault_deliver_restore(fault, group);
> > break;
> > + }
> >
> > rc = xa_alloc(&fault->response, &group->cookie, group,
> > xa_limit_32b, GFP_KERNEL);
> > - if (rc)
> > + if (rc) {
> > + iommufd_fault_deliver_restore(fault, group);
> > break;
> > + }
>
> The scope of the mutex can be reduced to just protect the smaller chunk
> touching fault->response.
Ack.
> Otherwise looks good:
>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Thanks!
Nicolin
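Putting Kevin's two suggestions together, the read() loop would end up
looking roughly like this (a sketch; the helper name follows the
"fetch" rename discussed above, and the exact mutex boundaries in the
final version are an assumption):

	while ((group = iommufd_fault_deliver_fetch(fault))) {
		if (done >= count ||
		    group->fault_count * fault_size > count - done) {
			iommufd_fault_deliver_restore(fault, group);
			break;
		}

		/* fault->mutex now only serializes fault->response */
		mutex_lock(&fault->mutex);
		rc = xa_alloc(&fault->response, &group->cookie, group,
			      xa_limit_32b, GFP_KERNEL);
		mutex_unlock(&fault->mutex);
		if (rc) {
			iommufd_fault_deliver_restore(fault, group);
			break;
		}

		/* copy the faults to userspace as in the patch above */
	}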