The fault->mutex was meant to serialize the fault read()/write() fops and
iommufd_auto_response_faults(), mainly for fault->response. It was also
conveniently used to fence the fault->deliver list in the poll() fop and
in iommufd_fault_iopf_handler().
However, copy_from/to_user() may sleep if page faults are enabled. They
can therefore spend a long time waiting for user pages to be swapped in,
blocking iommufd_fault_iopf_handler() and its caller, which is typically
a shared IRQ handler of an IOMMU driver, resulting in a potential global
DoS.
Instead of reusing the mutex to protect the fault->deliver list, add a
separate spinlock to do the job, so iommufd_fault_iopf_handler() would
no longer be blocked by copy_from/to_user().
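For illustration only, a minimal, hypothetical sketch of that locking
split (the demo_* names are invented; the real hunks are below): the
fault-queueing path takes only the spinlock and never sleeps, while the
read path does its sleeping copy_to_user() with no spinlock held.

/* Hypothetical sketch, not the patch itself. */
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>

struct demo_fault {
	spinlock_t lock;		/* protects only the deliver list */
	struct list_head deliver;	/* pending fault groups */
};

struct demo_group {
	struct list_head node;
	char payload[64];
};

/* Producer side, e.g. the IOMMU fault reporting path: never sleeps. */
static void demo_deliver(struct demo_fault *fault, struct demo_group *group)
{
	spin_lock(&fault->lock);
	list_add_tail(&group->node, &fault->deliver);
	spin_unlock(&fault->lock);
}

/* Consumer side, e.g. a read() fop: only list surgery is under the lock. */
static int demo_read_one(struct demo_fault *fault, char __user *buf)
{
	struct demo_group *group = NULL;

	spin_lock(&fault->lock);
	if (!list_empty(&fault->deliver)) {
		group = list_first_entry(&fault->deliver, struct demo_group, node);
		list_del(&group->node);
	}
	spin_unlock(&fault->lock);

	if (!group)
		return -EAGAIN;
	/* May sleep on a user page fault; the producer is never blocked by it.
	 * (The real patch restores the group to the list on failure; omitted here.)
	 */
	if (copy_to_user(buf, group->payload, sizeof(group->payload)))
		return -EFAULT;
	return 0;
}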
Add a free_list to iommufd_auto_response_faults(), so the spinlock only
has to fence a fast list_for_each_entry_safe() pass; the groups are then
responded to and freed outside the lock.
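The free_list change follows the usual pattern of unthreading matching
entries onto a private on-stack list under the lock and completing them
with the lock dropped. A hedged, self-contained sketch with invented
demo_* names (the real hunk is in fault.c below):

/* Hypothetical sketch of the drain pattern; not the literal patch code. */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_group {
	struct list_head node;
	void *attach_handle;
};

struct demo_fault {
	spinlock_t lock;
	struct list_head deliver;
};

static void demo_auto_respond(struct demo_fault *fault, void *handle)
{
	struct demo_group *group, *next;
	LIST_HEAD(free_list);		/* private, needs no locking */

	/* Fast pass: only list manipulation happens under the spinlock. */
	spin_lock(&fault->lock);
	list_for_each_entry_safe(group, next, &fault->deliver, node) {
		if (group->attach_handle != handle)
			continue;
		list_move(&group->node, &free_list);
	}
	spin_unlock(&fault->lock);

	/* Slow pass: respond and free outside the spinlock. */
	list_for_each_entry_safe(group, next, &free_list, node) {
		list_del(&group->node);
		kfree(group);	/* stands in for iopf_group_response() + iopf_free_group() */
	}
}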
Provide two deliver list helpers for iommufd_fault_fops_read() to use:
- Fetch the first iopf_group out of the fault->deliver list
- Restore an iopf_group back to the head of the fault->deliver list
Lastly, move the fault->mutex closer to the fault->response and update
its kdoc accordingly.
Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object")
Cc: stable@vger.kernel.org
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
Changelog:
v3
* Fix iommufd_auto_response_faults() with a free_list
* Drop unnecessary function change in iommufd_fault_destroy()
v2
https://lore.kernel.org/all/cover.1736923732.git.nicolinc@nvidia.com/
* Add "Reviewed-by" from Jason/Kevin/Baolu
* Move fault->mutex closer to fault->response
* Fix inversed arguments passing at list_add()
* Replace "for" loops with simpler "while" loops
* Update kdoc to reflect all the changes in this version
* Rename iommufd_fault_deliver_extract to iommufd_fault_deliver_fetch
v1
https://lore.kernel.org/all/cover.1736894696.git.nicolinc@nvidia.com/
drivers/iommu/iommufd/fault.c | 40 ++++++++++++++++---------
drivers/iommu/iommufd/iommufd_private.h | 29 ++++++++++++++++--
2 files changed, 53 insertions(+), 16 deletions(-)
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index 685510224d05..e93a8a5accbc 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -103,15 +103,23 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
 {
 	struct iommufd_fault *fault = hwpt->fault;
 	struct iopf_group *group, *next;
+	struct list_head free_list;
 	unsigned long index;
 
 	if (!fault)
 		return;
+	INIT_LIST_HEAD(&free_list);
 
 	mutex_lock(&fault->mutex);
+	spin_lock(&fault->lock);
 	list_for_each_entry_safe(group, next, &fault->deliver, node) {
 		if (group->attach_handle != &handle->handle)
 			continue;
+		list_move(&group->node, &free_list);
+	}
+	spin_unlock(&fault->lock);
+
+	list_for_each_entry_safe(group, next, &free_list, node) {
 		list_del(&group->node);
 		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
 		iopf_free_group(group);
@@ -265,18 +273,21 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
 	if (*ppos || count % fault_size)
 		return -ESPIPE;
 
-	mutex_lock(&fault->mutex);
-	while (!list_empty(&fault->deliver) && count > done) {
-		group = list_first_entry(&fault->deliver,
-					 struct iopf_group, node);
-
-		if (group->fault_count * fault_size > count - done)
+	while ((group = iommufd_fault_deliver_fetch(fault))) {
+		if (done >= count ||
+		    group->fault_count * fault_size > count - done) {
+			iommufd_fault_deliver_restore(fault, group);
 			break;
+		}
 
+		mutex_lock(&fault->mutex);
 		rc = xa_alloc(&fault->response, &group->cookie, group,
 			      xa_limit_32b, GFP_KERNEL);
-		if (rc)
+		if (rc) {
+			mutex_unlock(&fault->mutex);
+			iommufd_fault_deliver_restore(fault, group);
 			break;
+		}
 
 		idev = to_iommufd_handle(group->attach_handle)->idev;
 		list_for_each_entry(iopf, &group->faults, list) {
@@ -285,15 +296,15 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
 						      group->cookie);
 			if (copy_to_user(buf + done, &data, fault_size)) {
 				xa_erase(&fault->response, group->cookie);
+				mutex_unlock(&fault->mutex);
+				iommufd_fault_deliver_restore(fault, group);
 				rc = -EFAULT;
 				break;
 			}
 			done += fault_size;
 		}
-
-		list_del(&group->node);
+		mutex_unlock(&fault->mutex);
 	}
-	mutex_unlock(&fault->mutex);
 
 	return done == 0 ? rc : done;
 }
@@ -349,10 +360,10 @@ static __poll_t iommufd_fault_fops_poll(struct file *filep,
 	__poll_t pollflags = EPOLLOUT;
 
 	poll_wait(filep, &fault->wait_queue, wait);
-	mutex_lock(&fault->mutex);
+	spin_lock(&fault->lock);
 	if (!list_empty(&fault->deliver))
 		pollflags |= EPOLLIN | EPOLLRDNORM;
-	mutex_unlock(&fault->mutex);
+	spin_unlock(&fault->lock);
 
 	return pollflags;
 }
@@ -394,6 +405,7 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
 	INIT_LIST_HEAD(&fault->deliver);
 	xa_init_flags(&fault->response, XA_FLAGS_ALLOC1);
 	mutex_init(&fault->mutex);
+	spin_lock_init(&fault->lock);
 	init_waitqueue_head(&fault->wait_queue);
 
 	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
@@ -442,9 +454,9 @@ int iommufd_fault_iopf_handler(struct iopf_group *group)
 	hwpt = group->attach_handle->domain->fault_data;
 	fault = hwpt->fault;
 
-	mutex_lock(&fault->mutex);
+	spin_lock(&fault->lock);
 	list_add_tail(&group->node, &fault->deliver);
-	mutex_unlock(&fault->mutex);
+	spin_unlock(&fault->lock);
 
 	wake_up_interruptible(&fault->wait_queue);
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index b6d706cf2c66..0b1bafc7fd99 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -443,14 +443,39 @@ struct iommufd_fault {
 	struct iommufd_ctx *ictx;
 	struct file *filep;
 
-	/* The lists of outstanding faults protected by below mutex. */
-	struct mutex mutex;
+	spinlock_t lock; /* protects the deliver list */
 	struct list_head deliver;
+	struct mutex mutex; /* serializes response flows */
 	struct xarray response;
 
 	struct wait_queue_head wait_queue;
 };
 
+/* Fetch the first node out of the fault->deliver list */
+static inline struct iopf_group *
+iommufd_fault_deliver_fetch(struct iommufd_fault *fault)
+{
+	struct list_head *list = &fault->deliver;
+	struct iopf_group *group = NULL;
+
+	spin_lock(&fault->lock);
+	if (!list_empty(list)) {
+		group = list_first_entry(list, struct iopf_group, node);
+		list_del(&group->node);
+	}
+	spin_unlock(&fault->lock);
+	return group;
+}
+
+/* Restore a node back to the head of the fault->deliver list */
+static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
+						 struct iopf_group *group)
+{
+	spin_lock(&fault->lock);
+	list_add(&group->node, &fault->deliver);
+	spin_unlock(&fault->lock);
+}
+
 struct iommufd_attach_handle {
 	struct iommu_attach_handle handle;
 	struct iommufd_device *idev;
--
2.34.1
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, January 17, 2025 10:05 AM
>
> 	mutex_lock(&fault->mutex);

Nit. The scope of the above can be reduced too, by guarding only the
lines for fault->response.

Perhaps Jason can help adjust it when picking this up.

> +	spin_lock(&fault->lock);
> 	list_for_each_entry_safe(group, next, &fault->deliver, node) {
> 		if (group->attach_handle != &handle->handle)
> 			continue;
> +		list_move(&group->node, &free_list);
> +	}
> +	spin_unlock(&fault->lock);
> +
> +	list_for_each_entry_safe(group, next, &free_list, node) {
> 		list_del(&group->node);
> 		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
> 		iopf_free_group(group);
On Fri, Jan 17, 2025 at 06:20:15AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Friday, January 17, 2025 10:05 AM
> >
> > 	mutex_lock(&fault->mutex);
>
> Nit. The scope of above can be reduced too, by guarding only the
> lines for fault->response.

Hmm, I think you have found a flaw unfortunately..

iommufd_auto_response_faults() is called async to all of this if a
device is removed. It should clean out that device from all the fault
machinery.

With the new locking we don't hold the mutex across the list
manipulation in read, so there is a window where a fault can be on the
stack in iommufd_fault_fops_read() but not in the fault->response or
the deliver list.

Thus it will be missed during cleanup.

I think because of the cleanup we have to continue to hold the mutex
across all of fops_read, and this patch is just adding an additional
spinlock around the deliver list to isolate it from the
copy_to_user().

Is that right Nicolin?

Jason
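To make the window concrete, here is a rough interleaving sketch of the
v3 locking in comment form (simplified and illustrative only, not
literal code):

/*
 *   iommufd_fault_fops_read()             iommufd_auto_response_faults()
 *   -------------------------             ------------------------------
 *   iommufd_fault_deliver_fetch()
 *     group removed from fault->deliver
 *                                         mutex_lock(&fault->mutex)
 *                                         scan fault->deliver  -> group not found
 *                                         scan fault->response -> group not found yet
 *                                         mutex_unlock(&fault->mutex)
 *   mutex_lock(&fault->mutex)
 *   xa_alloc(&fault->response, ..., group)
 *   mutex_unlock(&fault->mutex)
 *     group now sits in the xarray even
 *     though cleanup has already run
 *
 * Holding fault->mutex across the whole read loop (as later proposed for
 * v4) closes this window, since cleanup cannot run between the fetch and
 * the xa_alloc().
 */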
On Fri, Jan 17, 2025 at 10:38:56AM -0400, Jason Gunthorpe wrote:
> Hmm, I think you have found a flaw unfortunately..
>
> iommufd_auto_response_faults() is called async to all of this if a
> device is removed. It should clean out that device from all the fault
> machinery.
>
> With the new locking we don't hold the mutex across the list
> manipulation in read, so there is a window where a fault can be on the
> stack in iommufd_fault_fops_read() but not in the fault->response or
> the deliver list.
>
> Thus it will be missed during cleanup.
>
> I think because of the cleanup we have to continue to hold the mutex
> across all of fops_read, and this patch is just adding an additional
> spinlock around the deliver list to isolate it from the
> copy_to_user().
>
> Is that right Nicolin?

Yes. I've missed that too..

A group can be read out of the deliver list in fops_read() prior to
auto_response_faults() taking the mutex; the following xa_alloc() then
adds that fetched group to fault->response, where it stays until
iommufd_fault_destroy() flushes everything away.

It might not be a bug in the existing flow (?), but it doesn't seem
worth touching the mutex in this patch.

Let me send a v4 changing that mutex back.

Thanks
Nicolin
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, January 18, 2025 2:49 AM
>
> On Fri, Jan 17, 2025 at 10:38:56AM -0400, Jason Gunthorpe wrote:
> [...]
> > Is that right Nicolin?
>
> Yes. I've missed that too..
> [...]
> Let me send a v4 changing that mutex back.

While looking at the related logic I wonder whether there is another
potential issue on the smmu side.

Based on discussion with Baolu, currently iommu_detach_group_handle()
plays the role of quiescing fault reporting from the iommu core. Once
it has completed there won't be any new event queued to fault->deliver,
so it is safe to call iommufd_auto_response_faults() to handle the
already-queued events in iommufd.

The intel-iommu driver calls intel_iommu_drain_pasid_prq() as the last
step in detach, which scans the prq queue and waits for all pending
requests related to the device to be delivered (by comparing the
head/tail pointers). There is a bug (just fixed by Baolu [1]) but the
logic is there.

But I didn't see similar logic in arm_smmu_attach_dev_blocked(). And
arm_smmu_evtq_thread() just removes an entry from the event queue and
calls arm_smmu_handle_evt() to report it, with seemingly no tracking of
when those events are delivered. A detach may happen after
iommu_report_device_fault() acquires the old domain/handle/handler/etc.
but before iommufd_fault_iopf_handler() actually queues an event to
fault->deliver. Then UAF.

Did I overlook anything here?

Thanks
Kevin

[1] https://lore.kernel.org/all/20250120080144.810455-1-baolu.lu@linux.intel.com/
On Mon, Jan 20, 2025 at 09:22:28AM +0000, Tian, Kevin wrote:
> But I didn't see similar logic in arm_smmu_attach_dev_blocked(). And
> arm_smmu_evtq_thread() just removes an entry from the event queue and
> calls arm_smmu_handle_evt() to report it, with seemingly no tracking
> of when those events are delivered.

I did not look closely, but I would not be surprised if this is the
case.

Jason