[PATCH rc v2 2/2] iommufd/fault: Use a separate spinlock to protect fault->deliver list

Posted by Nicolin Chen 11 months ago
The fault->mutex was used to serialize the fault read()/write() fops and
iommufd_auto_response_faults(), mainly to protect fault->response. It was
also conveniently used to fence the fault->deliver list in the poll() fop
and in iommufd_fault_iopf_handler().

However, copy_from/to_user() may sleep if pagefaults are enabled. Thus,
they could take a long time waiting for user pages to be swapped in,
blocking iommufd_fault_iopf_handler() and its caller, which is typically a
shared IRQ handler of an IOMMU driver, resulting in a potential global DoS.

Instead of reusing the mutex to protect the fault->deliver list, add a
separate spinlock to do the job, so that iommufd_fault_iopf_handler() is
no longer blocked by copy_from/to_user().

Provide two list manipulation helpers for fault->deliver:
 - Fetch the first iopf_group out of the fault->deliver list
 - Restore an iopf_group back to the head of the fault->deliver list
Then, replace the open-coded list_first_entry() and list_for_each_entry_safe()
uses with these helpers.

Lastly, move the fault->mutex closer to the fault->response and update
its kdoc accordingly.

Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object")
Cc: stable@vger.kernel.org
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h | 29 ++++++++++++++--
 drivers/iommu/iommufd/fault.c           | 46 ++++++++++++++-----------
 2 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index b6d706cf2c66..0b1bafc7fd99 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -443,14 +443,39 @@ struct iommufd_fault {
 	struct iommufd_ctx *ictx;
 	struct file *filep;
 
-	/* The lists of outstanding faults protected by below mutex. */
-	struct mutex mutex;
+	spinlock_t lock; /* protects the deliver list */
 	struct list_head deliver;
+	struct mutex mutex; /* serializes response flows */
 	struct xarray response;
 
 	struct wait_queue_head wait_queue;
 };
 
+/* Fetch the first node out of the fault->deliver list */
+static inline struct iopf_group *
+iommufd_fault_deliver_fetch(struct iommufd_fault *fault)
+{
+	struct list_head *list = &fault->deliver;
+	struct iopf_group *group = NULL;
+
+	spin_lock(&fault->lock);
+	if (!list_empty(list)) {
+		group = list_first_entry(list, struct iopf_group, node);
+		list_del(&group->node);
+	}
+	spin_unlock(&fault->lock);
+	return group;
+}
+
+/* Restore a node back to the head of the fault->deliver list */
+static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
+						 struct iopf_group *group)
+{
+	spin_lock(&fault->lock);
+	list_add(&group->node, &fault->deliver);
+	spin_unlock(&fault->lock);
+}
+
 struct iommufd_attach_handle {
 	struct iommu_attach_handle handle;
 	struct iommufd_device *idev;
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index 685510224d05..8c82338ea303 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -102,17 +102,18 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
 					 struct iommufd_attach_handle *handle)
 {
 	struct iommufd_fault *fault = hwpt->fault;
-	struct iopf_group *group, *next;
+	struct iopf_group *group;
 	unsigned long index;
 
 	if (!fault)
 		return;
 
 	mutex_lock(&fault->mutex);
-	list_for_each_entry_safe(group, next, &fault->deliver, node) {
-		if (group->attach_handle != &handle->handle)
+	while ((group = iommufd_fault_deliver_fetch(fault))) {
+		if (group->attach_handle != &handle->handle) {
+			iommufd_fault_deliver_restore(fault, group);
 			continue;
-		list_del(&group->node);
+		}
 		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
 		iopf_free_group(group);
 	}
@@ -212,7 +213,7 @@ int iommufd_fault_domain_replace_dev(struct iommufd_device *idev,
 void iommufd_fault_destroy(struct iommufd_object *obj)
 {
 	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
-	struct iopf_group *group, *next;
+	struct iopf_group *group;
 	unsigned long index;
 
 	/*
@@ -221,8 +222,7 @@ void iommufd_fault_destroy(struct iommufd_object *obj)
 	 * accessing this pointer. Therefore, acquiring the mutex here
 	 * is unnecessary.
 	 */
-	list_for_each_entry_safe(group, next, &fault->deliver, node) {
-		list_del(&group->node);
+	while ((group = iommufd_fault_deliver_fetch(fault))) {
 		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
 		iopf_free_group(group);
 	}
@@ -265,18 +265,21 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
 	if (*ppos || count % fault_size)
 		return -ESPIPE;
 
-	mutex_lock(&fault->mutex);
-	while (!list_empty(&fault->deliver) && count > done) {
-		group = list_first_entry(&fault->deliver,
-					 struct iopf_group, node);
-
-		if (group->fault_count * fault_size > count - done)
+	while ((group = iommufd_fault_deliver_fetch(fault))) {
+		if (done >= count ||
+		    group->fault_count * fault_size > count - done) {
+			iommufd_fault_deliver_restore(fault, group);
 			break;
+		}
 
+		mutex_lock(&fault->mutex);
 		rc = xa_alloc(&fault->response, &group->cookie, group,
 			      xa_limit_32b, GFP_KERNEL);
-		if (rc)
+		if (rc) {
+			mutex_unlock(&fault->mutex);
+			iommufd_fault_deliver_restore(fault, group);
 			break;
+		}
 
 		idev = to_iommufd_handle(group->attach_handle)->idev;
 		list_for_each_entry(iopf, &group->faults, list) {
@@ -285,15 +288,15 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
 						      group->cookie);
 			if (copy_to_user(buf + done, &data, fault_size)) {
 				xa_erase(&fault->response, group->cookie);
+				mutex_unlock(&fault->mutex);
+				iommufd_fault_deliver_restore(fault, group);
 				rc = -EFAULT;
 				break;
 			}
 			done += fault_size;
 		}
-
-		list_del(&group->node);
+		mutex_unlock(&fault->mutex);
 	}
-	mutex_unlock(&fault->mutex);
 
 	return done == 0 ? rc : done;
 }
@@ -349,10 +352,10 @@ static __poll_t iommufd_fault_fops_poll(struct file *filep,
 	__poll_t pollflags = EPOLLOUT;
 
 	poll_wait(filep, &fault->wait_queue, wait);
-	mutex_lock(&fault->mutex);
+	spin_lock(&fault->lock);
 	if (!list_empty(&fault->deliver))
 		pollflags |= EPOLLIN | EPOLLRDNORM;
-	mutex_unlock(&fault->mutex);
+	spin_unlock(&fault->lock);
 
 	return pollflags;
 }
@@ -394,6 +397,7 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
 	INIT_LIST_HEAD(&fault->deliver);
 	xa_init_flags(&fault->response, XA_FLAGS_ALLOC1);
 	mutex_init(&fault->mutex);
+	spin_lock_init(&fault->lock);
 	init_waitqueue_head(&fault->wait_queue);
 
 	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
@@ -442,9 +446,9 @@ int iommufd_fault_iopf_handler(struct iopf_group *group)
 	hwpt = group->attach_handle->domain->fault_data;
 	fault = hwpt->fault;
 
-	mutex_lock(&fault->mutex);
+	spin_lock(&fault->lock);
 	list_add_tail(&group->node, &fault->deliver);
-	mutex_unlock(&fault->mutex);
+	spin_unlock(&fault->lock);
 
 	wake_up_interruptible(&fault->wait_queue);
 
-- 
2.43.0
Re: [PATCH rc v2 2/2] iommufd/fault: Use a separate spinlock to protect fault->deliver list
Posted by Jason Gunthorpe 11 months ago
On Tue, Jan 14, 2025 at 10:56:00PM -0800, Nicolin Chen wrote:
> @@ -102,17 +102,18 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
>  					 struct iommufd_attach_handle *handle)
>  {
>  	struct iommufd_fault *fault = hwpt->fault;
> -	struct iopf_group *group, *next;
> +	struct iopf_group *group;
>  	unsigned long index;
>  
>  	if (!fault)
>  		return;
>  
>  	mutex_lock(&fault->mutex);
> -	list_for_each_entry_safe(group, next, &fault->deliver, node) {
> -		if (group->attach_handle != &handle->handle)
> +	while ((group = iommufd_fault_deliver_fetch(fault))) {
> +		if (group->attach_handle != &handle->handle) {
> +			iommufd_fault_deliver_restore(fault, group);
>  			continue;
> -		list_del(&group->node);
> +		}

I think this does not work: if we take the 'if attach_handle' leg, then
restore will put the same entry back at the front, the next fetch will
pick it up again, and it infinite-loops without forward progress.

To make this algorithm work I suggest doing a list_for_each_entry_safe()
under the spinlock and using list_move() to put each matching entry on a
temporary list on the stack.

Then you can drop the spinlock and run over the temporary list doing this:

> @@ -221,8 +222,7 @@ void iommufd_fault_destroy(struct iommufd_object *obj)
>  	 * accessing this pointer. Therefore, acquiring the mutex here
>  	 * is unnecessary.
>  	 */
> -	list_for_each_entry_safe(group, next, &fault->deliver, node) {
> -		list_del(&group->node);

The comment above says there is no concurrency, so no locking is
necessary. I'd leave it alone and just let it be the efficient
list_for_each_entry_safe().

Jason
Re: [PATCH rc v2 2/2] iommufd/fault: Use a separate spinlock to protect fault->deliver list
Posted by Nicolin Chen 11 months ago
On Thu, Jan 16, 2025 at 04:34:06PM -0400, Jason Gunthorpe wrote:
> On Tue, Jan 14, 2025 at 10:56:00PM -0800, Nicolin Chen wrote:
> > @@ -102,17 +102,18 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
> >  					 struct iommufd_attach_handle *handle)
> >  {
> >  	struct iommufd_fault *fault = hwpt->fault;
> > -	struct iopf_group *group, *next;
> > +	struct iopf_group *group;
> >  	unsigned long index;
> >  
> >  	if (!fault)
> >  		return;
> >  
> >  	mutex_lock(&fault->mutex);
> > -	list_for_each_entry_safe(group, next, &fault->deliver, node) {
> > -		if (group->attach_handle != &handle->handle)
> > +	while ((group = iommufd_fault_deliver_fetch(fault))) {
> > +		if (group->attach_handle != &handle->handle) {
> > +			iommufd_fault_deliver_restore(fault, group);
> >  			continue;
> > -		list_del(&group->node);
> > +		}
> 
> I think this does not work: if we take the 'if attach_handle' leg, then
> restore will put the same entry back at the front, the next fetch will
> pick it up again, and it infinite-loops without forward progress.

!! That needed more careful thinking. These 'continue' cases shouldn't be
handled the same way as the 'break' cases.

> To make this algorithm work I suggest doing a list_for_each_entry_safe()
> under the spinlock and using list_move() to put each matching entry on a
> temporary list on the stack.

Ack. I added a free_list for that.
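
Roughly, something like this (not the actual v3 code, just to illustrate
the idea):

	struct iopf_group *group, *next;
	LIST_HEAD(free_list);

	/* Move matching groups to a stack-local list under the spinlock */
	spin_lock(&fault->lock);
	list_for_each_entry_safe(group, next, &fault->deliver, node) {
		if (group->attach_handle == &handle->handle)
			list_move(&group->node, &free_list);
	}
	spin_unlock(&fault->lock);

	/* Respond and free them without holding the spinlock */
	list_for_each_entry_safe(group, next, &free_list, node) {
		list_del(&group->node);
		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
		iopf_free_group(group);
	}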

> Then you can drop the spinlock and run over the temporary list doing this:
> 
> > @@ -221,8 +222,7 @@ void iommufd_fault_destroy(struct iommufd_object *obj)
> >  	 * accessing this pointer. Therefore, acquiring the mutex here
> >  	 * is unnecessary.
> >  	 */
> > -	list_for_each_entry_safe(group, next, &fault->deliver, node) {
> > -		list_del(&group->node);
> 
> The comment above says there is no concurrency, so no locking is
> necessary. I'd leave it alone and just let it be the efficient
> list_for_each_entry_safe().

Ack. Will send a v3.

Thanks
Nicolin