[PATCH v7 10/28] iommufd/access: Bypass access->ops->unmap for internal use

Nicolin Chen posted 28 patches 3 months, 1 week ago
There is a newer version of this series
[PATCH v7 10/28] iommufd/access: Bypass access->ops->unmap for internal use
Posted by Nicolin Chen 3 months, 1 week ago
The access object has been used externally by VFIO mdev devices, allowing
them to pin/unpin physical pages (via needs_pin_pages). Meanwhile, a racy
unmap can occur in this case, so these devices usually implement an unmap
handler, invoked by iommufd_access_notify_unmap().

The new HW queue object will need the same pin/unpin feature, although it
(unlike the mdev case) wants to reject any unmap attempt during its life
cycle, so it does not implement an unmap handler. Thus, bypass any
access->ops->unmap call when the access is marked as internal. Also error
out the internal case in iommufd_access_notify_unmap() to reject an unmap
operation and propagate the errno upwards.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  4 ++--
 drivers/iommu/iommufd/device.c          | 21 +++++++++++++++++----
 drivers/iommu/iommufd/io_pagetable.c    | 10 +++++++++-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 9d1f55deb9ca..b849099e804b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -111,8 +111,8 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
 			    struct iommu_domain *domain, bool enable);
 
-void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
-				 unsigned long length);
+int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+				unsigned long length);
 int iopt_table_add_domain(struct io_pagetable *iopt,
 			  struct iommu_domain *domain);
 void iopt_table_remove_domain(struct io_pagetable *iopt,
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 07a4ff753c12..8f078fda795a 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1048,7 +1048,7 @@ static int iommufd_access_change_ioas(struct iommufd_access *access,
 	}
 
 	if (cur_ioas) {
-		if (access->ops->unmap) {
+		if (!iommufd_access_is_internal(access) && access->ops->unmap) {
 			mutex_unlock(&access->ioas_lock);
 			access->ops->unmap(access->data, 0, ULONG_MAX);
 			mutex_lock(&access->ioas_lock);
@@ -1245,15 +1245,24 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD");
  * run in the future. Due to this a driver must not create locking that prevents
  * unmap to complete while iommufd_access_destroy() is running.
  */
-void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
-				 unsigned long length)
+int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+				unsigned long length)
 {
 	struct iommufd_ioas *ioas =
 		container_of(iopt, struct iommufd_ioas, iopt);
 	struct iommufd_access *access;
 	unsigned long index;
+	int ret = 0;
 
 	xa_lock(&ioas->iopt.access_list);
+	/* Bypass any unmap if there is an internal access */
+	xa_for_each(&ioas->iopt.access_list, index, access) {
+		if (iommufd_access_is_internal(access)) {
+			ret = -EBUSY;
+			goto unlock;
+		}
+	}
+
 	xa_for_each(&ioas->iopt.access_list, index, access) {
 		if (!iommufd_lock_obj(&access->obj))
 			continue;
@@ -1264,7 +1273,9 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
 		iommufd_put_object(access->ictx, &access->obj);
 		xa_lock(&ioas->iopt.access_list);
 	}
+unlock:
 	xa_unlock(&ioas->iopt.access_list);
+	return ret;
 }
 
 /**
@@ -1362,7 +1373,9 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
 
 	/* Driver's ops don't support pin_pages */
 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
-	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
+	    WARN_ON(access->iova_alignment != PAGE_SIZE ||
+		    (!iommufd_access_is_internal(access) &&
+		     !access->ops->unmap)))
 		return -EINVAL;
 
 	if (!length)
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 22fc3a12109f..6b8477b1f94b 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -740,7 +740,15 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
 			up_write(&iopt->iova_rwsem);
 			up_read(&iopt->domains_rwsem);
 
-			iommufd_access_notify_unmap(iopt, area_first, length);
+			rc = iommufd_access_notify_unmap(iopt, area_first,
+							 length);
+			if (rc) {
+				down_read(&iopt->domains_rwsem);
+				down_write(&iopt->iova_rwsem);
+				area->prevent_access = false;
+				goto out_unlock_iova;
+			}
+
 			/* Something is not responding to unmap requests. */
 			tries++;
 			if (WARN_ON(tries > 100)) {
-- 
2.43.0
RE: [PATCH v7 10/28] iommufd/access: Bypass access->ops->unmap for internal use
Posted by Tian, Kevin 3 months, 1 week ago
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, June 27, 2025 3:35 AM
> 
> +int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long
> iova,
> +				unsigned long length)
>  {
>  	struct iommufd_ioas *ioas =
>  		container_of(iopt, struct iommufd_ioas, iopt);
>  	struct iommufd_access *access;
>  	unsigned long index;
> +	int ret = 0;
> 
>  	xa_lock(&ioas->iopt.access_list);
> +	/* Bypass any unmap if there is an internal access */
> +	xa_for_each(&ioas->iopt.access_list, index, access) {
> +		if (iommufd_access_is_internal(access)) {
> +			ret = -EBUSY;
> +			goto unlock;
> +		}
> +	}
> +

hmm all those checks are per iopt. Could do one-off check in
iopt_unmap_iova_range() and store the result in a local flag.

Then use that flag to decide whether to return -EBUSY if
area->num_accesses is non-zero in the loop.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Re: [PATCH v7 10/28] iommufd/access: Bypass access->ops->unmap for internal use
Posted by Nicolin Chen 3 months, 1 week ago
On Wed, Jul 02, 2025 at 09:45:26AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Friday, June 27, 2025 3:35 AM
> > 
> > +int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long
> > iova,
> > +				unsigned long length)
> >  {
> >  	struct iommufd_ioas *ioas =
> >  		container_of(iopt, struct iommufd_ioas, iopt);
> >  	struct iommufd_access *access;
> >  	unsigned long index;
> > +	int ret = 0;
> > 
> >  	xa_lock(&ioas->iopt.access_list);
> > +	/* Bypass any unmap if there is an internal access */
> > +	xa_for_each(&ioas->iopt.access_list, index, access) {
> > +		if (iommufd_access_is_internal(access)) {
> > +			ret = -EBUSY;
> > +			goto unlock;
> > +		}
> > +	}
> > +
> 
> hmm all those checks are per iopt. Could do one-off check in
> iopt_unmap_iova_range() and store the result in a local flag.
> 
> Then use that flag to decide whether to return -EBUSY if
> area->num_accesses is true in the loop.

I don't quite follow this...

Do you suggest to move this xa_for_each to iopt_unmap_iova_range?

What's that local flag used for?

Thanks
Nicolin
RE: [PATCH v7 10/28] iommufd/access: Bypass access->ops->unmap for internal use
Posted by Tian, Kevin 3 months, 1 week ago
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Thursday, July 3, 2025 4:12 AM
> 
> On Wed, Jul 02, 2025 at 09:45:26AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Friday, June 27, 2025 3:35 AM
> > >
> > > +int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned
> long
> > > iova,
> > > +				unsigned long length)
> > >  {
> > >  	struct iommufd_ioas *ioas =
> > >  		container_of(iopt, struct iommufd_ioas, iopt);
> > >  	struct iommufd_access *access;
> > >  	unsigned long index;
> > > +	int ret = 0;
> > >
> > >  	xa_lock(&ioas->iopt.access_list);
> > > +	/* Bypass any unmap if there is an internal access */
> > > +	xa_for_each(&ioas->iopt.access_list, index, access) {
> > > +		if (iommufd_access_is_internal(access)) {
> > > +			ret = -EBUSY;
> > > +			goto unlock;
> > > +		}
> > > +	}
> > > +
> >
> > hmm all those checks are per iopt. Could do one-off check in
> > iopt_unmap_iova_range() and store the result in a local flag.
> >
> > Then use that flag to decide whether to return -EBUSY if
> > area->num_accesses is true in the loop.
> 
> I don't quite follow this...
> 
> Do you suggest to move this xa_for_each to iopt_unmap_iova_range?

yes

> 
> What's that local flag used for?
> 

I meant something like below:

iopt_unmap_iova_range()
{
	bool internal_access = false;

	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	/* Bypass any unmap if there is an internal access */
	xa_for_each(&iopt->access_list, index, access) {
		if (iommufd_access_is_internal(access)) {
			internal_access = true;
			break;
		}
	}

	while ((area = iopt_area_iter_first(iopt, start, last))) {
		if (area->num_access) {
			if (internal_access) {
				rc = -EBUSY;
				goto out_unlock_iova;
			}
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);
			iommufd_access_notify_unmap(iopt, area_first, length);	
		}
	}
}

it checks the access_list in the common path, but the cost should be
negligible when there is no access attached to this iopt. The upside
is that now unmap is denied explicitly in the area loop instead of 
still trying to unmap and then handling errors.

but the current way is also fine. After another thought I'm neutral
to it. 😊
Re: [PATCH v7 10/28] iommufd/access: Bypass access->ops->unmap for internal use
Posted by Nicolin Chen 3 months ago
On Thu, Jul 03, 2025 at 04:57:34AM +0000, Tian, Kevin wrote:
> I meant something like below:
> 
> iopt_unmap_iova_range()
> {
> 	bool internal_access = false;
> 
> 	down_read(&iopt->domains_rwsem);
> 	down_write(&iopt->iova_rwsem);
> 	/* Bypass any unmap if there is an internal access */
> 	xa_for_each(&iopt->access_list, index, access) {
> 		if (iommufd_access_is_internal(access)) {
> 			internal_access = true;
> 			break;
> 		}
> 	}
> 
> 	while ((area = iopt_area_iter_first(iopt, start, last))) {
> 		if (area->num_access) {
> 			if (internal_access) {
> 				rc = -EBUSY;
> 				goto out_unlock_iova;
> 			}
> 			up_write(&iopt->iova_rwsem);
> 			up_read(&iopt->domains_rwsem);
> 			iommufd_access_notify_unmap(iopt, area_first, length);	
> 		}
> 	}
> }
> 
> it checks the access_list in the common path, but the cost should be
> negligible when there is no access attached to this iopt. The upside
> is that now unmap is denied explicitly in the area loop instead of 
> still trying to unmap and then handling errors.

Hmm, I realized that either way might be incorrect, as it iterates
the entire iopt for any internal access regardless its iova ranges.

What we really want is to reject an unmap against the same range as
once pinged by an internal access, i.e. other range of unmap should
be still allowed.

So, doing it at this level isn't enough. I think we should still go
down to struct iopt_area as my v5 did:
https://lore.kernel.org/all/3ddc8c678406772a8358a265912bb1c064f4c796.1747537752.git.nicolinc@nvidia.com/
We'd only need to rename to num_locked as you suggested, i.e.

@@ -719,6 +719,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
 			goto out_unlock_iova;
 		}
 
+		/* The area is locked by an object that has not been destroyed */
+		if (area->num_locked) {
+			rc = -EBUSY;
+			goto out_unlock_iova;
+		}
+
 		if (area_first < start || area_last > last) {
 			rc = -ENOENT;
 			goto out_unlock_iova;

Thanks
Nicolin