[PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use

Nicolin Chen posted 25 patches 3 months, 4 weeks ago
There is a newer version of this series
[PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Nicolin Chen 3 months, 4 weeks ago
The access object has been used externally by VFIO mdev devices, allowing
them to pin/unpin physical pages (via needs_pin_pages). Meanwhile, a racy
unmap can occur in this case, so these devices usually implement an unmap
handler, invoked by iommufd_access_notify_unmap().

The new HW queue object will need the same pin/unpin feature, although it
(unlike the mdev case) wants to reject any unmap attempt during its life
cycle.

To reuse the access object for the new HW queue, allow its access->ops to
be NULL. Error out a NULL access->ops in iommufd_access_notify_unmap() to
reject an unmap operation and propagate the errno upwards.

Then, update iommufd_access_change_ioas() and iommufd_access_pin_pages(),
to allow this new use case.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  4 ++--
 drivers/iommu/iommufd/device.c          | 15 +++++++++++----
 drivers/iommu/iommufd/io_pagetable.c    | 17 ++++++++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 4f5e8cd99c96..4a375a8c9216 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -111,8 +111,8 @@ int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
 			    struct iommu_domain *domain, bool enable);
 
-void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
-				 unsigned long length);
+int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+				unsigned long length);
 int iopt_table_add_domain(struct io_pagetable *iopt,
 			  struct iommu_domain *domain);
 void iopt_table_remove_domain(struct io_pagetable *iopt,
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index e9b6ca47095c..9293722b9cff 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1048,7 +1048,7 @@ static int iommufd_access_change_ioas(struct iommufd_access *access,
 	}
 
 	if (cur_ioas) {
-		if (access->ops->unmap) {
+		if (access->ops && access->ops->unmap) {
 			mutex_unlock(&access->ioas_lock);
 			access->ops->unmap(access->data, 0, ULONG_MAX);
 			mutex_lock(&access->ioas_lock);
@@ -1204,16 +1204,21 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD");
  * run in the future. Due to this a driver must not create locking that prevents
  * unmap to complete while iommufd_access_destroy() is running.
  */
-void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
-				 unsigned long length)
+int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
+				unsigned long length)
 {
 	struct iommufd_ioas *ioas =
 		container_of(iopt, struct iommufd_ioas, iopt);
 	struct iommufd_access *access;
 	unsigned long index;
+	int ret = 0;
 
 	xa_lock(&ioas->iopt.access_list);
 	xa_for_each(&ioas->iopt.access_list, index, access) {
+		if (!access->ops || !access->ops->unmap) {
+			ret = -EBUSY;
+			goto unlock;
+		}
 		if (!iommufd_lock_obj(&access->obj))
 			continue;
 		xa_unlock(&ioas->iopt.access_list);
@@ -1223,7 +1228,9 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
 		iommufd_put_object(access->ictx, &access->obj);
 		xa_lock(&ioas->iopt.access_list);
 	}
+unlock:
 	xa_unlock(&ioas->iopt.access_list);
+	return ret;
 }
 
 /**
@@ -1321,7 +1328,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
 
 	/* Driver's ops don't support pin_pages */
 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
-	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
+	    WARN_ON(access->iova_alignment != PAGE_SIZE))
 		return -EINVAL;
 
 	if (!length)
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 13d010f19ed1..6b8477b1f94b 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -740,11 +740,21 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
 			up_write(&iopt->iova_rwsem);
 			up_read(&iopt->domains_rwsem);
 
-			iommufd_access_notify_unmap(iopt, area_first, length);
+			rc = iommufd_access_notify_unmap(iopt, area_first,
+							 length);
+			if (rc) {
+				down_read(&iopt->domains_rwsem);
+				down_write(&iopt->iova_rwsem);
+				area->prevent_access = false;
+				goto out_unlock_iova;
+			}
+
 			/* Something is not responding to unmap requests. */
 			tries++;
-			if (WARN_ON(tries > 100))
-				return -EDEADLOCK;
+			if (WARN_ON(tries > 100)) {
+				rc = -EDEADLOCK;
+				goto out_unmapped;
+			}
 			goto again;
 		}
 
@@ -766,6 +776,7 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
 out_unlock_iova:
 	up_write(&iopt->iova_rwsem);
 	up_read(&iopt->domains_rwsem);
+out_unmapped:
 	if (unmapped)
 		*unmapped = unmapped_bytes;
 	return rc;
-- 
2.43.0
RE: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Tian, Kevin 3 months, 2 weeks ago
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, June 14, 2025 3:15 PM
> 
> +int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long
> iova,
> +				unsigned long length)
>  {
>  	struct iommufd_ioas *ioas =
>  		container_of(iopt, struct iommufd_ioas, iopt);
>  	struct iommufd_access *access;
>  	unsigned long index;
> +	int ret = 0;
> 
>  	xa_lock(&ioas->iopt.access_list);
>  	xa_for_each(&ioas->iopt.access_list, index, access) {
> +		if (!access->ops || !access->ops->unmap) {
> +			ret = -EBUSY;
> +			goto unlock;
> +		}

then accesses before this one have been notified to unpin the area
while accesses afterwards are left unnotified.

in the end the unmap fails but with some side-effect incurred.

I'm not sure whether this intermediate state may lead to any undesired
effect later. Just raise it in case you or Jason already thought about it.


>  			/* Something is not responding to unmap requests.
> */
>  			tries++;
> -			if (WARN_ON(tries > 100))
> -				return -EDEADLOCK;
> +			if (WARN_ON(tries > 100)) {
> +				rc = -EDEADLOCK;
> +				goto out_unmapped;
> +			}

this looks an unrelated fix?
Re: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Nicolin Chen 3 months, 2 weeks ago
On Wed, Jun 25, 2025 at 03:38:19AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Saturday, June 14, 2025 3:15 PM
> > 
> > +int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long
> > iova,
> > +				unsigned long length)
> >  {
> >  	struct iommufd_ioas *ioas =
> >  		container_of(iopt, struct iommufd_ioas, iopt);
> >  	struct iommufd_access *access;
> >  	unsigned long index;
> > +	int ret = 0;
> > 
> >  	xa_lock(&ioas->iopt.access_list);
> >  	xa_for_each(&ioas->iopt.access_list, index, access) {
> > +		if (!access->ops || !access->ops->unmap) {
> > +			ret = -EBUSY;
> > +			goto unlock;
> > +		}
> 
> then accesses before this one have been notified to unpin the area
> while accesses afterwards are left unnotified.
> 
> in the end the unmap fails but with some side-effect incurred.
> 
> I'm not sure whether this intermediate state may lead to any undesired
> effect later. Just raise it in case you or Jason already thought about it.

That's a good point. When an access blocks the unmap, there is no
unmap happening so no point in notifying devices for ops->unmap.

And, when the function is re-entered, there could be a duplicated
ops->unmap call for those devices that are already notified once?

So, if we play safe, there can be a standalone xa_for_each to dig
for !access->ops->unmap. And it could be a bit cleaner to add an
iommufd_access_has_internal_use() to be called under those rwsems.

> >  			/* Something is not responding to unmap requests.
> > */
> >  			tries++;
> > -			if (WARN_ON(tries > 100))
> > -				return -EDEADLOCK;
> > +			if (WARN_ON(tries > 100)) {
> > +				rc = -EDEADLOCK;
> > +				goto out_unmapped;
> > +			}
> 
> this looks an unrelated fix?

Yea.. let me separate it out.

Thanks
Nicolin
Re: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Nicolin Chen 3 months, 2 weeks ago
On Wed, Jun 25, 2025 at 09:37:18AM -0700, Nicolin Chen wrote:
> On Wed, Jun 25, 2025 at 03:38:19AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Saturday, June 14, 2025 3:15 PM
> > > 
> > > +int iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long
> > > iova,
> > > +				unsigned long length)
> > >  {
> > >  	struct iommufd_ioas *ioas =
> > >  		container_of(iopt, struct iommufd_ioas, iopt);
> > >  	struct iommufd_access *access;
> > >  	unsigned long index;
> > > +	int ret = 0;
> > > 
> > >  	xa_lock(&ioas->iopt.access_list);
> > >  	xa_for_each(&ioas->iopt.access_list, index, access) {
> > > +		if (!access->ops || !access->ops->unmap) {
> > > +			ret = -EBUSY;
> > > +			goto unlock;
> > > +		}
> > 
> > then accesses before this one have been notified to unpin the area
> > while accesses afterwards are left unnotified.
> > 
> > in the end the unmap fails but with some side-effect incurred.
> > 
> > I'm not sure whether this intermediate state may lead to any undesired
> > effect later. Just raise it in case you or Jason already thought about it.
> 
> That's a good point. When an access blocks the unmap, there is no
> unmap happening so no point in notifying devices for ops->unmap.
> 
> And, when the function is re-entered, there could be a duplicated
> ops->unmap call for those devices that are already notified once?
> 
> So, if we play safe, there can be a standalone xa_for_each to dig
> for !access->ops->unmap. And it could be a bit cleaner to add an
> iommufd_access_has_internal_use() to be called under those rwsems.

Correct: it has to be under the xa_lock. So, this pre-check needs
to be done in this function.

Thanks
Nicolin
Re: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Jason Gunthorpe 3 months, 3 weeks ago
On Sat, Jun 14, 2025 at 12:14:31AM -0700, Nicolin Chen wrote:
> @@ -1321,7 +1328,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
>  
>  	/* Driver's ops don't support pin_pages */
>  	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
> -	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
> +	    WARN_ON(access->iova_alignment != PAGE_SIZE))
>  		return -EINVAL;

I don't want to lose this check, continuing blocking mdevs is still
important. Only the internal access should be able to use this
mechanism.

Otherwise:

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Re: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Nicolin Chen 3 months, 3 weeks ago
On Mon, Jun 16, 2025 at 10:33:05AM -0300, Jason Gunthorpe wrote:
> On Sat, Jun 14, 2025 at 12:14:31AM -0700, Nicolin Chen wrote:
> > @@ -1321,7 +1328,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
> >  
> >  	/* Driver's ops don't support pin_pages */
> >  	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
> > -	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
> > +	    WARN_ON(access->iova_alignment != PAGE_SIZE))
> >  		return -EINVAL;
> 
> I don't want to loose this check, continuing blocking mdevs is still
> important. Only the internal access should be able to use this
> mechanism.

OK. So, it probably should be
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
	    WARN_ON(access->iova_alignment != PAGE_SIZE ||
		    (access->ictx && !access->ops->unmap)))

Thanks
Nicolin
Re: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Pranjal Shrivastava 3 months, 3 weeks ago
On Mon, Jun 16, 2025 at 07:21:28PM -0700, Nicolin Chen wrote:
> On Mon, Jun 16, 2025 at 10:33:05AM -0300, Jason Gunthorpe wrote:
> > On Sat, Jun 14, 2025 at 12:14:31AM -0700, Nicolin Chen wrote:
> > > @@ -1321,7 +1328,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
> > >  
> > >  	/* Driver's ops don't support pin_pages */
> > >  	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
> > > -	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
> > > +	    WARN_ON(access->iova_alignment != PAGE_SIZE))
> > >  		return -EINVAL;
> > 
> > I don't want to loose this check, continuing blocking mdevs is still
> > important. Only the internal access should be able to use this
> > mechanism.
> 
> OK. So, it probably should be 
> 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
> 	    WARN_ON(access->iova_alignment != PAGE_SIZE ||
> 		    (access->ictx && !access->ops->unmap))
> 

Nit: Probably worth mentioning in a comment that access->ops shouldn't
be NULL during selftests and hence we avoid checking for !access->ops?

If you feel it's too verbose, you can let be as well.

Reviewed-by: Pranjal Shrivastava <praan@google.com>

> Thanks
> Nicolin
Re: [PATCH v6 06/25] iommufd/access: Allow access->ops to be NULL for internal use
Posted by Baolu Lu 3 months, 3 weeks ago
On 6/14/25 15:14, Nicolin Chen wrote:
> The access object has been used externally by VFIO mdev devices, allowing
> them to pin/unpin physical pages (via needs_pin_pages). Meanwhile, a racy
> unmap can occur in this case, so these devices usually implement an unmap
> handler, invoked by iommufd_access_notify_unmap().
> 
> The new HW queue object will need the same pin/unpin feature, although it
> (unlike the mdev case) wants to reject any unmap attempt, during its life
> cycle.
> 
> To reuse the access object for the new HW queue, allow its access->ops to
> be NULL. Error out a NULL access->ops in iommufd_access_notify_unmap() to
> reject an unmap operation and propagatethe errno upwards.
> 
> Then, update iommufd_access_change_ioas() and iommufd_access_pin_pages(),
> to allow this new use case.
> 
> Suggested-by: Jason Gunthorpe<jgg@nvidia.com>
> Signed-off-by: Nicolin Chen<nicolinc@nvidia.com>

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>