Add iopf-capable hw page table attach/detach/replace helpers. The pointer
to iommufd_device is stored in the domain attachment handle, so that it
can be echo'ed back in the iopf_group.
The iopf-capable hw page tables can only be attached to devices that
support the IOMMU_DEV_FEAT_IOPF feature. On the first attachment of an
iopf-capable hw_pagetable to the device, the IOPF feature is enabled on
the device. Similarly, after the last iopf-capable hwpt is detached from
the device, the IOPF feature is disabled on the device.
The current implementation allows a replacement between iopf-capable and
non-iopf-capable hw page tables. This matches the nested translation use
case, where a parent domain is attached by default and can then be
replaced with a nested user domain with iopf support.
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
drivers/iommu/iommufd/iommufd_private.h | 12 ++
drivers/iommu/iommufd/device.c | 16 +-
drivers/iommu/iommufd/fault.c | 191 ++++++++++++++++++++++++
3 files changed, 216 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index c8a4519f1405..ba89c86e1af7 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -293,6 +293,7 @@ int iommufd_check_iova_range(struct io_pagetable *iopt,
struct iommufd_hw_pagetable {
struct iommufd_object obj;
struct iommu_domain *domain;
+ struct iommufd_fault *fault;
};
struct iommufd_hwpt_paging {
@@ -396,6 +397,9 @@ struct iommufd_device {
/* always the physical device */
struct device *dev;
bool enforce_cache_coherency;
+ /* protect iopf_enabled counter */
+ struct mutex iopf_lock;
+ unsigned int iopf_enabled;
};
static inline struct iommufd_device *
@@ -456,6 +460,14 @@ struct iommufd_attach_handle {
int iommufd_fault_alloc(struct iommufd_ucmd *ucmd);
void iommufd_fault_destroy(struct iommufd_object *obj);
+int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev);
+void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev);
+int iommufd_fault_domain_replace_dev(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_hw_pagetable *old);
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 873630c111c1..63681d79b72d 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -215,6 +215,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
refcount_inc(&idev->obj.users);
/* igroup refcount moves into iommufd_device */
idev->igroup = igroup;
+ mutex_init(&idev->iopf_lock);
/*
* If the caller fails after this success it must call
@@ -376,7 +377,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
* attachment.
*/
if (list_empty(&idev->igroup->device_list)) {
- rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
+ if (hwpt->fault)
+ rc = iommufd_fault_domain_attach_dev(hwpt, idev);
+ else
+ rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
if (rc)
goto err_unresv;
idev->igroup->hwpt = hwpt;
@@ -402,7 +406,10 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
mutex_lock(&idev->igroup->lock);
list_del(&idev->group_item);
if (list_empty(&idev->igroup->device_list)) {
- iommu_detach_group(hwpt->domain, idev->igroup->group);
+ if (hwpt->fault)
+ iommufd_fault_domain_detach_dev(hwpt, idev);
+ else
+ iommu_detach_group(hwpt->domain, idev->igroup->group);
idev->igroup->hwpt = NULL;
}
if (hwpt_is_paging(hwpt))
@@ -497,7 +504,10 @@ iommufd_device_do_replace(struct iommufd_device *idev,
goto err_unlock;
}
- rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
+ if (old_hwpt->fault || hwpt->fault)
+ rc = iommufd_fault_domain_replace_dev(idev, hwpt, old_hwpt);
+ else
+ rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
if (rc)
goto err_unresv;
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index d0dafe761075..94dde1f57cfc 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/iommufd.h>
+#include <linux/pci.h>
#include <linux/poll.h>
#include <linux/anon_inodes.h>
#include <uapi/linux/iommufd.h>
@@ -15,6 +16,196 @@
#include "../iommu-priv.h"
#include "iommufd_private.h"
+static int iommufd_fault_iopf_enable(struct iommufd_device *idev)
+{
+ struct device *dev = idev->dev;
+ int ret;
+
+ /*
+ * Once we turn on PCI/PRI support for VF, the response failure code
+ * could not be forwarded to the hardware due to PRI being a shared
+ * resource between PF and VFs. There is no coordination for this
+ * shared capability. This waits for a vPRI reset to recover.
+ */
+ if (dev_is_pci(dev) && to_pci_dev(dev)->is_virtfn)
+ return -EINVAL;
+
+ mutex_lock(&idev->iopf_lock);
+ /* Device iopf has already been enabled; just keep the count. */
+ if (++idev->iopf_enabled > 1) {
+ mutex_unlock(&idev->iopf_lock);
+ return 0;
+ }
+
+ ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
+ if (ret)
+ --idev->iopf_enabled;
+ mutex_unlock(&idev->iopf_lock);
+
+ return ret;
+}
+
+static void iommufd_fault_iopf_disable(struct iommufd_device *idev)
+{
+ mutex_lock(&idev->iopf_lock);
+ if (!WARN_ON(idev->iopf_enabled == 0)) {
+ if (--idev->iopf_enabled == 0)
+ iommu_dev_disable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF);
+ }
+ mutex_unlock(&idev->iopf_lock);
+}
+
+static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev)
+{
+ struct iommufd_attach_handle *handle;
+ int ret;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->handle.domain = hwpt->domain;
+ handle->idev = idev;
+ ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group,
+ &handle->handle);
+ if (ret)
+ kfree(handle);
+
+ return ret;
+}
+
+int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev)
+{
+ int ret;
+
+ if (!hwpt->fault)
+ return -EINVAL;
+
+ ret = iommufd_fault_iopf_enable(idev);
+ if (ret)
+ return ret;
+
+ ret = __fault_domain_attach_dev(hwpt, idev);
+ if (ret)
+ iommufd_fault_iopf_disable(idev);
+
+ return ret;
+}
+
+static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_attach_handle *handle)
+{
+ struct iommufd_fault *fault = hwpt->fault;
+ struct iopf_group *group, *next;
+ unsigned long index;
+
+ if (!fault)
+ return;
+
+ mutex_lock(&fault->mutex);
+ list_for_each_entry_safe(group, next, &fault->deliver, node) {
+ if (group->attach_handle != &handle->handle)
+ continue;
+ list_del(&group->node);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+
+ xa_for_each(&fault->response, index, group) {
+ if (group->attach_handle != &handle->handle)
+ continue;
+ xa_erase(&fault->response, index);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+ mutex_unlock(&fault->mutex);
+}
+
+static struct iommufd_attach_handle *
+iommufd_device_get_attach_handle(struct iommufd_device *idev)
+{
+ struct iommu_attach_handle *handle;
+
+ handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0);
+ if (!handle)
+ return NULL;
+
+ return to_iommufd_handle(handle);
+}
+
+void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev)
+{
+ struct iommufd_attach_handle *handle;
+
+ handle = iommufd_device_get_attach_handle(idev);
+ iommu_detach_group_handle(hwpt->domain, idev->igroup->group);
+ iommufd_auto_response_faults(hwpt, handle);
+ iommufd_fault_iopf_disable(idev);
+ kfree(handle);
+}
+
+static int __fault_domain_replace_dev(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_hw_pagetable *old)
+{
+ struct iommufd_attach_handle *handle, *curr = NULL;
+ int ret;
+
+ if (old->fault)
+ curr = iommufd_device_get_attach_handle(idev);
+
+ if (hwpt->fault) {
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->handle.domain = hwpt->domain;
+ handle->idev = idev;
+ ret = iommu_replace_group_handle(idev->igroup->group,
+ hwpt->domain, &handle->handle);
+ } else {
+ ret = iommu_replace_group_handle(idev->igroup->group,
+ hwpt->domain, NULL);
+ }
+
+ if (!ret && curr) {
+ iommufd_auto_response_faults(old, curr);
+ kfree(curr);
+ }
+
+ return ret;
+}
+
+int iommufd_fault_domain_replace_dev(struct iommufd_device *idev,
+ struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_hw_pagetable *old)
+{
+ bool iopf_off = !hwpt->fault && old->fault;
+ bool iopf_on = hwpt->fault && !old->fault;
+ int ret;
+
+ if (iopf_on) {
+ ret = iommufd_fault_iopf_enable(idev);
+ if (ret)
+ return ret;
+ }
+
+ ret = __fault_domain_replace_dev(idev, hwpt, old);
+ if (ret) {
+ if (iopf_on)
+ iommufd_fault_iopf_disable(idev);
+ return ret;
+ }
+
+ if (iopf_off)
+ iommufd_fault_iopf_disable(idev);
+
+ return 0;
+}
+
void iommufd_fault_destroy(struct iommufd_object *obj)
{
struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
--
2.34.1
From: Tian, Kevin
Sent: Friday, June 7, 2024 5:30 PM
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Monday, May 27, 2024 12:05 PM
>
> Add iopf-capable hw page table attach/detach/replace helpers. The pointer
> to iommufd_device is stored in the domain attachment handle, so that it
> can be echo'ed back in the iopf_group.
This message needs an update. Now the device pointer is not in the
attach handle.
And the VF case deserves an explanation in the commit msg.
> @@ -376,7 +377,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
> * attachment.
> */
> if (list_empty(&idev->igroup->device_list)) {
> - rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
> + if (hwpt->fault)
> + rc = iommufd_fault_domain_attach_dev(hwpt, idev);
> + else
> + rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
Would it read better to have an iommufd_attach_device() wrapper with
the above branches handled internally?
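Something like this, roughly (untested sketch; the function name is
just a suggestion):

static int iommufd_attach_device(struct iommufd_hw_pagetable *hwpt,
				 struct iommufd_device *idev)
{
	/* Fault-capable domains must go through the IOPF-aware path. */
	if (hwpt->fault)
		return iommufd_fault_domain_attach_dev(hwpt, idev);

	return iommu_attach_group(hwpt->domain, idev->igroup->group);
}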
>
> +static int iommufd_fault_iopf_enable(struct iommufd_device *idev)
> +{
> + struct device *dev = idev->dev;
> + int ret;
> +
> + /*
> + * Once we turn on PCI/PRI support for VF, the response failure code
> + * could not be forwarded to the hardware due to PRI being a shared
You could, but just doing so would be incorrect. 😊
s/could/should/
> + * resource between PF and VFs. There is no coordination for this
> + * shared capability. This waits for a vPRI reset to recover.
> + */
This could go a bit further and mention that supporting it requires
emulation in iommufd (i.e. pausing any further fault delivery until a
vPRI reset). That is future work, so disable it for VFs at this point.
> +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt,
> + struct iommufd_device *idev)
> +{
> + struct iommufd_attach_handle *handle;
> +
> + handle = iommufd_device_get_attach_handle(idev);
> + iommu_detach_group_handle(hwpt->domain, idev->igroup->group);
> + iommufd_auto_response_faults(hwpt, handle);
> + iommufd_fault_iopf_disable(idev);
> + kfree(handle);
This assumes that the detach callback of the iommu driver drains
in-flight page requests, so that when it returns there is no further
reference to the handle and no new request is queued to the deliver
queue. Otherwise there is a use-after-free race, or stale requests are
left in the fault queue which auto response doesn't cleanly handle.
IIRC the previous discussion revealed that only the intel-iommu driver
guarantees that behavior. In any case this should be documented, to
avoid this path being used by a non-conforming iommu driver.
If I misunderstood, a comment on why there is no race in this window
would also be appreciated. 😊
> +}
> +
> +static int __fault_domain_replace_dev(struct iommufd_device *idev,
> + struct iommufd_hw_pagetable *hwpt,
> + struct iommufd_hw_pagetable *old)
> +{
> + struct iommufd_attach_handle *handle, *curr = NULL;
> + int ret;
> +
> + if (old->fault)
> + curr = iommufd_device_get_attach_handle(idev);
> +
> + if (hwpt->fault) {
> + handle = kzalloc(sizeof(*handle), GFP_KERNEL);
> + if (!handle)
> + return -ENOMEM;
> +
> + handle->handle.domain = hwpt->domain;
> + handle->idev = idev;
> + ret = iommu_replace_group_handle(idev->igroup->group,
> + hwpt->domain, &handle->handle);
> + } else {
> + ret = iommu_replace_group_handle(idev->igroup->group,
> + hwpt->domain, NULL);
> + }
> +
> + if (!ret && curr) {
> + iommufd_auto_response_faults(old, curr);
> + kfree(curr);
> + }
In the last version you said auto response is required only when
switching from a fault-capable old hwpt to a fault-incapable new one.
But the above code doesn't reflect that description?
From: Baolu Lu <baolu.lu@linux.intel.com>
Sent: Sunday, June 9, 2024 3:23 PM
On 6/7/24 5:30 PM, Tian, Kevin wrote:
>> From: Lu Baolu <baolu.lu@linux.intel.com>
>> Sent: Monday, May 27, 2024 12:05 PM
>>
>> Add iopf-capable hw page table attach/detach/replace helpers. The pointer
>> to iommufd_device is stored in the domain attachment handle, so that it
>> can be echo'ed back in the iopf_group.
>
> This message needs an update. Now the device pointer is not in the
> attach handle.
The iommufd_device pointer is in the attach handle provided by iommufd
in the attach or replace path.
> And the VF case deserves an explanation in the commit msg.
>
>> @@ -376,7 +377,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
>> * attachment.
>> */
>> if (list_empty(&idev->igroup->device_list)) {
>> - rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
>> + if (hwpt->fault)
>> + rc = iommufd_fault_domain_attach_dev(hwpt, idev);
>> + else
>> + rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
>
> Would it read better to have an iommufd_attach_device() wrapper with
> the above branches handled internally?
Yes. Will do this in the next version.
>
>>
>> +static int iommufd_fault_iopf_enable(struct iommufd_device *idev)
>> +{
>> + struct device *dev = idev->dev;
>> + int ret;
>> +
>> + /*
>> + * Once we turn on PCI/PRI support for VF, the response failure code
>> + * could not be forwarded to the hardware due to PRI being a shared
>
> You could, but just doing so would be incorrect. 😊
>
> s/could/should/
Done.
>
>> + * resource between PF and VFs. There is no coordination for this
>> + * shared capability. This waits for a vPRI reset to recover.
>> + */
>
> This could go a bit further and mention that supporting it requires
> emulation in iommufd (i.e. pausing any further fault delivery until a
> vPRI reset). That is future work, so disable it for VFs at this point.
Yes.
>
>> +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt,
>> + struct iommufd_device *idev)
>> +{
>> + struct iommufd_attach_handle *handle;
>> +
>> + handle = iommufd_device_get_attach_handle(idev);
>> + iommu_detach_group_handle(hwpt->domain, idev->igroup->group);
>> + iommufd_auto_response_faults(hwpt, handle);
>> + iommufd_fault_iopf_disable(idev);
>> + kfree(handle);
>
> This assumes that the detach callback of the iommu driver drains
> in-flight page requests, so that when it returns there is no further
> reference to the handle and no new request is queued to the deliver
> queue. Otherwise there is a use-after-free race, or stale requests are
> left in the fault queue which auto response doesn't cleanly handle.
>
> IIRC the previous discussion revealed that only the intel-iommu driver
> guarantees that behavior. In any case this should be documented, to
> avoid this path being used by a non-conforming iommu driver.
>
> If I misunderstood, a comment on why there is no race in this window
> would also be appreciated. 😊
Yes. The iommu driver needs to guarantee that there will be no iopf
requests for a RID or PASID after the domain has been detached. This
implies that:
- The IOMMU hardware should not put further iopf requests into its
  hardware queue once the domain has been detached.
- Before the domain detachment completes, the iommu driver should
  flush all iopf requests targeting the detached domain from the
  hardware page request queue.
>
>> +}
>> +
>> +static int __fault_domain_replace_dev(struct iommufd_device *idev,
>> + struct iommufd_hw_pagetable *hwpt,
>> + struct iommufd_hw_pagetable *old)
>> +{
>> + struct iommufd_attach_handle *handle, *curr = NULL;
>> + int ret;
>> +
>> + if (old->fault)
>> + curr = iommufd_device_get_attach_handle(idev);
>> +
>> + if (hwpt->fault) {
>> + handle = kzalloc(sizeof(*handle), GFP_KERNEL);
>> + if (!handle)
>> + return -ENOMEM;
>> +
>> + handle->handle.domain = hwpt->domain;
>> + handle->idev = idev;
>> + ret = iommu_replace_group_handle(idev->igroup->group,
>> + hwpt->domain, &handle->handle);
>> + } else {
>> + ret = iommu_replace_group_handle(idev->igroup->group,
>> + hwpt->domain, NULL);
>> + }
>> +
>> + if (!ret && curr) {
>> + iommufd_auto_response_faults(old, curr);
>> + kfree(curr);
>> + }
>
> In the last version you said auto response is required only when
> switching from a fault-capable old hwpt to a fault-incapable new one.
> But the above code doesn't reflect that description?
What the current code does is auto-respond to all page faults targeting
the old fault-capable hwpt. I'm also okay if we decide to limit this to
flushing page faults only if the new hwpt is *not* fault-capable.
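If we go with the latter, the tail of __fault_domain_replace_dev()
would become something like this (untested sketch; it assumes any
requests still referencing the old handle are drained before the
handle is freed):

	if (!ret && curr) {
		/* Only flush when the new hwpt cannot take faults. */
		if (!hwpt->fault)
			iommufd_auto_response_faults(old, curr);
		kfree(curr);
	}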
Best regards,
baolu
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Sunday, June 9, 2024 3:23 PM
>
> On 6/7/24 5:30 PM, Tian, Kevin wrote:
> >> From: Lu Baolu <baolu.lu@linux.intel.com>
> >> Sent: Monday, May 27, 2024 12:05 PM
> >>
> >> Add iopf-capable hw page table attach/detach/replace helpers. The pointer
> >> to iommufd_device is stored in the domain attachment handle, so that it
> >> can be echo'ed back in the iopf_group.
> >
> > This message needs an update. Now the device pointer is not in the
> > attach handle.
>
> The iommufd_device pointer is in the attach handle provided by iommufd
> in the attach or replace path.
I thought it talked about iommu_attach_handle, which includes only the
domain pointer. But it's correct if iommufd_attach_handle is what is
being talked about here. It depends on what 'domain attachment handle'
refers to. 😊
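For reference, the two handle types in question, as defined in this
series (abbreviated here; to_iommufd_handle() is a container_of()
wrapper):

struct iommu_attach_handle {		/* core handle: domain only */
	struct iommu_domain *domain;
};

struct iommufd_attach_handle {		/* iommufd wrapper: adds the idev */
	struct iommu_attach_handle handle;
	struct iommufd_device *idev;
};

#define to_iommufd_handle(hdl) \
	container_of(hdl, struct iommufd_attach_handle, handle)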