[PATCH v3 4/8] iommufd: Add iommufd fault object

Lu Baolu posted 8 patches 1 year, 11 months ago
There is a newer version of this series
[PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Lu Baolu 1 year, 11 months ago
An iommufd fault object provides an interface for delivering I/O page
faults to user space. These objects are created and destroyed by user
space, and they can be associated with or dissociated from hardware page
table objects during page table allocation or destruction.

User space interacts with the fault object through a file interface. This
interface offers a straightforward and efficient way for user space to
handle page faults. It allows user space to read fault messages
sequentially and respond to them by writing to the same file. The file
interface supports reading messages in poll mode, so it's recommended that
user space applications use io_uring to enhance read and write efficiency.

A fault object can be associated with any iopf-capable iommufd_hw_pgtable
during the pgtable's allocation. All I/O page faults triggered by devices
when accessing the I/O addresses of an iommufd_hw_pgtable are routed
through the fault object to user space. Similarly, user space's responses
to these page faults are routed back to the iommu device driver through
the same fault object.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h                   |   2 +
 drivers/iommu/iommufd/iommufd_private.h |  23 +++
 include/uapi/linux/iommufd.h            |  18 ++
 drivers/iommu/iommufd/device.c          |   1 +
 drivers/iommu/iommufd/fault.c           | 255 ++++++++++++++++++++++++
 drivers/iommu/iommufd/main.c            |   6 +
 drivers/iommu/iommufd/Makefile          |   1 +
 7 files changed, 306 insertions(+)
 create mode 100644 drivers/iommu/iommufd/fault.c

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 511dc7b4bdb2..4372648ac22e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -145,6 +145,8 @@ struct iopf_group {
 	/* The device's fault data parameter. */
 	struct iommu_fault_param *fault_param;
 	struct iopf_attach_cookie *cookie;
+	/* Used by handler provider to hook the group on its own lists. */
+	struct list_head node;
 };
 
 /**
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..52d83e888bd0 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -128,6 +128,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_HWPT_NESTED,
 	IOMMUFD_OBJ_IOAS,
 	IOMMUFD_OBJ_ACCESS,
+	IOMMUFD_OBJ_FAULT,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -395,6 +396,8 @@ struct iommufd_device {
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;
+	/* outstanding faults awaiting response indexed by fault group id */
+	struct xarray faults;
 };
 
 static inline struct iommufd_device *
@@ -426,6 +429,26 @@ void iopt_remove_access(struct io_pagetable *iopt,
 			u32 iopt_access_list_id);
 void iommufd_access_destroy_object(struct iommufd_object *obj);
 
+/*
+ * An iommufd_fault object represents an interface to deliver I/O page faults
+ * to the user space. These objects are created/destroyed by the user space and
+ * associated with hardware page table objects during page-table allocation.
+ */
+struct iommufd_fault {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+
+	/* The lists of outstanding faults protected by below mutex. */
+	struct mutex mutex;
+	struct list_head deliver;
+	struct list_head response;
+
+	struct wait_queue_head wait_queue;
+};
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_fault_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index d59e839ae49e..c32d62b02306 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -50,6 +50,7 @@ enum {
 	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
 	IOMMUFD_CMD_HWPT_INVALIDATE,
+	IOMMUFD_CMD_FAULT_ALLOC,
 };
 
 /**
@@ -759,4 +760,21 @@ struct iommu_hwpt_page_response {
 	__u32 code;
 	__u64 addr;
 };
+
+/**
+ * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_ALLOC)
+ * @size: sizeof(struct iommu_fault_alloc)
+ * @flags: Must be 0
+ * @out_fault_id: The ID of the new FAULT
+ * @out_fault_fd: The fd of the new FAULT
+ *
+ * Explicitly allocate a fault handling object.
+ */
+struct iommu_fault_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_fault_id;
+	__u32 out_fault_fd;
+};
+#define IOMMU_FAULT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 873630c111c1..d70913ee8fdf 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -215,6 +215,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 	refcount_inc(&idev->obj.users);
 	/* igroup refcount moves into iommufd_device */
 	idev->igroup = igroup;
+	xa_init(&idev->faults);
 
 	/*
 	 * If the caller fails after this success it must call
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
new file mode 100644
index 000000000000..9844a85feeb2
--- /dev/null
+++ b/drivers/iommu/iommufd/fault.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/iommufd.h>
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/iommufd.h>
+
+#include "iommufd_private.h"
+
+static int device_add_fault(struct iopf_group *group)
+{
+	struct iommufd_device *idev = group->cookie->private;
+	void *curr;
+
+	curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid,
+			  NULL, group, GFP_KERNEL);
+
+	return curr ? xa_err(curr) : 0;
+}
+
+static void device_remove_fault(struct iopf_group *group)
+{
+	struct iommufd_device *idev = group->cookie->private;
+
+	xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
+		 NULL, GFP_KERNEL);
+}
+
+static struct iopf_group *device_get_fault(struct iommufd_device *idev,
+					   unsigned long grpid)
+{
+	return xa_load(&idev->faults, grpid);
+}
+
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
+	struct iopf_group *group, *next;
+
+	mutex_lock(&fault->mutex);
+	list_for_each_entry_safe(group, next, &fault->deliver, node) {
+		list_del(&group->node);
+		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+		iopf_free_group(group);
+	}
+	list_for_each_entry_safe(group, next, &fault->response, node) {
+		list_del(&group->node);
+		device_remove_fault(group);
+		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+		iopf_free_group(group);
+	}
+	mutex_unlock(&fault->mutex);
+
+	mutex_destroy(&fault->mutex);
+}
+
+static void iommufd_compose_fault_message(struct iommu_fault *fault,
+					  struct iommu_hwpt_pgfault *hwpt_fault,
+					  struct iommufd_device *idev)
+{
+	hwpt_fault->size = sizeof(*hwpt_fault);
+	hwpt_fault->flags = fault->prm.flags;
+	hwpt_fault->dev_id = idev->obj.id;
+	hwpt_fault->pasid = fault->prm.pasid;
+	hwpt_fault->grpid = fault->prm.grpid;
+	hwpt_fault->perm = fault->prm.perm;
+	hwpt_fault->addr = fault->prm.addr;
+}
+
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_pgfault data;
+	struct iommufd_device *idev;
+	struct iopf_group *group;
+	struct iopf_fault *iopf;
+	size_t done = 0;
+	int rc;
+
+	if (*ppos || count % fault_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (!list_empty(&fault->deliver) && count > done) {
+		group = list_first_entry(&fault->deliver,
+					 struct iopf_group, node);
+
+		if (list_count_nodes(&group->faults) * fault_size > count - done)
+			break;
+
+		idev = (struct iommufd_device *)group->cookie->private;
+		list_for_each_entry(iopf, &group->faults, list) {
+			iommufd_compose_fault_message(&iopf->fault, &data, idev);
+			rc = copy_to_user(buf + done, &data, fault_size);
+			if (rc)
+				goto err_unlock;
+			done += fault_size;
+		}
+
+		rc = device_add_fault(group);
+		if (rc)
+			goto err_unlock;
+
+		list_move_tail(&group->node, &fault->response);
+	}
+	mutex_unlock(&fault->mutex);
+
+	return done;
+err_unlock:
+	mutex_unlock(&fault->mutex);
+	return rc;
+}
+
+static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	size_t response_size = sizeof(struct iommu_hwpt_page_response);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_page_response response;
+	struct iommufd_device *idev;
+	struct iopf_group *group;
+	size_t done = 0;
+	int rc;
+
+	if (*ppos || count % response_size)
+		return -ESPIPE;
+
+	while (!list_empty(&fault->response) && count > done) {
+		rc = copy_from_user(&response, buf + done, response_size);
+		if (rc)
+			break;
+
+		idev = container_of(iommufd_get_object(fault->ictx,
+						       response.dev_id,
+						       IOMMUFD_OBJ_DEVICE),
+				    struct iommufd_device, obj);
+		if (IS_ERR(idev))
+			break;
+
+		group = device_get_fault(idev, response.grpid);
+		if (!group ||
+		    response.addr != group->last_fault.fault.prm.addr ||
+		    ((group->last_fault.fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
+		      response.pasid != group->last_fault.fault.prm.pasid)) {
+			iommufd_put_object(fault->ictx, &idev->obj);
+			break;
+		}
+
+		iopf_group_response(group, response.code);
+
+		mutex_lock(&fault->mutex);
+		list_del(&group->node);
+		mutex_unlock(&fault->mutex);
+
+		device_remove_fault(group);
+		iopf_free_group(group);
+		done += response_size;
+
+		iommufd_put_object(fault->ictx, &idev->obj);
+	}
+
+	return done;
+}
+
+static __poll_t iommufd_fault_fops_poll(struct file *filep,
+					struct poll_table_struct *wait)
+{
+	struct iommufd_fault *fault = filep->private_data;
+	__poll_t pollflags = 0;
+
+	poll_wait(filep, &fault->wait_queue, wait);
+	mutex_lock(&fault->mutex);
+	if (!list_empty(&fault->deliver))
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	mutex_unlock(&fault->mutex);
+
+	return pollflags;
+}
+
+static const struct file_operations iommufd_fault_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.read		= iommufd_fault_fops_read,
+	.write		= iommufd_fault_fops_write,
+	.poll		= iommufd_fault_fops_poll,
+	.llseek		= no_llseek,
+};
+
+static int get_fault_fd(struct iommufd_fault *fault)
+{
+	struct file *filep;
+	int fdno;
+
+	fdno = get_unused_fd_flags(O_CLOEXEC);
+	if (fdno < 0)
+		return fdno;
+
+	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
+				   fault, O_RDWR);
+	if (IS_ERR(filep)) {
+		put_unused_fd(fdno);
+		return PTR_ERR(filep);
+	}
+
+	fd_install(fdno, filep);
+
+	return fdno;
+}
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_fault_alloc *cmd = ucmd->cmd;
+	struct iommufd_fault *fault;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
+	if (IS_ERR(fault))
+		return PTR_ERR(fault);
+
+	fault->ictx = ucmd->ictx;
+	INIT_LIST_HEAD(&fault->deliver);
+	INIT_LIST_HEAD(&fault->response);
+	mutex_init(&fault->mutex);
+	init_waitqueue_head(&fault->wait_queue);
+
+	rc = get_fault_fd(fault);
+	if (rc < 0)
+		goto out_abort;
+
+	cmd->out_fault_id = fault->obj.id;
+	cmd->out_fault_fd = rc;
+
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &fault->obj);
+
+	return 0;
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
+
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b32932c61e..792db077d33e 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -332,6 +332,7 @@ union ucmd_buffer {
 	struct iommu_ioas_unmap unmap;
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
+	struct iommu_fault_alloc fault;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -381,6 +382,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 val64),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
+	IOCTL_OP(IOMMU_FAULT_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc,
+		 out_fault_fd),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -513,6 +516,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 		.destroy = iommufd_hwpt_nested_destroy,
 		.abort = iommufd_hwpt_nested_abort,
 	},
+	[IOMMUFD_OBJ_FAULT] = {
+		.destroy = iommufd_fault_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 34b446146961..b94a74366eed 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -6,6 +6,7 @@ iommufd-y := \
 	ioas.o \
 	main.o \
 	pages.o \
+	fault.o \
 	vfio_compat.o
 
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
-- 
2.34.1
RE: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Shameerali Kolothum Thodi 1 year, 9 months ago

> -----Original Message-----
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Monday, January 22, 2024 7:39 AM
> To: Jason Gunthorpe <jgg@ziepe.ca>; Kevin Tian <kevin.tian@intel.com>; Joerg
> Roedel <joro@8bytes.org>; Will Deacon <will@kernel.org>; Robin Murphy
> <robin.murphy@arm.com>; Jean-Philippe Brucker <jean-philippe@linaro.org>;
> Nicolin Chen <nicolinc@nvidia.com>; Yi Liu <yi.l.liu@intel.com>; Jacob Pan
> <jacob.jun.pan@linux.intel.com>; Joel Granados <j.granados@samsung.com>
> Cc: iommu@lists.linux.dev; virtualization@lists.linux-foundation.org; linux-
> kernel@vger.kernel.org; Lu Baolu <baolu.lu@linux.intel.com>
> Subject: [PATCH v3 4/8] iommufd: Add iommufd fault object
> 
> An iommufd fault object provides an interface for delivering I/O page
> faults to user space. These objects are created and destroyed by user
> space, and they can be associated with or dissociated from hardware page
> table objects during page table allocation or destruction.
> 
> User space interacts with the fault object through a file interface. This
> interface offers a straightforward and efficient way for user space to
> handle page faults. It allows user space to read fault messages
> sequentially and respond to them by writing to the same file. The file
> interface supports reading messages in poll mode, so it's recommended that
> user space applications use io_uring to enhance read and write efficiency.
> 
> A fault object can be associated with any iopf-capable iommufd_hw_pgtable
> during the pgtable's allocation. All I/O page faults triggered by devices
> when accessing the I/O addresses of an iommufd_hw_pgtable are routed
> through the fault object to user space. Similarly, user space's responses
> to these page faults are routed back to the iommu device driver through
> the same fault object.
> 
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>

[...]

> +static __poll_t iommufd_fault_fops_poll(struct file *filep,
> +					struct poll_table_struct *wait)
> +{
> +	struct iommufd_fault *fault = filep->private_data;
> +	__poll_t pollflags = 0;
> +
> +	poll_wait(filep, &fault->wait_queue, wait);
> +	mutex_lock(&fault->mutex);
> +	if (!list_empty(&fault->deliver))
> +		pollflags = EPOLLIN | EPOLLRDNORM;
> +	mutex_unlock(&fault->mutex);
> +
> +	return pollflags;
> +}
> +
> +static const struct file_operations iommufd_fault_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= nonseekable_open,
> +	.read		= iommufd_fault_fops_read,
> +	.write		= iommufd_fault_fops_write,
> +	.poll		= iommufd_fault_fops_poll,
> +	.llseek		= no_llseek,
> +};

Hi

I am trying to enable Qemu vSVA support on ARM with this series.
I am using io_uring APIs with the fault fd to handle the page fault
in the Qemu.

Please find the implementation here[1]. This is still a work in progress 
and is based on Nicolin's latest nested Qemu branch.

And I am running into a problem when we have the poll interface added
for the fault fd in kernel.

What I have noticed is that,
-read interface works fine and I can receive struct iommu_hwpt_pgfault data.
-But once Guest handles the page faults and returns the page response,
 the write to fault fd never reaches the kernel. The sequence is like below,
 
  sqe = io_uring_get_sqe(ring);
  io_uring_prep_write(sqe, hwpt->fault_fd, resp, sizeof(*resp), 0);
  io_uring_sqe_set_data(sqe, resp);
  io_uring_submit(ring);
  ret = io_uring_wait_cqe(ring, &cqe); 
  ....
Please find the function here[2]

The above cqe wait never returns and hardware times out without receiving
page response. My understanding of io_uring default op is that it tries to 
issue an sqe as non-blocking first. But it looks like the above write sequence
ends up in kernel poll_wait() as well. Not sure how we can avoid that for
write.

All works fine if I comment out the poll for the fault_fd from the kernel.
But then of course Qemu ends up repeatedly reading the ring queue for
any pending page fault.

It might be something I am missing in my understanding of io_uring APIs.
Just thought of checking with you if you have any Qemu implementation
using io_uring APIs to test this.

Also appreciate any pointers in resolving this.

Thanks,
Shameer
[1] https://github.com/hisilicon/qemu/tree/iommufd_vsmmu-02292024-vsva-wip
[2] https://github.com/hisilicon/qemu/blob/2b984fb5c692a03e6f5463d005670d2e2a2c7304/hw/arm/smmuv3.c#L1310
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Jason Gunthorpe 1 year, 9 months ago
On Wed, Mar 20, 2024 at 04:18:05PM +0000, Shameerali Kolothum Thodi wrote:
> 
> What I have noticed is that,
> -read interface works fine and I can receive struct tiommu_hwpt_pgfault data.
> -But once Guest handles the page faults and returns the page response,
>  the write to fault fd never reaches the kernel. The sequence is like below,
>  
>   sqe = io_uring_get_sqe(ring);
>   io_uring_prep_write(sqe, hwpt->fault_fd, resp, sizeof(*resp), 0);
>   io_uring_sqe_set_data(sqe, resp);
>   io_uring_submit(ring);
>   ret = io_uring_wait_cqe(ring, &cqe); 
>   ....
> Please find the function here[2]
> 
> The above cqe wait never returns and hardware times out without receiving
> page response. My understanding of io_uring default op is that it tries to 
> issue an sqe as non-blocking first. But it looks like the above write sequence
> ends up in kernel poll_wait() as well.Not sure how we can avoid that for
> write.

Ah, right, it is because poll can't be choosy about read/write, it has
to work equally for both directions. iommufd_fault_fops_poll() never
returns EPOLLOUT

It should just always return EPOLLOUT because we don't have any queue
to manage.

Jason
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Baolu Lu 1 year, 9 months ago
On 3/23/24 1:22 AM, Jason Gunthorpe wrote:
> On Wed, Mar 20, 2024 at 04:18:05PM +0000, Shameerali Kolothum Thodi wrote:
>> What I have noticed is that,
>> -read interface works fine and I can receive struct tiommu_hwpt_pgfault data.
>> -But once Guest handles the page faults and returns the page response,
>>   the write to fault fd never reaches the kernel. The sequence is like below,
>>   
>>    sqe = io_uring_get_sqe(ring);
>>    io_uring_prep_write(sqe, hwpt->fault_fd, resp, sizeof(*resp), 0);
>>    io_uring_sqe_set_data(sqe, resp);
>>    io_uring_submit(ring);
>>    ret = io_uring_wait_cqe(ring, &cqe);
>>    ....
>> Please find the function here[2]
>>
>> The above cqe wait never returns and hardware times out without receiving
>> page response. My understanding of io_uring default op is that it tries to
>> issue an sqe as non-blocking first. But it looks like the above write sequence
>> ends up in kernel poll_wait() as well.Not sure how we can avoid that for
>> write.
> Ah, right, it is because poll can't be choosy about read/write, it has
> to work equally for both directions. iommufd_fault_fops_poll() never
> returns EPOLLOUT
> 
> It should just always return EPOLLOUT because we don't have any queue
> to manage.

Are you suggesting the poll file operation to be like below?

static __poll_t iommufd_fault_fops_poll(struct file *filep,
                                         struct poll_table_struct *wait)
{
         struct iommufd_fault *fault = filep->private_data;
         __poll_t pollflags = EPOLLOUT;

         poll_wait(filep, &fault->wait_queue, wait);
         mutex_lock(&fault->mutex);
         if (!list_empty(&fault->deliver))
                 pollflags = EPOLLIN | EPOLLRDNORM;
         mutex_unlock(&fault->mutex);

         return pollflags;
}

The diff is,

diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index ede16702d433..a33f8aa92575 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -175,7 +175,7 @@ static __poll_t iommufd_fault_fops_poll(struct file 
*filep,
                                         struct poll_table_struct *wait)
  {
         struct iommufd_fault *fault = filep->private_data;
-       __poll_t pollflags = 0;
+       __poll_t pollflags = EPOLLOUT;

         poll_wait(filep, &fault->wait_queue, wait);
         mutex_lock(&fault->mutex);


I was originally thinking that poll file operation is specifically
designed for polling on read events associated with IOMMU faults.

Best regards,
baolu
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Baolu Lu 1 year, 9 months ago
On 3/25/24 11:26 AM, Baolu Lu wrote:
> On 3/23/24 1:22 AM, Jason Gunthorpe wrote:
>> On Wed, Mar 20, 2024 at 04:18:05PM +0000, Shameerali Kolothum Thodi 
>> wrote:
>>> What I have noticed is that,
>>> -read interface works fine and I can receive struct 
>>> tiommu_hwpt_pgfault data.
>>> -But once Guest handles the page faults and returns the page response,
>>>   the write to fault fd never reaches the kernel. The sequence is 
>>> like below,
>>>    sqe = io_uring_get_sqe(ring);
>>>    io_uring_prep_write(sqe, hwpt->fault_fd, resp, sizeof(*resp), 0);
>>>    io_uring_sqe_set_data(sqe, resp);
>>>    io_uring_submit(ring);
>>>    ret = io_uring_wait_cqe(ring, &cqe);
>>>    ....
>>> Please find the function here[2]
>>>
>>> The above cqe wait never returns and hardware times out without 
>>> receiving
>>> page response. My understanding of io_uring default op is that it 
>>> tries to
>>> issue an sqe as non-blocking first. But it looks like the above write 
>>> sequence
>>> ends up in kernel poll_wait() as well.Not sure how we can avoid that for
>>> write.
>> Ah, right, it is because poll can't be choosy about read/write, it has
>> to work equally for both directions. iommufd_fault_fops_poll() never
>> returns EPOLLOUT
>>
>> It should just always return EPOLLOUT because we don't have any queue
>> to manage.
> 
> Are you suggesting the poll file operation to be like below?
> 
> static __poll_t iommufd_fault_fops_poll(struct file *filep,
>                                          struct poll_table_struct *wait)
> {
>          struct iommufd_fault *fault = filep->private_data;
>          __poll_t pollflags = EPOLLOUT;
> 
>          poll_wait(filep, &fault->wait_queue, wait);
>          mutex_lock(&fault->mutex);
>          if (!list_empty(&fault->deliver))
>                  pollflags = EPOLLIN | EPOLLRDNORM;

If I understood it correctly, here it should be,

		pollflags |= EPOLLIN | EPOLLRDNORM;

>          mutex_unlock(&fault->mutex);
> 
>          return pollflags;
> }

Best regards,
baolu
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Jason Gunthorpe 1 year, 9 months ago
On Mon, Jan 22, 2024 at 03:38:59PM +0800, Lu Baolu wrote:
> --- /dev/null
> +++ b/drivers/iommu/iommufd/fault.c
> @@ -0,0 +1,255 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (C) 2024 Intel Corporation
> + */
> +#define pr_fmt(fmt) "iommufd: " fmt
> +
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/iommufd.h>
> +#include <linux/poll.h>
> +#include <linux/anon_inodes.h>
> +#include <uapi/linux/iommufd.h>
> +
> +#include "iommufd_private.h"
> +
> +static int device_add_fault(struct iopf_group *group)
> +{
> +	struct iommufd_device *idev = group->cookie->private;
> +	void *curr;
> +
> +	curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid,
> +			  NULL, group, GFP_KERNEL);
> +
> +	return curr ? xa_err(curr) : 0;
> +}
> +
> +static void device_remove_fault(struct iopf_group *group)
> +{
> +	struct iommufd_device *idev = group->cookie->private;
> +
> +	xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
> +		 NULL, GFP_KERNEL);

xa_erase ?

Is grpid OK to use this way? Doesn't it come from the originating
device?

> +void iommufd_fault_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
> +	struct iopf_group *group, *next;
> +
> +	mutex_lock(&fault->mutex);
> +	list_for_each_entry_safe(group, next, &fault->deliver, node) {
> +		list_del(&group->node);
> +		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
> +		iopf_free_group(group);
> +	}
> +	list_for_each_entry_safe(group, next, &fault->response, node) {
> +		list_del(&group->node);
> +		device_remove_fault(group);
> +		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
> +		iopf_free_group(group);
> +	}
> +	mutex_unlock(&fault->mutex);
> +
> +	mutex_destroy(&fault->mutex);

It is really weird to lock a mutex we are about to destroy? At this
point the refcount on the iommufd_object is zero so there had better
not be any other threads with access to this pointer!

> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> +				       size_t count, loff_t *ppos)
> +{
> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> +	struct iommufd_fault *fault = filep->private_data;
> +	struct iommu_hwpt_pgfault data;
> +	struct iommufd_device *idev;
> +	struct iopf_group *group;
> +	struct iopf_fault *iopf;
> +	size_t done = 0;
> +	int rc;
> +
> +	if (*ppos || count % fault_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->deliver) && count > done) {
> +		group = list_first_entry(&fault->deliver,
> +					 struct iopf_group, node);
> +
> +		if (list_count_nodes(&group->faults) * fault_size > count - done)
> +			break;
> +
> +		idev = (struct iommufd_device *)group->cookie->private;
> +		list_for_each_entry(iopf, &group->faults, list) {
> +			iommufd_compose_fault_message(&iopf->fault, &data, idev);
> +			rc = copy_to_user(buf + done, &data, fault_size);
> +			if (rc)
> +				goto err_unlock;
> +			done += fault_size;
> +		}
> +
> +		rc = device_add_fault(group);

See I wonder if this should be some xa_alloc or something instead of
trying to use the grpid?

> +	while (!list_empty(&fault->response) && count > done) {
> +		rc = copy_from_user(&response, buf + done, response_size);
> +		if (rc)
> +			break;
> +
> +		idev = container_of(iommufd_get_object(fault->ictx,
> +						       response.dev_id,
> +						       IOMMUFD_OBJ_DEVICE),
> +				    struct iommufd_device, obj);

It seems unfortunate we do this lookup for every iteration of the loop,
I would lift it up and cache it..

> +		if (IS_ERR(idev))
> +			break;
> +
> +		group = device_get_fault(idev, response.grpid);

This looks locked wrong, it should hold the fault mutex here and we
should probably do xchng to zero it at the same time we fetch it.

> +		if (!group ||
> +		    response.addr != group->last_fault.fault.prm.addr ||
> +		    ((group->last_fault.fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
> +		      response.pasid != group->last_fault.fault.prm.pasid)) {

Why? If grpid is unique then just trust it.

> +			iommufd_put_object(fault->ictx, &idev->obj);
> +			break;
> +		}
> +
> +		iopf_group_response(group, response.code);
> +
> +		mutex_lock(&fault->mutex);
> +		list_del(&group->node);
> +		mutex_unlock(&fault->mutex);
> +
> +		device_remove_fault(group);
> +		iopf_free_group(group);
> +		done += response_size;
> +
> +		iommufd_put_object(fault->ictx, &idev->obj);
> +	}
> +
> +	return done;
> +}
> +
> +static __poll_t iommufd_fault_fops_poll(struct file *filep,
> +					struct poll_table_struct *wait)
> +{
> +	struct iommufd_fault *fault = filep->private_data;
> +	__poll_t pollflags = 0;
> +
> +	poll_wait(filep, &fault->wait_queue, wait);
> +	mutex_lock(&fault->mutex);
> +	if (!list_empty(&fault->deliver))
> +		pollflags = EPOLLIN | EPOLLRDNORM;
> +	mutex_unlock(&fault->mutex);
> +
> +	return pollflags;
> +}
> +
> +static const struct file_operations iommufd_fault_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= nonseekable_open,
> +	.read		= iommufd_fault_fops_read,
> +	.write		= iommufd_fault_fops_write,
> +	.poll		= iommufd_fault_fops_poll,
> +	.llseek		= no_llseek,
> +};

No release? That seems wrong..

> +static int get_fault_fd(struct iommufd_fault *fault)
> +{
> +	struct file *filep;
> +	int fdno;
> +
> +	fdno = get_unused_fd_flags(O_CLOEXEC);
> +	if (fdno < 0)
> +		return fdno;
> +
> +	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
> +				   fault, O_RDWR);
                                   ^^^^^^

See here you stick the iommufd_object into the FD but we don't
refcount it...

And iommufd_fault_destroy() doesn't do anything about this, so it just
UAFs the fault memory in the FD.

You need to refcount the iommufd_object here and add a release
function for the fd to put it back

*and* this FD needs to take a reference on the base iommufd_ctx fd too
as we can't tolerate them being destroyed out of sequence.

> +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_fault_alloc *cmd = ucmd->cmd;
> +	struct iommufd_fault *fault;
> +	int rc;
> +
> +	if (cmd->flags)
> +		return -EOPNOTSUPP;
> +
> +	fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
> +	if (IS_ERR(fault))
> +		return PTR_ERR(fault);
> +
> +	fault->ictx = ucmd->ictx;
> +	INIT_LIST_HEAD(&fault->deliver);
> +	INIT_LIST_HEAD(&fault->response);
> +	mutex_init(&fault->mutex);
> +	init_waitqueue_head(&fault->wait_queue);
> +
> +	rc = get_fault_fd(fault);
> +	if (rc < 0)
> +		goto out_abort;

And this is ordered wrong, fd_install has to happen after return to
user space is guaranteed as it cannot be undone.

> +	cmd->out_fault_id = fault->obj.id;
> +	cmd->out_fault_fd = rc;
> +
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_abort;
> +	iommufd_object_finalize(ucmd->ictx, &fault->obj);

fd_install() goes here

> +	return 0;
> +out_abort:
> +	iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
> +
> +	return rc;
> +}

Jason
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Baolu Lu 1 year, 9 months ago
On 3/9/24 2:03 AM, Jason Gunthorpe wrote:
> On Mon, Jan 22, 2024 at 03:38:59PM +0800, Lu Baolu wrote:
>> --- /dev/null
>> +++ b/drivers/iommu/iommufd/fault.c
>> @@ -0,0 +1,255 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/* Copyright (C) 2024 Intel Corporation
>> + */
>> +#define pr_fmt(fmt) "iommufd: " fmt
>> +
>> +#include <linux/file.h>
>> +#include <linux/fs.h>
>> +#include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/iommufd.h>
>> +#include <linux/poll.h>
>> +#include <linux/anon_inodes.h>
>> +#include <uapi/linux/iommufd.h>
>> +
>> +#include "iommufd_private.h"
>> +
>> +static int device_add_fault(struct iopf_group *group)
>> +{
>> +	struct iommufd_device *idev = group->cookie->private;
>> +	void *curr;
>> +
>> +	curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid,
>> +			  NULL, group, GFP_KERNEL);
>> +
>> +	return curr ? xa_err(curr) : 0;
>> +}
>> +
>> +static void device_remove_fault(struct iopf_group *group)
>> +{
>> +	struct iommufd_device *idev = group->cookie->private;
>> +
>> +	xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
>> +		 NULL, GFP_KERNEL);
> 
> xa_erase ?

Yes. Sure.

> Is grpid OK to use this way? Doesn't it come from the originating
> device?

The group ID is generated by the hardware. Here, we use it as an index
in the fault array to ensure it can be quickly retrieved in the page
fault response path.

>> +void iommufd_fault_destroy(struct iommufd_object *obj)
>> +{
>> +	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
>> +	struct iopf_group *group, *next;
>> +
>> +	mutex_lock(&fault->mutex);
>> +	list_for_each_entry_safe(group, next, &fault->deliver, node) {
>> +		list_del(&group->node);
>> +		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
>> +		iopf_free_group(group);
>> +	}
>> +	list_for_each_entry_safe(group, next, &fault->response, node) {
>> +		list_del(&group->node);
>> +		device_remove_fault(group);
>> +		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
>> +		iopf_free_group(group);
>> +	}
>> +	mutex_unlock(&fault->mutex);
>> +
>> +	mutex_destroy(&fault->mutex);
> 
> It is really weird to lock a mutex we are about to destroy? At this
> point the refcount on the iommufd_object is zero so there had better
> not be any other threads with access to this pointer!

You are right. I will remove the lock/unlock and add a comment instead.

> 
>> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
>> +				       size_t count, loff_t *ppos)
>> +{
>> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
>> +	struct iommufd_fault *fault = filep->private_data;
>> +	struct iommu_hwpt_pgfault data;
>> +	struct iommufd_device *idev;
>> +	struct iopf_group *group;
>> +	struct iopf_fault *iopf;
>> +	size_t done = 0;
>> +	int rc;
>> +
>> +	if (*ppos || count % fault_size)
>> +		return -ESPIPE;
>> +
>> +	mutex_lock(&fault->mutex);
>> +	while (!list_empty(&fault->deliver) && count > done) {
>> +		group = list_first_entry(&fault->deliver,
>> +					 struct iopf_group, node);
>> +
>> +		if (list_count_nodes(&group->faults) * fault_size > count - done)
>> +			break;
>> +
>> +		idev = (struct iommufd_device *)group->cookie->private;
>> +		list_for_each_entry(iopf, &group->faults, list) {
>> +			iommufd_compose_fault_message(&iopf->fault, &data, idev);
>> +			rc = copy_to_user(buf + done, &data, fault_size);
>> +			if (rc)
>> +				goto err_unlock;
>> +			done += fault_size;
>> +		}
>> +
>> +		rc = device_add_fault(group);
> 
> See I wonder if this should be some xa_alloc or something instead of
> trying to use the grpid?

So this magic number will be passed to user space in the fault message.
And the user will then include this number in its response message. The
response message is valid only when the magic number matches. Do I get
you correctly?

> 
>> +	while (!list_empty(&fault->response) && count > done) {
>> +		rc = copy_from_user(&response, buf + done, response_size);
>> +		if (rc)
>> +			break;
>> +
>> +		idev = container_of(iommufd_get_object(fault->ictx,
>> +						       response.dev_id,
>> +						       IOMMUFD_OBJ_DEVICE),
>> +				    struct iommufd_device, obj);
> 
> It seems unfortunate we do this lookup for every iteration of the loop,
> I would lift it up and cache it..

Okay.

> 
>> +		if (IS_ERR(idev))
>> +			break;
>> +
>> +		group = device_get_fault(idev, response.grpid);
> 
> This looks locked wrong, it should hold the fault mutex here and we
> should probably do xchg to zero it at the same time we fetch it.

Okay, will fix it.

> 
>> +		if (!group ||
>> +		    response.addr != group->last_fault.fault.prm.addr ||
>> +		    ((group->last_fault.fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
>> +		      response.pasid != group->last_fault.fault.prm.pasid)) {
> 
> Why? If grpid is unique then just trust it.

I was just considering that we should add some sanity check here to
ensure the response message is composed in the right way.

> 
>> +			iommufd_put_object(fault->ictx, &idev->obj);
>> +			break;
>> +		}
>> +
>> +		iopf_group_response(group, response.code);
>> +
>> +		mutex_lock(&fault->mutex);
>> +		list_del(&group->node);
>> +		mutex_unlock(&fault->mutex);
>> +
>> +		device_remove_fault(group);
>> +		iopf_free_group(group);
>> +		done += response_size;
>> +
>> +		iommufd_put_object(fault->ictx, &idev->obj);
>> +	}
>> +
>> +	return done;
>> +}
>> +
>> +static __poll_t iommufd_fault_fops_poll(struct file *filep,
>> +					struct poll_table_struct *wait)
>> +{
>> +	struct iommufd_fault *fault = filep->private_data;
>> +	__poll_t pollflags = 0;
>> +
>> +	poll_wait(filep, &fault->wait_queue, wait);
>> +	mutex_lock(&fault->mutex);
>> +	if (!list_empty(&fault->deliver))
>> +		pollflags = EPOLLIN | EPOLLRDNORM;
>> +	mutex_unlock(&fault->mutex);
>> +
>> +	return pollflags;
>> +}
>> +
>> +static const struct file_operations iommufd_fault_fops = {
>> +	.owner		= THIS_MODULE,
>> +	.open		= nonseekable_open,
>> +	.read		= iommufd_fault_fops_read,
>> +	.write		= iommufd_fault_fops_write,
>> +	.poll		= iommufd_fault_fops_poll,
>> +	.llseek		= no_llseek,
>> +};
> 
> No release? That seems wrong..

Yes. Will fix it.

> 
>> +static int get_fault_fd(struct iommufd_fault *fault)
>> +{
>> +	struct file *filep;
>> +	int fdno;
>> +
>> +	fdno = get_unused_fd_flags(O_CLOEXEC);
>> +	if (fdno < 0)
>> +		return fdno;
>> +
>> +	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
>> +				   fault, O_RDWR);
>                                     ^^^^^^
> 
> See here you stick the iommufd_object into the FD but we don't
> refcount it...
> 
> And iommufd_fault_destroy() doesn't do anything about this, so it just
> UAFs the fault memory in the FD.
> 
> You need to refcount the iommufd_object here and add a release
> function for the fd to put it back
> 
> *and* this FD needs to take a reference on the base iommufd_ctx fd too
> as we can't tolerate them being destroyed out of sequence.

Good catch. I will fix it.

> 
>> +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
>> +{
>> +	struct iommu_fault_alloc *cmd = ucmd->cmd;
>> +	struct iommufd_fault *fault;
>> +	int rc;
>> +
>> +	if (cmd->flags)
>> +		return -EOPNOTSUPP;
>> +
>> +	fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
>> +	if (IS_ERR(fault))
>> +		return PTR_ERR(fault);
>> +
>> +	fault->ictx = ucmd->ictx;
>> +	INIT_LIST_HEAD(&fault->deliver);
>> +	INIT_LIST_HEAD(&fault->response);
>> +	mutex_init(&fault->mutex);
>> +	init_waitqueue_head(&fault->wait_queue);
>> +
>> +	rc = get_fault_fd(fault);
>> +	if (rc < 0)
>> +		goto out_abort;
> 
> And this is ordered wrong, fd_install has to happen after return to
> user space is guaranteed as it cannot be undone.
> 
>> +	cmd->out_fault_id = fault->obj.id;
>> +	cmd->out_fault_fd = rc;
>> +
>> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
>> +	if (rc)
>> +		goto out_abort;
>> +	iommufd_object_finalize(ucmd->ictx, &fault->obj);
> 
> fd_install() goes here

Yes. Sure.

> 
>> +	return 0;
>> +out_abort:
>> +	iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
>> +
>> +	return rc;
>> +}
> 
> Jason

Best regards,
baolu
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Jason Gunthorpe 1 year, 9 months ago
On Fri, Mar 15, 2024 at 09:46:06AM +0800, Baolu Lu wrote:
> On 3/9/24 2:03 AM, Jason Gunthorpe wrote:
> > On Mon, Jan 22, 2024 at 03:38:59PM +0800, Lu Baolu wrote:
> > > --- /dev/null
> > > +++ b/drivers/iommu/iommufd/fault.c
> > > @@ -0,0 +1,255 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Copyright (C) 2024 Intel Corporation
> > > + */
> > > +#define pr_fmt(fmt) "iommufd: " fmt
> > > +
> > > +#include <linux/file.h>
> > > +#include <linux/fs.h>
> > > +#include <linux/module.h>
> > > +#include <linux/mutex.h>
> > > +#include <linux/iommufd.h>
> > > +#include <linux/poll.h>
> > > +#include <linux/anon_inodes.h>
> > > +#include <uapi/linux/iommufd.h>
> > > +
> > > +#include "iommufd_private.h"
> > > +
> > > +static int device_add_fault(struct iopf_group *group)
> > > +{
> > > +	struct iommufd_device *idev = group->cookie->private;
> > > +	void *curr;
> > > +
> > > +	curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid,
> > > +			  NULL, group, GFP_KERNEL);
> > > +
> > > +	return curr ? xa_err(curr) : 0;
> > > +}
> > > +
> > > +static void device_remove_fault(struct iopf_group *group)
> > > +{
> > > +	struct iommufd_device *idev = group->cookie->private;
> > > +
> > > +	xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
> > > +		 NULL, GFP_KERNEL);
> > 
> > xa_erase ?
> 
> Yes. Sure.
> 
> > Is grpid OK to use this way? Doesn't it come from the originating
> > device?
> 
> The group ID is generated by the hardware. Here, we use it as an index
> in the fault array to ensure it can be quickly retrieved in the page
> fault response path.

I'm nervous about this, we are trusting HW outside the kernel to
provide unique grp id's which are integral to how the kernel
operates..

> > > +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> > > +				       size_t count, loff_t *ppos)
> > > +{
> > > +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> > > +	struct iommufd_fault *fault = filep->private_data;
> > > +	struct iommu_hwpt_pgfault data;
> > > +	struct iommufd_device *idev;
> > > +	struct iopf_group *group;
> > > +	struct iopf_fault *iopf;
> > > +	size_t done = 0;
> > > +	int rc;
> > > +
> > > +	if (*ppos || count % fault_size)
> > > +		return -ESPIPE;
> > > +
> > > +	mutex_lock(&fault->mutex);
> > > +	while (!list_empty(&fault->deliver) && count > done) {
> > > +		group = list_first_entry(&fault->deliver,
> > > +					 struct iopf_group, node);
> > > +
> > > +		if (list_count_nodes(&group->faults) * fault_size > count - done)
> > > +			break;
> > > +
> > > +		idev = (struct iommufd_device *)group->cookie->private;
> > > +		list_for_each_entry(iopf, &group->faults, list) {
> > > +			iommufd_compose_fault_message(&iopf->fault, &data, idev);
> > > +			rc = copy_to_user(buf + done, &data, fault_size);
> > > +			if (rc)
> > > +				goto err_unlock;
> > > +			done += fault_size;
> > > +		}
> > > +
> > > +		rc = device_add_fault(group);
> > 
> > See I wonder if this should be some xa_alloc or something instead of
> > trying to use the grpid?
> 
> So this magic number will be passed to user space in the fault message.
> And the user will then include this number in its response message. The
> response message is valid only when the magic number matches. Do I get
> you correctly?

Yes, then it is simple xa_alloc() and xa_load() without any other
searching and we don't have to rely on the grpid to be correctly
formed by the PCI device.

But I don't know about performance xa_alloc() is pretty fast but
trusting the grpid would be faster..

IMHO from a uapi perspective we should have a definite "cookie" that
gets echo'd back. If the kernel uses xa_alloc or grpid to build that
cookie it doesn't matter to the uAPI.

Jason
Re: [PATCH v3 4/8] iommufd: Add iommufd fault object
Posted by Baolu Lu 1 year, 9 months ago
On 2024/3/23 1:09, Jason Gunthorpe wrote:
> On Fri, Mar 15, 2024 at 09:46:06AM +0800, Baolu Lu wrote:
>> On 3/9/24 2:03 AM, Jason Gunthorpe wrote:
>>> On Mon, Jan 22, 2024 at 03:38:59PM +0800, Lu Baolu wrote:
>>>> --- /dev/null
>>>> +++ b/drivers/iommu/iommufd/fault.c
>>>> @@ -0,0 +1,255 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +/* Copyright (C) 2024 Intel Corporation
>>>> + */
>>>> +#define pr_fmt(fmt) "iommufd: " fmt
>>>> +
>>>> +#include <linux/file.h>
>>>> +#include <linux/fs.h>
>>>> +#include <linux/module.h>
>>>> +#include <linux/mutex.h>
>>>> +#include <linux/iommufd.h>
>>>> +#include <linux/poll.h>
>>>> +#include <linux/anon_inodes.h>
>>>> +#include <uapi/linux/iommufd.h>
>>>> +
>>>> +#include "iommufd_private.h"
>>>> +
>>>> +static int device_add_fault(struct iopf_group *group)
>>>> +{
>>>> +	struct iommufd_device *idev = group->cookie->private;
>>>> +	void *curr;
>>>> +
>>>> +	curr = xa_cmpxchg(&idev->faults, group->last_fault.fault.prm.grpid,
>>>> +			  NULL, group, GFP_KERNEL);
>>>> +
>>>> +	return curr ? xa_err(curr) : 0;
>>>> +}
>>>> +
>>>> +static void device_remove_fault(struct iopf_group *group)
>>>> +{
>>>> +	struct iommufd_device *idev = group->cookie->private;
>>>> +
>>>> +	xa_store(&idev->faults, group->last_fault.fault.prm.grpid,
>>>> +		 NULL, GFP_KERNEL);
>>>
>>> xa_erase ?
>>
>> Yes. Sure.
>>
>>> Is grpid OK to use this way? Doesn't it come from the originating
>>> device?
>>
>> The group ID is generated by the hardware. Here, we use it as an index
>> in the fault array to ensure it can be quickly retrieved in the page
>> fault response path.
> 
> I'm nervous about this, we are trusting HW outside the kernel to
> provide unique grp id's which are integral to how the kernel
> operates..

Agreed.

> 
>>>> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
>>>> +				       size_t count, loff_t *ppos)
>>>> +{
>>>> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
>>>> +	struct iommufd_fault *fault = filep->private_data;
>>>> +	struct iommu_hwpt_pgfault data;
>>>> +	struct iommufd_device *idev;
>>>> +	struct iopf_group *group;
>>>> +	struct iopf_fault *iopf;
>>>> +	size_t done = 0;
>>>> +	int rc;
>>>> +
>>>> +	if (*ppos || count % fault_size)
>>>> +		return -ESPIPE;
>>>> +
>>>> +	mutex_lock(&fault->mutex);
>>>> +	while (!list_empty(&fault->deliver) && count > done) {
>>>> +		group = list_first_entry(&fault->deliver,
>>>> +					 struct iopf_group, node);
>>>> +
>>>> +		if (list_count_nodes(&group->faults) * fault_size > count - done)
>>>> +			break;
>>>> +
>>>> +		idev = (struct iommufd_device *)group->cookie->private;
>>>> +		list_for_each_entry(iopf, &group->faults, list) {
>>>> +			iommufd_compose_fault_message(&iopf->fault, &data, idev);
>>>> +			rc = copy_to_user(buf + done, &data, fault_size);
>>>> +			if (rc)
>>>> +				goto err_unlock;
>>>> +			done += fault_size;
>>>> +		}
>>>> +
>>>> +		rc = device_add_fault(group);
>>>
>>> See I wonder if this should be some xa_alloc or something instead of
>>> trying to use the grpid?
>>
>> So this magic number will be passed to user space in the fault message.
>> And the user will then include this number in its response message. The
>> response message is valid only when the magic number matches. Do I get
>> you correctly?
> 
> Yes, then it is simple xa_alloc() and xa_load() without any other
> searching and we don't have to rely on the grpid to be correctly
> formed by the PCI device.
> 
> But I don't know about performance xa_alloc() is pretty fast but
> trusting the grpid would be faster..
> 
> IMHO from a uapi perspective we should have a definate "cookie" that
> gets echo'd back. If the kernel uses xa_alloc or grpid to build that
> cookie it doesn't matter to the uAPI.

Okay, I will head in this direction.

Best regards,
baolu