[v8] IOMMUFD: Deliver IO page faults to user space

[PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Lu Baolu 1 year, 7 months ago

An iommufd fault object provides an interface for delivering I/O page
faults to user space. These objects are created and destroyed by user
space, and they can be associated with or dissociated from hardware page
table objects during page table allocation or destruction.

User space interacts with the fault object through a file interface. This
interface offers a straightforward and efficient way for user space to
handle page faults. It allows user space to read fault messages
sequentially and respond to them by writing to the same file. The file
interface supports reading messages in poll mode, so it's recommended that
user space applications use io_uring to enhance read and write efficiency.

A fault object can be associated with any iopf-capable iommufd_hw_pgtable
during the pgtable's allocation. All I/O page faults triggered by devices
when accessing the I/O addresses of an iommufd_hw_pgtable are routed
through the fault object to user space. Similarly, user space's responses
to these page faults are routed back to the iommu device driver through
the same fault object.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
 include/linux/iommu.h                   |   4 +
 drivers/iommu/iommufd/iommufd_private.h |  30 ++++
 include/uapi/linux/iommufd.h            |  18 ++
 drivers/iommu/io-pgfault.c              |   2 +
 drivers/iommu/iommufd/fault.c           | 226 ++++++++++++++++++++++++
 drivers/iommu/iommufd/main.c            |   6 +
 drivers/iommu/iommufd/Makefile          |   1 +
 7 files changed, 287 insertions(+)
 create mode 100644 drivers/iommu/iommufd/fault.c

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 910aec80886e..73bc3aee95a1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -124,12 +124,16 @@ struct iopf_fault {
 struct iopf_group {
 	struct iopf_fault last_fault;
 	struct list_head faults;
+	size_t fault_count;
 	/* list node for iommu_fault_param::faults */
 	struct list_head pending_node;
 	struct work_struct work;
 	struct iommu_attach_handle *attach_handle;
 	/* The device's fault data parameter. */
 	struct iommu_fault_param *fault_param;
+	/* Used by handler provider to hook the group on its own lists. */
+	struct list_head node;
+	u32 cookie;
 };
 
 /**
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..c8a4519f1405 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -128,6 +128,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_HWPT_NESTED,
 	IOMMUFD_OBJ_IOAS,
 	IOMMUFD_OBJ_ACCESS,
+	IOMMUFD_OBJ_FAULT,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -426,6 +427,35 @@ void iopt_remove_access(struct io_pagetable *iopt,
 			u32 iopt_access_list_id);
 void iommufd_access_destroy_object(struct iommufd_object *obj);
 
+/*
+ * An iommufd_fault object represents an interface to deliver I/O page faults
+ * to the user space. These objects are created/destroyed by the user space and
+ * associated with hardware page table objects during page-table allocation.
+ */
+struct iommufd_fault {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct file *filep;
+
+	/* The lists of outstanding faults protected by below mutex. */
+	struct mutex mutex;
+	struct list_head deliver;
+	struct xarray response;
+
+	struct wait_queue_head wait_queue;
+};
+
+struct iommufd_attach_handle {
+	struct iommu_attach_handle handle;
+	struct iommufd_device *idev;
+};
+
+/* Convert an iommu attach handle to iommufd handle. */
+#define to_iommufd_handle(hdl)	container_of(hdl, struct iommufd_attach_handle, handle)
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_fault_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 4d89ed97b533..70b8a38fcd46 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -50,6 +50,7 @@ enum {
 	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
 	IOMMUFD_CMD_HWPT_INVALIDATE,
+	IOMMUFD_CMD_FAULT_QUEUE_ALLOC,
 };
 
 /**
@@ -775,4 +776,21 @@ struct iommu_hwpt_page_response {
 	__u32 cookie;
 	__u32 code;
 };
+
+/**
+ * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_fault_alloc)
+ * @flags: Must be 0
+ * @out_fault_id: The ID of the new FAULT
+ * @out_fault_fd: The fd of the new FAULT
+ *
+ * Explicitly allocate a fault handling object.
+ */
+struct iommu_fault_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_fault_id;
+	__u32 out_fault_fd;
+};
+#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
 #endif
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 7c9011992d3f..cd679c13752e 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -110,6 +110,8 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
 	list_add(&group->pending_node, &iopf_param->faults);
 	mutex_unlock(&iopf_param->lock);
 
+	group->fault_count = list_count_nodes(&group->faults);
+
 	return group;
 }
 
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
new file mode 100644
index 000000000000..68ff94671d48
--- /dev/null
+++ b/drivers/iommu/iommufd/fault.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/iommufd.h>
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/iommufd.h>
+
+#include "../iommu-priv.h"
+#include "iommufd_private.h"
+
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
+	struct iopf_group *group, *next;
+
+	/*
+	 * The iommufd object's reference count is zero at this point.
+	 * We can be confident that no other threads are currently
+	 * accessing this pointer. Therefore, acquiring the mutex here
+	 * is unnecessary.
+	 */
+	list_for_each_entry_safe(group, next, &fault->deliver, node) {
+		list_del(&group->node);
+		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+		iopf_free_group(group);
+	}
+}
+
+static void iommufd_compose_fault_message(struct iommu_fault *fault,
+					  struct iommu_hwpt_pgfault *hwpt_fault,
+					  struct iommufd_device *idev,
+					  u32 cookie)
+{
+	hwpt_fault->flags = fault->prm.flags;
+	hwpt_fault->dev_id = idev->obj.id;
+	hwpt_fault->pasid = fault->prm.pasid;
+	hwpt_fault->grpid = fault->prm.grpid;
+	hwpt_fault->perm = fault->prm.perm;
+	hwpt_fault->addr = fault->prm.addr;
+	hwpt_fault->length = 0;
+	hwpt_fault->cookie = cookie;
+}
+
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_pgfault data;
+	struct iommufd_device *idev;
+	struct iopf_group *group;
+	struct iopf_fault *iopf;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos || count % fault_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (!list_empty(&fault->deliver) && count > done) {
+		group = list_first_entry(&fault->deliver,
+					 struct iopf_group, node);
+
+		if (group->fault_count * fault_size > count - done)
+			break;
+
+		rc = xa_alloc(&fault->response, &group->cookie, group,
+			      xa_limit_32b, GFP_KERNEL);
+		if (rc)
+			break;
+
+		idev = to_iommufd_handle(group->attach_handle)->idev;
+		list_for_each_entry(iopf, &group->faults, list) {
+			iommufd_compose_fault_message(&iopf->fault,
+						      &data, idev,
+						      group->cookie);
+			if (copy_to_user(buf + done, &data, fault_size)) {
+				xa_erase(&fault->response, group->cookie);
+				rc = -EFAULT;
+				break;
+			}
+			done += fault_size;
+		}
+
+		list_del(&group->node);
+	}
+	mutex_unlock(&fault->mutex);
+
+	return done == 0 ? rc : done;
+}
+
+static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	size_t response_size = sizeof(struct iommu_hwpt_page_response);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_page_response response;
+	struct iopf_group *group;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos || count % response_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (count > done) {
+		rc = copy_from_user(&response, buf + done, response_size);
+		if (rc)
+			break;
+
+		group = xa_erase(&fault->response, response.cookie);
+		if (!group) {
+			rc = -EINVAL;
+			break;
+		}
+
+		iopf_group_response(group, response.code);
+		iopf_free_group(group);
+		done += response_size;
+	}
+	mutex_unlock(&fault->mutex);
+
+	return done == 0 ? rc : done;
+}
+
+static __poll_t iommufd_fault_fops_poll(struct file *filep,
+					struct poll_table_struct *wait)
+{
+	struct iommufd_fault *fault = filep->private_data;
+	__poll_t pollflags = EPOLLOUT;
+
+	poll_wait(filep, &fault->wait_queue, wait);
+	mutex_lock(&fault->mutex);
+	if (!list_empty(&fault->deliver))
+		pollflags |= EPOLLIN | EPOLLRDNORM;
+	mutex_unlock(&fault->mutex);
+
+	return pollflags;
+}
+
+static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
+{
+	struct iommufd_fault *fault = filep->private_data;
+
+	refcount_dec(&fault->obj.users);
+	iommufd_ctx_put(fault->ictx);
+	return 0;
+}
+
+static const struct file_operations iommufd_fault_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.read		= iommufd_fault_fops_read,
+	.write		= iommufd_fault_fops_write,
+	.poll		= iommufd_fault_fops_poll,
+	.release	= iommufd_fault_fops_release,
+	.llseek		= no_llseek,
+};
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_fault_alloc *cmd = ucmd->cmd;
+	struct iommufd_fault *fault;
+	struct file *filep;
+	int fdno;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
+	if (IS_ERR(fault))
+		return PTR_ERR(fault);
+
+	fault->ictx = ucmd->ictx;
+	INIT_LIST_HEAD(&fault->deliver);
+	xa_init_flags(&fault->response, XA_FLAGS_ALLOC1);
+	mutex_init(&fault->mutex);
+	init_waitqueue_head(&fault->wait_queue);
+
+	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
+				   fault, O_RDWR);
+	if (IS_ERR(filep)) {
+		rc = PTR_ERR(filep);
+		goto out_abort;
+	}
+
+	refcount_inc(&fault->obj.users);
+	iommufd_ctx_get(fault->ictx);
+	fault->filep = filep;
+
+	fdno = get_unused_fd_flags(O_CLOEXEC);
+	if (fdno < 0) {
+		rc = fdno;
+		goto out_fput;
+	}
+
+	cmd->out_fault_id = fault->obj.id;
+	cmd->out_fault_fd = fdno;
+
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_put_fdno;
+	iommufd_object_finalize(ucmd->ictx, &fault->obj);
+
+	fd_install(fdno, fault->filep);
+
+	return 0;
+out_put_fdno:
+	put_unused_fd(fdno);
+out_fput:
+	fput(filep);
+	refcount_dec(&fault->obj.users);
+	iommufd_ctx_put(fault->ictx);
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
+
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b32932c61e..83bbd7c5d160 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -319,6 +319,7 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
 
 union ucmd_buffer {
 	struct iommu_destroy destroy;
+	struct iommu_fault_alloc fault;
 	struct iommu_hw_info info;
 	struct iommu_hwpt_alloc hwpt;
 	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
@@ -355,6 +356,8 @@ struct iommufd_ioctl_op {
 	}
 static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+	IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc,
+		 out_fault_fd),
 	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
@@ -513,6 +516,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 		.destroy = iommufd_hwpt_nested_destroy,
 		.abort = iommufd_hwpt_nested_abort,
 	},
+	[IOMMUFD_OBJ_FAULT] = {
+		.destroy = iommufd_fault_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 34b446146961..cf4605962bea 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
 	device.o \
+	fault.o \
 	hw_pagetable.o \
 	io_pagetable.o \
 	ioas.o \
-- 
2.34.1

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Nicolin Chen 1 year, 7 months ago

Hi Baolu,

On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:

> An iommufd fault object provides an interface for delivering I/O page
> faults to user space. These objects are created and destroyed by user
> space, and they can be associated with or dissociated from hardware page
> table objects during page table allocation or destruction.
> 
> User space interacts with the fault object through a file interface. This
> interface offers a straightforward and efficient way for user space to
> handle page faults. It allows user space to read fault messages
> sequentially and respond to them by writing to the same file. The file
> interface supports reading messages in poll mode, so it's recommended that
> user space applications use io_uring to enhance read and write efficiency.
> 
> A fault object can be associated with any iopf-capable iommufd_hw_pgtable
> during the pgtable's allocation. All I/O page faults triggered by devices
> when accessing the I/O addresses of an iommufd_hw_pgtable are routed
> through the fault object to user space. Similarly, user space's responses
> to these page faults are routed back to the iommu device driver through
> the same fault object.

There is a need for VIOMMU object to report HW fault to VMM. For
example, a HW-accelerated VCMDQ may encounter HW errors. HW will
raise an IRQ to the host kernel and the host kernel will forward
it to the guest. I think we can have a viommu->fault, similar to
the hwpt->fault introduced by this series. This viommu->fault can
also benefit nested IOMMU for reporting translation error.

I learned that this hwpt->fault is exclusively for IOPF/PRI, and
Jason suggested that I add a different one for VIOMMU. Yet, after
taking a closer look, I found that the fault object in this series
is seemingly quite generic at the uAPI level: its naming/structure
and the way it's allocated and passed to hwpt, despite its fops
code being highly correlated with IOPF. So, I feel that we might
have a chance of reusing it for different fault types:

+enum iommu_fault_type {
+	IOMMU_FAULT_TYPE_HWPT_IOPF,
+	IOMMU_FAULT_TYPE_VIOMMU_IRQ,
+};

 struct iommu_fault_alloc {
 	__u32 size;
 	__u32 flags;
+	__u32 type;  /* enum iommu_fault_type */
 	__u32 out_fault_id;
 	__u32 out_fault_fd;
 };

I understand that this is already v8. So, maybe we can, for now,
apply the small diff above with an IOMMU_FAULT_TYPE_HWPT_IOPF type
check in the ioctl handler. And a decoupling for the iopf fops in
the ioctl handler can come later in the viommu series:
	switch (type) {
	case IOMMU_FAULT_TYPE_HWPT_IOPF:
		filep = anon_inode_getfile("[iommufd-pgfault]",
					   &iommufd_fault_fops_iopf);
	case IOMMU_FAULT_TYPE_VIOMMU_IRQ:
		filep = anon_inode_getfile("[iommufd-viommu-irq]",
					   &iommufd_fault_fops_viommu);
	default:
		return -EOPNOTSUPP;
	}

Since you are the designer here, I think you have a better 10000
foot view -- maybe I am missing something here implying that the
fault object can't be really reused by viommu.

Would you mind sharing some thoughts here?

Thanks
Nic

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Jason Gunthorpe 1 year, 7 months ago

On Wed, Jul 03, 2024 at 04:06:15PM -0700, Nicolin Chen wrote:

> I learned that this hwpt->fault is exclusively for IOPF/PRI. And
> Jason suggested me to add a different one for VIOMMU. Yet, after
> taking a closer look, I found the fault object in this series is
> seemingly quite generic at the uAPI level: its naming/structure,
> and the way how it's allocated and passed to hwpt, despite being
> highly correlated with IOPF in its fops code. So, I feel that we
> might have a chance of reusing it for different fault types:
> 
> +enum iommu_fault_type {
> +	IOMMU_FAULT_TYPE_HWPT_IOPF,
> +	IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> +};
> 
>  struct iommu_fault_alloc {
>  	__u32 size;
>  	__u32 flags;
> +	__u32 type;  /* enum iommu_fault_type */
>  	__u32 out_fault_id;
>  	__u32 out_fault_fd;
>  };

I think I would just add the type at the end of the struct and rely on
our existing 0 is backwards compat mechanism. 0 means HWPT_IOPF. ie no
need to do anything now.

It would make some sense to call this a "report" object rather than
"fault" if we are going to use it for different things. We could
probably rename it without much trouble. There is also not a
significant issue with having two alloc commands for FDs.

I'd also think VIOMMU_IRQ is probably not the right abstraction,
likely it makes more sense to push driver-specific event messages sort
of like IOPF and one of the messages can indicate an arm-smmu-v3 VCMDQ
interrupt, other messages could indicate BAD_CD and similar sorts of
events we might want to capture and forward.

So, I'm inclined to just take this series as-is

Jason

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Nicolin Chen 1 year, 7 months ago

On Mon, Jul 08, 2024 at 01:29:57PM -0300, Jason Gunthorpe wrote:
> On Wed, Jul 03, 2024 at 04:06:15PM -0700, Nicolin Chen wrote:
> 
> > I learned that this hwpt->fault is exclusively for IOPF/PRI. And
> > Jason suggested me to add a different one for VIOMMU. Yet, after
> > taking a closer look, I found the fault object in this series is
> > seemingly quite generic at the uAPI level: its naming/structure,
> > and the way how it's allocated and passed to hwpt, despite being
> > highly correlated with IOPF in its fops code. So, I feel that we
> > might have a chance of reusing it for different fault types:
> >
> > +enum iommu_fault_type {
> > +     IOMMU_FAULT_TYPE_HWPT_IOPF,
> > +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> > +};
> >
> >  struct iommu_fault_alloc {
> >       __u32 size;
> >       __u32 flags;
> > +     __u32 type;  /* enum iommu_fault_type */
> >       __u32 out_fault_id;
> >       __u32 out_fault_fd;
> >  };
> 
> I think I would just add the type at the end of the struct and rely on
> our existing 0 is backwards compat mechanism. 0 means HWPT_IOPF. ie no
> need to do anything now.

Yea, I figured that it would work too, so let's add one in the
VIOMMU series (if we eventually decide to reuse the same ioctl).

> It would make some sense to call this a "report" object than "fault"
> if we are going to use it for different things. We could probably
> rename it without much trouble. There is also not a significant issue
> with having two alloc commands for FDs.

Ack.

> I'd also think VIOMMU_IRQ is probably not that right abstraction,
> likely it makes more sense to push driver-specific event messages sort
> of like IOPF and one of the messages can indicate a arm-smmu-v3 VCDMQ
> interrupt, other messages could indicate BAD_CD and similar sorts of
> events we might want to capture and forward.

Maybe something like this?

struct iommu_viommu_event_arm_smmuv3 {
	u64 evt[4];
};

struct iommu_viommu_event_tegra241_cmdqv {
	u64 vcmdq_err_map[2];
};

enum iommu_event_type {
	IOMMU_HWPT_EVENT_TYPE_IOPF,
	IOMMU_VIOMMU_EVENT_TYPE_SMMUv3,
	IOMMU_VIOMMU_EVENT_TYPE_TEGRA241_CMDQV,
};

struct iommu_event_alloc {
	__u32 size;
	__u32 flags;
	__u32 out_event_id;
	__u32 out_event_fd;
	__u32 type;
	__u32 _reserved;
};

It can be "report" if you prefer.

Thanks
Nicolin

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Jason Gunthorpe 1 year, 7 months ago

On Mon, Jul 08, 2024 at 11:36:57AM -0700, Nicolin Chen wrote:
> Maybe something like this?
> 
> struct iommu_viommu_event_arm_smmuv3 {
> 	u64 evt[4];
> };
> 
> struct iommu_viommu_event_tegra241_cmdqv {
> 	u64 vcmdq_err_map[2];
> };
> 
> enum iommu_event_type {
> 	IOMMM_HWPT_EVENT_TYPE_IOPF,
> 	IOMMU_VIOMMU_EVENT_TYPE_SMMUv3,
> 	IOMMU_VIOMMU_EVENT_TYPE_TEGRA241_CMDQV,
> };
> 
> struct iommu_event_alloc {
> 	__u32 size;
> 	__u32 flags;
> 	__u32 out_event_id;
> 	__u32 out_event_fd;
> 	__u32 type;
> 	__u32 _reserved;
> };
> 
> It can be "report" if you prefer.

Yeah, something like that makes sense to me. The other question is if
you want to multiplex the SMMUv3 and CMDQV on the same FD?

Or multiplex physical smmus on the same FD.

We are potentially talking about 5-10 physical smmus and 2-3 FDs per
physical? Does that scare anyone?

Jason

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Nicolin Chen 1 year, 7 months ago

On Tue, Jul 09, 2024 at 02:00:38PM -0300, Jason Gunthorpe wrote:
> On Mon, Jul 08, 2024 at 11:36:57AM -0700, Nicolin Chen wrote:
> > Maybe something like this?
> >
> > struct iommu_viommu_event_arm_smmuv3 {
> >       u64 evt[4];
> > };
> >
> > struct iommu_viommu_event_tegra241_cmdqv {
> >       u64 vcmdq_err_map[2];
> > };
> >
> > enum iommu_event_type {
> >       IOMMM_HWPT_EVENT_TYPE_IOPF,
> >       IOMMU_VIOMMU_EVENT_TYPE_SMMUv3,
> >       IOMMU_VIOMMU_EVENT_TYPE_TEGRA241_CMDQV,
> > };
> >
> > struct iommu_event_alloc {
> >       __u32 size;
> >       __u32 flags;
> >       __u32 out_event_id;
> >       __u32 out_event_fd;
> >       __u32 type;
> >       __u32 _reserved;
> > };
> >
> > It can be "report" if you prefer.
> 
> Yeah, something like that makes sense to me. The other question is if
> you want to multiplex the SMMUv3 and CMDQV on the same FD?

I think at least SMMUv3 and CMDQV could be the same FD. IMHO,
a TEGRA241_CMDQV type VIOMMU should include all the features
of SMMUv3 type... otherwise, we would have two VIOMMU objects
per pSMMU on Grace, which doesn't seem to make sense either.

> Or multiplex physical smmus on the same FD.
> 
> We are potentially talking about 5-10 physical smmus and 2-3 FDs per
> physical? Does that scare anyone?

I think we can share the same FD by adding a viommu_id somewhere
to indicate what the data/event belongs to. Yet, it seemed that
you had a counter-argument that a shared FD (queue) might have a
security concern as well?
https://lore.kernel.org/linux-iommu/20240522232833.GH20229@nvidia.com/

Thanks
Nicolin

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Jason Gunthorpe 1 year, 7 months ago

On Tue, Jul 09, 2024 at 10:33:42AM -0700, Nicolin Chen wrote:

> > We are potentially talking about 5-10 physical smmus and 2-3 FDs per
> > physical? Does that scare anyone?
> 
> I think we can share the same FD by adding a viommu_id somewhere
> to indicate what the data/event belongs to. Yet, it seemed that
> you had a counter-argument that a shared FD (queue) might have a
> security concern as well?
> https://lore.kernel.org/linux-iommu/20240522232833.GH20229@nvidia.com/

That was for the physical HW queue not so much the FD.

We need to be mindful that these events can't DOS the hypervisor, that
constrains how we track pending events in the kernel, not how they get
marshaled to FDs to deliver to user space.

Thinking more, it makes sense that a FD would tie 1:1 with a queue in
the VM.

That way backpressure on a queue will not cause head of line blocking
to other queues because they multiplex onto a single FD.

Jason

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Baolu Lu 1 year, 7 months ago

On 7/4/24 7:06 AM, Nicolin Chen wrote:
> Hi Baolu,

Hi Nicolin,

> On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:
> 
>> An iommufd fault object provides an interface for delivering I/O page
>> faults to user space. These objects are created and destroyed by user
>> space, and they can be associated with or dissociated from hardware page
>> table objects during page table allocation or destruction.
>>
>> User space interacts with the fault object through a file interface. This
>> interface offers a straightforward and efficient way for user space to
>> handle page faults. It allows user space to read fault messages
>> sequentially and respond to them by writing to the same file. The file
>> interface supports reading messages in poll mode, so it's recommended that
>> user space applications use io_uring to enhance read and write efficiency.
>>
>> A fault object can be associated with any iopf-capable iommufd_hw_pgtable
>> during the pgtable's allocation. All I/O page faults triggered by devices
>> when accessing the I/O addresses of an iommufd_hw_pgtable are routed
>> through the fault object to user space. Similarly, user space's responses
>> to these page faults are routed back to the iommu device driver through
>> the same fault object.
> There is a need for VIOMMU object to report HW fault to VMM. For
> example, a HW-accelerated VCMDQ may encounter HW errors. HW will
> raise an IRQ to the host kernel and the host kernel will forward
> it to the guest. I think we can have a viommu->fault, similar to
> the hwpt->fault introduced by this series. This viommu->fault can
> also benefit nested IOMMU for reporting translation error.
> 
> I learned that this hwpt->fault is exclusively for IOPF/PRI. And
> Jason suggested me to add a different one for VIOMMU. Yet, after
> taking a closer look, I found the fault object in this series is
> seemingly quite generic at the uAPI level: its naming/structure,
> and the way how it's allocated and passed to hwpt, despite being
> highly correlated with IOPF in its fops code. So, I feel that we
> might have a chance of reusing it for different fault types:
> 
> +enum iommu_fault_type {
> +	IOMMU_FAULT_TYPE_HWPT_IOPF,
> +	IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> +};
> 
>   struct iommu_fault_alloc {
>   	__u32 size;
>   	__u32 flags;
> +	__u32 type;  /* enum iommu_fault_type */
>   	__u32 out_fault_id;
>   	__u32 out_fault_fd;
>   };
> 
> I understand that this is already v8. So, maybe we can, for now,
> apply the small diff above with an IOMMU_FAULT_TYPE_HWPT_IOPF type
> check in the ioctl handler. And a decoupling for the iopf fops in
> the ioctl handler can come later in the viommu series:
> 	switch (type) {
> 	case IOMMU_FAULT_TYPE_HWPT_IOPF:
> 		filep = anon_inode_getfile("[iommufd-pgfault]",
> 					   &iommufd_fault_fops_iopf);
> 	case IOMMU_FAULT_TYPE_VIOMMU_IRQ:
> 		filep = anon_inode_getfile("[iommufd-viommu-irq]",
> 					   &iommufd_fault_fops_viommu);
> 	default:
> 		return -EOPNOSUPP;
> 	}
> 
> Since you are the designer here, I think you have a better 10000
> foot view -- maybe I am missing something here implying that the
> fault object can't be really reused by viommu.
> 
> Would you mind sharing some thoughts here?

I think this is a choice between "two different objects" vs. "same
object with different FD interfaces". If I understand it correctly, your
proposal of unrecoverable fault delivery is not limited to vcmdq, but
generic to all unrecoverable events that userspace should be aware of
when the passed-through device is affected.

From a hardware architecture perspective, the interfaces for
unrecoverable events don't always match the page faults. For example,
VT-d architecture defines a PR queue for page faults, but uses a
register set to report unrecoverable events. The 'reason', 'request id'
and 'pasid' fields of the register set indicate what happened on the
hardware. New unrecoverable events will not be reported until the
previous one has been fetched.

With the above being said, I have no strong opinions between these two
choices. Jason and Kevin should have more insights.

Thanks,
baolu

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Nicolin Chen 1 year, 7 months ago

On Thu, Jul 04, 2024 at 10:59:45AM +0800, Baolu Lu wrote:
> > On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:
> > 
> > > An iommufd fault object provides an interface for delivering I/O page
> > > faults to user space. These objects are created and destroyed by user
> > > space, and they can be associated with or dissociated from hardware page
> > > table objects during page table allocation or destruction.
> > > 
> > > User space interacts with the fault object through a file interface. This
> > > interface offers a straightforward and efficient way for user space to
> > > handle page faults. It allows user space to read fault messages
> > > sequentially and respond to them by writing to the same file. The file
> > > interface supports reading messages in poll mode, so it's recommended that
> > > user space applications use io_uring to enhance read and write efficiency.
> > > 
> > > A fault object can be associated with any iopf-capable iommufd_hw_pgtable
> > > during the pgtable's allocation. All I/O page faults triggered by devices
> > > when accessing the I/O addresses of an iommufd_hw_pgtable are routed
> > > through the fault object to user space. Similarly, user space's responses
> > > to these page faults are routed back to the iommu device driver through
> > > the same fault object.
> > There is a need for VIOMMU object to report HW fault to VMM. For
> > example, a HW-accelerated VCMDQ may encounter HW errors. HW will
> > raise an IRQ to the host kernel and the host kernel will forward
> > it to the guest. I think we can have a viommu->fault, similar to
> > the hwpt->fault introduced by this series. This viommu->fault can
> > also benefit nested IOMMU for reporting translation error.
> > 
> > I learned that this hwpt->fault is exclusively for IOPF/PRI. And
> > Jason suggested me to add a different one for VIOMMU. Yet, after
> > taking a closer look, I found the fault object in this series is
> > seemingly quite generic at the uAPI level: its naming/structure,
> > and the way how it's allocated and passed to hwpt, despite being
> > highly correlated with IOPF in its fops code. So, I feel that we
> > might have a chance of reusing it for different fault types:
> > 
> > +enum iommu_fault_type {
> > +     IOMMU_FAULT_TYPE_HWPT_IOPF,
> > +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> > +};
> > 
> >   struct iommu_fault_alloc {
> >       __u32 size;
> >       __u32 flags;
> > +     __u32 type;  /* enum iommu_fault_type */
> >       __u32 out_fault_id;
> >       __u32 out_fault_fd;
> >   };
> > 
> > I understand that this is already v8. So, maybe we can, for now,
> > apply the small diff above with an IOMMU_FAULT_TYPE_HWPT_IOPF type
> > check in the ioctl handler. And a decoupling for the iopf fops in
> > the ioctl handler can come later in the viommu series:
> >       switch (type) {
> >       case IOMMU_FAULT_TYPE_HWPT_IOPF:
> >               filep = anon_inode_getfile("[iommufd-pgfault]",
> >                                          &iommufd_fault_fops_iopf);
> >       case IOMMU_FAULT_TYPE_VIOMMU_IRQ:
> >               filep = anon_inode_getfile("[iommufd-viommu-irq]",
> >                                          &iommufd_fault_fops_viommu);
> >       default:
> >               return -EOPNOSUPP;
> >       }
> > 
> > Since you are the designer here, I think you have a better 10000
> > foot view -- maybe I am missing something here implying that the
> > fault object can't be really reused by viommu.
> > 
> > Would you mind sharing some thoughts here?
> 
> I think this is a choice between "two different objects" vs. "same
> object with different FD interfaces". If I understand it correctly, your
> proposal of unrecoverable fault delivery is not limited to vcmdq, but
> generic to all unrecoverable events that userspace should be aware of
> when the passed-through device is affected.

It's basically IRQ forwarding, not confined to unrecoverable
faults. For example, a VCMDQ used by the guest kernel would
raise an HW IRQ if the guest kernel issues an illegal command
to the HW Queue assigned to it. The host kernel will receive
the IRQ, so it needs a way to forward it to the VM for guest
kernel to recover the HW queue.

The way that we define the structure can follow what we have
for hwpt_alloc/invalidate uAPIs, i.e. driver data/event. And
such an event can carry unrecoverable translation faults too.
SMMU at least reports DMA translation faults using an eventQ
in its own native language.

> From a hardware architecture perspective, the interfaces for
> unrecoverable events don't always match the page faults. For example,
> VT-d architecture defines a PR queue for page faults, but uses a
> register set to report unrecoverable events. The 'reason', 'request id'
> and 'pasid' fields of the register set indicate what happened on the
> hardware. New unrecoverable events will not be reported until the
> previous one has been fetched.

Understood. I don't think we can share the majority of the pieces
in fault.c. Just the "IOMMU_FAULT_QUEUE_ALLOC" ioctl itself
looks way too general to be limited to page-fault usage only.
So, I feel we can share, for example:
    IOMMU_FAULT_QUEUE_ALLOC (type=hwpt_iopf) -> fault_id=1
    IOMMU_HWPT_ALLOC (fault_id=1) -> hwpt_id=2
    IOMMU_FAULT_QUEUE_ALLOC (type=viommu_irq) -> fault_id=3
    IOMMU_VIOMMU_ALLOC (fault_id=3) -> viommu_id=4
The handler will direct to different fops as I drafted in my
previous mail.

> With the above being said, I have no strong opinions between these two
> choices. Jason and Kevin should have more insights.

Thanks. Jason is out of office this week, so hopefully Kevin
may shed some light. I personally feel that we don't need to
largely update this series until we add VIOMMU. Yet, it would
be convenient if we add a "type" in the uAPI with this series.

Thank you
Nic

RE: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Tian, Kevin 1 year, 7 months ago

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Thursday, July 4, 2024 1:36 PM
> 
> On Thu, Jul 04, 2024 at 10:59:45AM +0800, Baolu Lu wrote:
> > > On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:
> > >
> > > +enum iommu_fault_type {
> > > +     IOMMU_FAULT_TYPE_HWPT_IOPF,
> > > +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> > > +};
> > >
> > >   struct iommu_fault_alloc {
> > >       __u32 size;
> > >       __u32 flags;
> > > +     __u32 type;  /* enum iommu_fault_type */
> > >       __u32 out_fault_id;
> > >       __u32 out_fault_fd;

and need a new reserved field for alignment.

> > >   };
> > >
> > > I understand that this is already v8. So, maybe we can, for now,
> > > apply the small diff above with an IOMMU_FAULT_TYPE_HWPT_IOPF
> type
> > > check in the ioctl handler. And a decoupling for the iopf fops in
> > > the ioctl handler can come later in the viommu series:
> > >       switch (type) {
> > >       case IOMMU_FAULT_TYPE_HWPT_IOPF:
> > >               filep = anon_inode_getfile("[iommufd-pgfault]",
> > >                                          &iommufd_fault_fops_iopf);
> > >       case IOMMU_FAULT_TYPE_VIOMMU_IRQ:
> > >               filep = anon_inode_getfile("[iommufd-viommu-irq]",
> > >                                          &iommufd_fault_fops_viommu);
> > >       default:
> > >               return -EOPNOTSUPP;
> > >       }
> > >
> > > Since you are the designer here, I think you have a better 10000
> > > foot view -- maybe I am missing something here implying that the
> > > fault object can't be really reused by viommu.
> > >
> > > Would you mind sharing some thoughts here?
> >
> > I think this is a choice between "two different objects" vs. "same
> > object with different FD interfaces". If I understand it correctly, your
> > proposal of unrecoverable fault delivery is not limited to vcmdq, but
> > generic to all unrecoverable events that userspace should be aware of
> > when the passed-through device is affected.
> 
> It's basically IRQ forwarding, not confined to unrecoverable
> faults. For example, a VCMDQ used by the guest kernel would
> raise an HW IRQ if the guest kernel issues an illegal command
> to the HW Queue assigned to it. The host kernel will receive
> the IRQ, so it needs a way to forward it to the VM for guest
> kernel to recover the HW queue.
> 
> The way that we define the structure can follow what we have
> for hwpt_alloc/invalidate uAPIs, i.e. driver data/event. And
> such an event can carry unrecoverable translation faults too.
> SMMU at least reports DMA translation faults using an eventQ
> in its own native language.
> 
> > From a hardware architecture perspective, the interfaces for
> > unrecoverable events don't always match the page faults. For example,
> > VT-d architecture defines a PR queue for page faults, but uses a
> > register set to report unrecoverable events. The 'reason', 'request id'
> > and 'pasid' fields of the register set indicate what happened on the
> > hardware. New unrecoverable events will not be reported until the
> > previous one has been fetched.
> 
> Understood. I don't think we can share the majority pieces in
> the fault.c. Just the "IOMMU_FAULT_QUEUE_ALLOC" ioctl itself
> looks way too general to be limited to page-fault usage only.
> So, I feel we can share, for example:
>     IOMMU_FAULT_QUEUE_ALLOC (type=hwpt_iopf) -> fault_id=1
>     IOMMU_HWPT_ALLOC (fault_id=1) -> hwpt_id=2
>     IOMMU_FAULT_QUEUE_ALLOC (type=viommu_irq) -> fault_id=3
>     IOMMU_VIOMMU_ALLOC (fault_id=3) -> viommu_id=4
> The handler will direct to different fops as I drafted in my
> previous mail.
> 
> > With the above being said, I have no strong opinions between these two
> > choices. Jason and Kevin should have more insights.
> 
> Thanks. Jason is out of office this week, so hopefully Kevin
> may shed some light. I personally feel that we don't need to
> largely update this series until we add VIOMMU. Yet, it would
> be convenient if we add a "type" in the uAPI with this series.
> 

This is ok to me.

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Baolu Lu 1 year, 7 months ago

On 2024/7/4 14:37, Tian, Kevin wrote:
>> From: Nicolin Chen<nicolinc@nvidia.com>
>> Sent: Thursday, July 4, 2024 1:36 PM
>>
>> On Thu, Jul 04, 2024 at 10:59:45AM +0800, Baolu Lu wrote:
>>>> On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:
>>>>
>>>> +enum iommu_fault_type {
>>>> +     IOMMU_FAULT_TYPE_HWPT_IOPF,
>>>> +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
>>>> +};
>>>>
>>>>    struct iommu_fault_alloc {
>>>>        __u32 size;
>>>>        __u32 flags;
>>>> +     __u32 type;  /* enum iommu_fault_type */
>>>>        __u32 out_fault_id;
>>>>        __u32 out_fault_fd;
> and need a new reserved field for alignment.
> 
>>>>    };
>>>>
>>>> I understand that this is already v8. So, maybe we can, for now,
>>>> apply the small diff above with an IOMMU_FAULT_TYPE_HWPT_IOPF
>> type
>>>> check in the ioctl handler. And a decoupling for the iopf fops in
>>>> the ioctl handler can come later in the viommu series:
>>>>        switch (type) {
>>>>        case IOMMU_FAULT_TYPE_HWPT_IOPF:
>>>>                filep = anon_inode_getfile("[iommufd-pgfault]",
>>>>                                           &iommufd_fault_fops_iopf);
>>>>        case IOMMU_FAULT_TYPE_VIOMMU_IRQ:
>>>>                filep = anon_inode_getfile("[iommufd-viommu-irq]",
>>>>                                           &iommufd_fault_fops_viommu);
>>>>        default:
>>>>                return -EOPNOTSUPP;
>>>>        }
>>>>
>>>> Since you are the designer here, I think you have a better 10000
>>>> foot view -- maybe I am missing something here implying that the
>>>> fault object can't be really reused by viommu.
>>>>
>>>> Would you mind sharing some thoughts here?
>>> I think this is a choice between "two different objects" vs. "same
>>> object with different FD interfaces". If I understand it correctly, your
>>> proposal of unrecoverable fault delivery is not limited to vcmdq, but
>>> generic to all unrecoverable events that userspace should be aware of
>>> when the passed-through device is affected.
>> It's basically IRQ forwarding, not confined to unrecoverable
>> faults. For example, a VCMDQ used by the guest kernel would
>> raise an HW IRQ if the guest kernel issues an illegal command
>> to the HW Queue assigned to it. The host kernel will receive
>> the IRQ, so it needs a way to forward it to the VM for guest
>> kernel to recover the HW queue.
>>
>> The way that we define the structure can follow what we have
>> for hwpt_alloc/invalidate uAPIs, i.e. driver data/event. And
>> such an event can carry unrecoverable translation faults too.
>> SMMU at least reports DMA translation faults using an eventQ
>> in its own native language.
>>
>>>  From a hardware architecture perspective, the interfaces for
>>> unrecoverable events don't always match the page faults. For example,
>>> VT-d architecture defines a PR queue for page faults, but uses a
>>> register set to report unrecoverable events. The 'reason', 'request id'
>>> and 'pasid' fields of the register set indicate what happened on the
>>> hardware. New unrecoverable events will not be reported until the
>>> previous one has been fetched.
>> Understood. I don't think we can share the majority pieces in
>> the fault.c. Just the "IOMMU_FAULT_QUEUE_ALLOC" ioctl itself
>> looks way too general to be limited to page-fault usage only.
>> So, I feel we can share, for example:
>>      IOMMU_FAULT_QUEUE_ALLOC (type=hwpt_iopf) -> fault_id=1
>>      IOMMU_HWPT_ALLOC (fault_id=1) -> hwpt_id=2
>>      IOMMU_FAULT_QUEUE_ALLOC (type=viommu_irq) -> fault_id=3
>>      IOMMU_VIOMMU_ALLOC (fault_id=3) -> viommu_id=4
>> The handler will direct to different fops as I drafted in my
>> previous mail.
>>
>>> With the above being said, I have no strong opinions between these two
>>> choices. Jason and Kevin should have more insights.
>> Thanks. Jason is out of office this week, so hopefully Kevin
>> may shed some light. I personally feel that we don't need to
>> largely update this series until we add VIOMMU. Yet, it would
>> be convenient if we add a "type" in the uAPI with this series.
>>
> This is ok to me.

So, Nicolin, could you please cook an additional patch on top of this
series and post it for further review?

Thanks,
baolu

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Nicolin Chen 1 year, 7 months ago

On Thu, Jul 04, 2024 at 03:32:32PM +0800, Baolu Lu wrote:
> On 2024/7/4 14:37, Tian, Kevin wrote:
> > > From: Nicolin Chen<nicolinc@nvidia.com>
> > > Sent: Thursday, July 4, 2024 1:36 PM
> > > 
> > > On Thu, Jul 04, 2024 at 10:59:45AM +0800, Baolu Lu wrote:
> > > > > On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:
> > > > > 
> > > > > +enum iommu_fault_type {
> > > > > +     IOMMU_FAULT_TYPE_HWPT_IOPF,
> > > > > +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> > > > > +};
> > > > > 
> > > > >    struct iommu_fault_alloc {
> > > > >        __u32 size;
> > > > >        __u32 flags;
> > > > > +     __u32 type;  /* enum iommu_fault_type */
> > > > >        __u32 out_fault_id;
> > > > >        __u32 out_fault_fd;
> > and need a new reserved field for alignment.

Hmm, what's the reason for enforcing a 64-bit alignment to an
all-u32 struct though? I thought we need a reserved field only
for padding. The struct iommu_ioas_alloc has three u32 members
for example?

> > > > >    };
> > > > > 
> > > > > I understand that this is already v8. So, maybe we can, for now,
> > > > > apply the small diff above with an IOMMU_FAULT_TYPE_HWPT_IOPF
> > > type
> > > > > check in the ioctl handler. And a decoupling for the iopf fops in
> > > > > the ioctl handler can come later in the viommu series:
> > > > >        switch (type) {
> > > > >        case IOMMU_FAULT_TYPE_HWPT_IOPF:
> > > > >                filep = anon_inode_getfile("[iommufd-pgfault]",
> > > > >                                           &iommufd_fault_fops_iopf);
> > > > >        case IOMMU_FAULT_TYPE_VIOMMU_IRQ:
> > > > >                filep = anon_inode_getfile("[iommufd-viommu-irq]",
> > > > >                                           &iommufd_fault_fops_viommu);
> > > > >        default:
> > > > > >                return -EOPNOTSUPP;
> > > > >        }
> > > > > 
> > > > > Since you are the designer here, I think you have a better 10000
> > > > > foot view -- maybe I am missing something here implying that the
> > > > > fault object can't be really reused by viommu.
> > > > > 
> > > > > Would you mind sharing some thoughts here?
> > > > I think this is a choice between "two different objects" vs. "same
> > > > object with different FD interfaces". If I understand it correctly, your
> > > > proposal of unrecoverable fault delivery is not limited to vcmdq, but
> > > > generic to all unrecoverable events that userspace should be aware of
> > > > when the passed-through device is affected.
> > > It's basically IRQ forwarding, not confined to unrecoverable
> > > faults. For example, a VCMDQ used by the guest kernel would
> > > raise an HW IRQ if the guest kernel issues an illegal command
> > > to the HW Queue assigned to it. The host kernel will receive
> > > the IRQ, so it needs a way to forward it to the VM for guest
> > > kernel to recover the HW queue.
> > > 
> > > The way that we define the structure can follow what we have
> > > for hwpt_alloc/invalidate uAPIs, i.e. driver data/event. And
> > > such an event can carry unrecoverable translation faults too.
> > > SMMU at least reports DMA translation faults using an eventQ
> > > in its own native language.
> > > 
> > > >  From a hardware architecture perspective, the interfaces for
> > > > unrecoverable events don't always match the page faults. For example,
> > > > VT-d architecture defines a PR queue for page faults, but uses a
> > > > register set to report unrecoverable events. The 'reason', 'request id'
> > > > and 'pasid' fields of the register set indicate what happened on the
> > > > hardware. New unrecoverable events will not be reported until the
> > > > previous one has been fetched.
> > > Understood. I don't think we can share the majority pieces in
> > > the fault.c. Just the "IOMMU_FAULT_QUEUE_ALLOC" ioctl itself
> > > looks way too general to be limited to page-fault usage only.
> > > So, I feel we can share, for example:
> > >      IOMMU_FAULT_QUEUE_ALLOC (type=hwpt_iopf) -> fault_id=1
> > >      IOMMU_HWPT_ALLOC (fault_id=1) -> hwpt_id=2
> > >      IOMMU_FAULT_QUEUE_ALLOC (type=viommu_irq) -> fault_id=3
> > >      IOMMU_VIOMMU_ALLOC (fault_id=3) -> viommu_id=4
> > > The handler will direct to different fops as I drafted in my
> > > previous mail.
> > > 
> > > > With the above being said, I have no strong opinions between these two
> > > > choices. Jason and Kevin should have more insights.
> > > Thanks. Jason is out of office this week, so hopefully Kevin
> > > may shed some light. I personally feel that we don't need to
> > > largely update this series until we add VIOMMU. Yet, it would
> > > be convenient if we add a "type" in the uAPI with this series.
> > > 
> > This is ok to me.
> 
> So
> 
> Nicolin, perhaps can you please cook an additional patch on top of this
> series and post it for further review?

Thank you both for the inputs. Yea, so long as we merge them
in the same cycle, it won't be a uAPI breakage. I will draft
an incremental one. And Jason can make a final call.

Nicolin

RE: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Tian, Kevin 1 year, 7 months ago

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, July 5, 2024 7:19 AM
> 
> On Thu, Jul 04, 2024 at 03:32:32PM +0800, Baolu Lu wrote:
> > On 2024/7/4 14:37, Tian, Kevin wrote:
> > > > From: Nicolin Chen<nicolinc@nvidia.com>
> > > > Sent: Thursday, July 4, 2024 1:36 PM
> > > >
> > > > On Thu, Jul 04, 2024 at 10:59:45AM +0800, Baolu Lu wrote:
> > > > > > On Tue, Jul 02, 2024 at 02:34:40PM +0800, Lu Baolu wrote:
> > > > > >
> > > > > > +enum iommu_fault_type {
> > > > > > +     IOMMU_FAULT_TYPE_HWPT_IOPF,
> > > > > > +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> > > > > > +};
> > > > > >
> > > > > >    struct iommu_fault_alloc {
> > > > > >        __u32 size;
> > > > > >        __u32 flags;
> > > > > > +     __u32 type;  /* enum iommu_fault_type */
> > > > > >        __u32 out_fault_id;
> > > > > >        __u32 out_fault_fd;
> > > and need a new reserved field for alignment.
> 
> Hmm, what's the reason for enforcing a 64-bit alignment to an
> all-u32 struct though? I thought we need a reserved field only
> for padding. The struct iommu_ioas_alloc has three u32 members
> for example?
> 

yeah please ignore this comment.

Re: [PATCH v8 06/10] iommufd: Add iommufd fault object

Posted by Jason Gunthorpe 1 year, 7 months ago

On Fri, Jul 05, 2024 at 12:49:10AM +0000, Tian, Kevin wrote:

> > > > > > > +enum iommu_fault_type {
> > > > > > > +     IOMMU_FAULT_TYPE_HWPT_IOPF,
> > > > > > > +     IOMMU_FAULT_TYPE_VIOMMU_IRQ,
> > > > > > > +};
> > > > > > >
> > > > > > >    struct iommu_fault_alloc {
> > > > > > >        __u32 size;
> > > > > > >        __u32 flags;
> > > > > > > +     __u32 type;  /* enum iommu_fault_type */
> > > > > > >        __u32 out_fault_id;
> > > > > > >        __u32 out_fault_fd;
> > > > and need a new reserved field for alignment.
> > 
> > Hmm, what's the reason for enforcing a 64-bit alignment to an
> > all-u32 struct though? I thought we need a reserved field only
> > for padding. The struct iommu_ioas_alloc has three u32 members
> > for example?
> 
> yeah please ignore this comment.

Sometimes I encourage it so that people notice if the structure is
changed later. Almost all structs here are 8 byte aligned. It is OK
like this too.

Jason