[PATCH v6 06/10] iommufd: Add iommufd fault object

Lu Baolu posted 10 patches 1 year, 8 months ago
There is a newer version of this series
[PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Lu Baolu 1 year, 8 months ago
An iommufd fault object provides an interface for delivering I/O page
faults to user space. These objects are created and destroyed by user
space, and they can be associated with or dissociated from hardware page
table objects during page table allocation or destruction.

User space interacts with the fault object through a file interface. This
interface offers a straightforward and efficient way for user space to
handle page faults. It allows user space to read fault messages
sequentially and respond to them by writing to the same file. The file
interface supports reading messages in poll mode, so it's recommended that
user space applications use io_uring to enhance read and write efficiency.

A fault object can be associated with any iopf-capable iommufd_hw_pgtable
during the pgtable's allocation. All I/O page faults triggered by devices
when accessing the I/O addresses of an iommufd_hw_pgtable are routed
through the fault object to user space. Similarly, user space's responses
to these page faults are routed back to the iommu device driver through
the same fault object.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h                   |   4 +
 drivers/iommu/iommufd/iommufd_private.h |  30 ++++
 include/uapi/linux/iommufd.h            |  18 ++
 drivers/iommu/io-pgfault.c              |   2 +
 drivers/iommu/iommufd/fault.c           | 227 ++++++++++++++++++++++++
 drivers/iommu/iommufd/main.c            |   6 +
 drivers/iommu/iommufd/Makefile          |   1 +
 7 files changed, 288 insertions(+)
 create mode 100644 drivers/iommu/iommufd/fault.c

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4067ebdd6232..16b3a2da91ef 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -124,12 +124,16 @@ struct iopf_fault {
 struct iopf_group {
 	struct iopf_fault last_fault;
 	struct list_head faults;
+	size_t fault_count;
 	/* list node for iommu_fault_param::faults */
 	struct list_head pending_node;
 	struct work_struct work;
 	struct iommu_attach_handle *attach_handle;
 	/* The device's fault data parameter. */
 	struct iommu_fault_param *fault_param;
+	/* Used by handler provider to hook the group on its own lists. */
+	struct list_head node;
+	u32 cookie;
 };
 
 /**
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..c8a4519f1405 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -128,6 +128,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_HWPT_NESTED,
 	IOMMUFD_OBJ_IOAS,
 	IOMMUFD_OBJ_ACCESS,
+	IOMMUFD_OBJ_FAULT,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -426,6 +427,35 @@ void iopt_remove_access(struct io_pagetable *iopt,
 			u32 iopt_access_list_id);
 void iommufd_access_destroy_object(struct iommufd_object *obj);
 
+/*
+ * An iommufd_fault object represents an interface to deliver I/O page faults
+ * to the user space. These objects are created/destroyed by the user space and
+ * associated with hardware page table objects during page-table allocation.
+ */
+struct iommufd_fault {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct file *filep;
+
+	/* The lists of outstanding faults protected by below mutex. */
+	struct mutex mutex;
+	struct list_head deliver;
+	struct xarray response;
+
+	struct wait_queue_head wait_queue;
+};
+
+struct iommufd_attach_handle {
+	struct iommu_attach_handle handle;
+	struct iommufd_device *idev;
+};
+
+/* Convert an iommu attach handle to iommufd handle. */
+#define to_iommufd_handle(hdl)	container_of(hdl, struct iommufd_attach_handle, handle)
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_fault_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 2f34d66436fb..eba452d4344e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -50,6 +50,7 @@ enum {
 	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
 	IOMMUFD_CMD_HWPT_INVALIDATE,
+	IOMMUFD_CMD_FAULT_QUEUE_ALLOC,
 };
 
 /**
@@ -788,4 +789,21 @@ struct iommu_hwpt_page_response {
 	__u32 cookie;
 	__u32 reserved;
 };
+
+/**
+ * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_fault_alloc)
+ * @flags: Must be 0
+ * @out_fault_id: The ID of the new FAULT
+ * @out_fault_fd: The fd of the new FAULT
+ *
+ * Explicitly allocate a fault handling object.
+ */
+struct iommu_fault_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_fault_id;
+	__u32 out_fault_fd;
+};
+#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
 #endif
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index c62fcb67ef02..a629d8a93614 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -110,6 +110,8 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
 	list_add(&group->pending_node, &iopf_param->faults);
 	mutex_unlock(&iopf_param->lock);
 
+	group->fault_count = list_count_nodes(&group->faults);
+
 	return group;
 }
 
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
new file mode 100644
index 000000000000..d0dafe761075
--- /dev/null
+++ b/drivers/iommu/iommufd/fault.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/iommufd.h>
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/iommufd.h>
+
+#include "../iommu-priv.h"
+#include "iommufd_private.h"
+
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
+	struct iopf_group *group, *next;
+
+	/*
+	 * The iommufd object's reference count is zero at this point.
+	 * We can be confident that no other threads are currently
+	 * accessing this pointer. Therefore, acquiring the mutex here
+	 * is unnecessary.
+	 */
+	list_for_each_entry_safe(group, next, &fault->deliver, node) {
+		list_del(&group->node);
+		iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+		iopf_free_group(group);
+	}
+}
+
+static void iommufd_compose_fault_message(struct iommu_fault *fault,
+					  struct iommu_hwpt_pgfault *hwpt_fault,
+					  struct iommufd_device *idev,
+					  u32 cookie)
+{
+	hwpt_fault->size = sizeof(*hwpt_fault);
+	hwpt_fault->flags = fault->prm.flags;
+	hwpt_fault->dev_id = idev->obj.id;
+	hwpt_fault->pasid = fault->prm.pasid;
+	hwpt_fault->grpid = fault->prm.grpid;
+	hwpt_fault->perm = fault->prm.perm;
+	hwpt_fault->addr = fault->prm.addr;
+	hwpt_fault->length = 0;
+	hwpt_fault->cookie = cookie;
+}
+
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_pgfault data;
+	struct iommufd_device *idev;
+	struct iopf_group *group;
+	struct iopf_fault *iopf;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos || count % fault_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (!list_empty(&fault->deliver) && count > done) {
+		group = list_first_entry(&fault->deliver,
+					 struct iopf_group, node);
+
+		if (group->fault_count * fault_size > count - done)
+			break;
+
+		rc = xa_alloc(&fault->response, &group->cookie, group,
+			      xa_limit_32b, GFP_KERNEL);
+		if (rc)
+			break;
+
+		idev = to_iommufd_handle(group->attach_handle)->idev;
+		list_for_each_entry(iopf, &group->faults, list) {
+			iommufd_compose_fault_message(&iopf->fault,
+						      &data, idev,
+						      group->cookie);
+			rc = copy_to_user(buf + done, &data, fault_size);
+			if (rc) {
+				xa_erase(&fault->response, group->cookie);
+				break;
+			}
+			done += fault_size;
+		}
+
+		list_del(&group->node);
+	}
+	mutex_unlock(&fault->mutex);
+
+	return done == 0 ? rc : done;
+}
+
+static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	size_t response_size = sizeof(struct iommu_hwpt_page_response);
+	struct iommufd_fault *fault = filep->private_data;
+	struct iommu_hwpt_page_response response;
+	struct iopf_group *group;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos || count % response_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (count > done) {
+		rc = copy_from_user(&response, buf + done, response_size);
+		if (rc)
+			break;
+
+		group = xa_erase(&fault->response, response.cookie);
+		if (!group) {
+			rc = -EINVAL;
+			break;
+		}
+
+		iopf_group_response(group, response.code);
+		iopf_free_group(group);
+		done += response_size;
+	}
+	mutex_unlock(&fault->mutex);
+
+	return done == 0 ? rc : done;
+}
+
+static __poll_t iommufd_fault_fops_poll(struct file *filep,
+					struct poll_table_struct *wait)
+{
+	struct iommufd_fault *fault = filep->private_data;
+	__poll_t pollflags = EPOLLOUT;
+
+	poll_wait(filep, &fault->wait_queue, wait);
+	mutex_lock(&fault->mutex);
+	if (!list_empty(&fault->deliver))
+		pollflags |= EPOLLIN | EPOLLRDNORM;
+	mutex_unlock(&fault->mutex);
+
+	return pollflags;
+}
+
+static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
+{
+	struct iommufd_fault *fault = filep->private_data;
+
+	iommufd_ctx_put(fault->ictx);
+	refcount_dec(&fault->obj.users);
+	return 0;
+}
+
+static const struct file_operations iommufd_fault_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.read		= iommufd_fault_fops_read,
+	.write		= iommufd_fault_fops_write,
+	.poll		= iommufd_fault_fops_poll,
+	.release	= iommufd_fault_fops_release,
+	.llseek		= no_llseek,
+};
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_fault_alloc *cmd = ucmd->cmd;
+	struct iommufd_fault *fault;
+	struct file *filep;
+	int fdno;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
+	if (IS_ERR(fault))
+		return PTR_ERR(fault);
+
+	fault->ictx = ucmd->ictx;
+	INIT_LIST_HEAD(&fault->deliver);
+	xa_init_flags(&fault->response, XA_FLAGS_ALLOC1);
+	mutex_init(&fault->mutex);
+	init_waitqueue_head(&fault->wait_queue);
+
+	filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
+				   fault, O_RDWR);
+	if (IS_ERR(filep)) {
+		rc = PTR_ERR(filep);
+		goto out_abort;
+	}
+
+	refcount_inc(&fault->obj.users);
+	iommufd_ctx_get(fault->ictx);
+	fault->filep = filep;
+
+	fdno = get_unused_fd_flags(O_CLOEXEC);
+	if (fdno < 0) {
+		rc = fdno;
+		goto out_fput;
+	}
+
+	cmd->out_fault_id = fault->obj.id;
+	cmd->out_fault_fd = fdno;
+
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_put_fdno;
+	iommufd_object_finalize(ucmd->ictx, &fault->obj);
+
+	fd_install(fdno, fault->filep);
+
+	return 0;
+out_put_fdno:
+	put_unused_fd(fdno);
+out_fput:
+	fput(filep);
+	refcount_dec(&fault->obj.users);
+	iommufd_ctx_put(fault->ictx);
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
+
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b32932c61e..961b2949c06f 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -332,6 +332,7 @@ union ucmd_buffer {
 	struct iommu_ioas_unmap unmap;
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
+	struct iommu_fault_alloc fault;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -381,6 +382,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 val64),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
+	IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc,
+		 out_fault_fd),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -513,6 +516,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 		.destroy = iommufd_hwpt_nested_destroy,
 		.abort = iommufd_hwpt_nested_abort,
 	},
+	[IOMMUFD_OBJ_FAULT] = {
+		.destroy = iommufd_fault_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 34b446146961..b94a74366eed 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -6,6 +6,7 @@ iommufd-y := \
 	ioas.o \
 	main.o \
 	pages.o \
+	fault.o \
 	vfio_compat.o
 
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
-- 
2.34.1
RE: [PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Tian, Kevin 1 year, 8 months ago
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Monday, May 27, 2024 12:05 PM
> 
> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> +				       size_t count, loff_t *ppos)
> +{
> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> +	struct iommufd_fault *fault = filep->private_data;
> +	struct iommu_hwpt_pgfault data;
> +	struct iommufd_device *idev;
> +	struct iopf_group *group;
> +	struct iopf_fault *iopf;
> +	size_t done = 0;
> +	int rc = 0;
> +
> +	if (*ppos || count % fault_size)
> +		return -ESPIPE;

the man page says:

"If count is zero, read() returns zero and has no  other  results."

> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->deliver) && count > done) {
> +		group = list_first_entry(&fault->deliver,
> +					 struct iopf_group, node);
> +
> +		if (group->fault_count * fault_size > count - done)
> +			break;
> +
> +		rc = xa_alloc(&fault->response, &group->cookie, group,
> +			      xa_limit_32b, GFP_KERNEL);
> +		if (rc)
> +			break;
> +
> +		idev = to_iommufd_handle(group->attach_handle)->idev;
> +		list_for_each_entry(iopf, &group->faults, list) {
> +			iommufd_compose_fault_message(&iopf->fault,
> +						      &data, idev,
> +						      group->cookie);
> +			rc = copy_to_user(buf + done, &data, fault_size);
> +			if (rc) {

'rc' should be converted to -EFAULT.

> +				xa_erase(&fault->response, group->cookie);
> +				break;
> +			}
> +			done += fault_size;
> +		}
> +
> +		list_del(&group->node);
> +	}
> +	mutex_unlock(&fault->mutex);
> +
> +	return done == 0 ? rc : done;

again this doesn't match the manual:

"On error, -1 is returned, and errno is set appropriately. "

it doesn't matter whether 'done' is 0.

> +
> +static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
> +{
> +	struct iommufd_fault *fault = filep->private_data;
> +
> +	iommufd_ctx_put(fault->ictx);
> +	refcount_dec(&fault->obj.users);
> +	return 0;
> +}

hmm this doesn't sound correct. the context and refcount are
acquired in iommufd_fault_alloc() but here they are reverted when
the fd is closed...

> +
> +	filep = anon_inode_getfile("[iommufd-pgfault]",
> &iommufd_fault_fops,
> +				   fault, O_RDWR);
> +	if (IS_ERR(filep)) {
> +		rc = PTR_ERR(filep);
> +		goto out_abort;
> +	}
> +
> +	refcount_inc(&fault->obj.users);
> +	iommufd_ctx_get(fault->ictx);
> +	fault->filep = filep;

those 3 lines can be moved after below fdno get. It's reads slightly
clearer to put file related work together before getting to the last piece
of intiailzation. 

> +
> +	fdno = get_unused_fd_flags(O_CLOEXEC);
> +	if (fdno < 0) {
> +		rc = fdno;
> +		goto out_fput;
> +	}
> +
> @@ -332,6 +332,7 @@ union ucmd_buffer {
>  	struct iommu_ioas_unmap unmap;
>  	struct iommu_option option;
>  	struct iommu_vfio_ioas vfio_ioas;
> +	struct iommu_fault_alloc fault;

alphabetic 

> @@ -381,6 +382,8 @@ static const struct iommufd_ioctl_op
> iommufd_ioctl_ops[] = {
>  		 val64),
>  	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct
> iommu_vfio_ioas,
>  		 __reserved),
> +	IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc,
> struct iommu_fault_alloc,
> +		 out_fault_fd),

ditto
Re: [PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Jason Gunthorpe 1 year, 8 months ago
On Fri, Jun 07, 2024 at 09:17:28AM +0000, Tian, Kevin wrote:
> > From: Lu Baolu <baolu.lu@linux.intel.com>
> > Sent: Monday, May 27, 2024 12:05 PM
> > 
> > +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> > +				       size_t count, loff_t *ppos)
> > +{
> > +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> > +	struct iommufd_fault *fault = filep->private_data;
> > +	struct iommu_hwpt_pgfault data;
> > +	struct iommufd_device *idev;
> > +	struct iopf_group *group;
> > +	struct iopf_fault *iopf;
> > +	size_t done = 0;
> > +	int rc = 0;
> > +
> > +	if (*ppos || count % fault_size)
> > +		return -ESPIPE;
> 
> the man page says:
> 
> "If count is zero, read() returns zero and has no  other  results."

The above does that? 0 % X == 0

> > +
> > +	mutex_lock(&fault->mutex);
> > +	while (!list_empty(&fault->deliver) && count > done) {
> > +		group = list_first_entry(&fault->deliver,
> > +					 struct iopf_group, node);
> > +
> > +		if (group->fault_count * fault_size > count - done)
> > +			break;
> > +
> > +		rc = xa_alloc(&fault->response, &group->cookie, group,
> > +			      xa_limit_32b, GFP_KERNEL);
> > +		if (rc)
> > +			break;
> > +
> > +		idev = to_iommufd_handle(group->attach_handle)->idev;
> > +		list_for_each_entry(iopf, &group->faults, list) {
> > +			iommufd_compose_fault_message(&iopf->fault,
> > +						      &data, idev,
> > +						      group->cookie);
> > +			rc = copy_to_user(buf + done, &data, fault_size);
> > +			if (rc) {
> 
> 'rc' should be converted to -EFAULT.

Yes


> > +				xa_erase(&fault->response, group->cookie);
> > +				break;
> > +			}
> > +			done += fault_size;
> > +		}
> > +
> > +		list_del(&group->node);
> > +	}
> > +	mutex_unlock(&fault->mutex);
> > +
> > +	return done == 0 ? rc : done;
> 
> again this doesn't match the manual:
> 
> "On error, -1 is returned, and errno is set appropriately. "
> 
> it doesn't matter whether 'done' is 0.

It is setup so that once the list_del() below happens it is guarenteed
that the system call will return a positive result so that the
list_del'd items are always returned to userspace.

If we hit any fault here on the Nth item we should still return the
prior items and ignore the fault.

If we hit a fault on the first item then we should return the fault.

Jason
RE: [PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Tian, Kevin 1 year, 8 months ago
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Wednesday, June 12, 2024 9:24 PM
> 
> On Fri, Jun 07, 2024 at 09:17:28AM +0000, Tian, Kevin wrote:
> > > From: Lu Baolu <baolu.lu@linux.intel.com>
> > > Sent: Monday, May 27, 2024 12:05 PM
> > >
> > > +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user
> *buf,
> > > +				       size_t count, loff_t *ppos)
> > > +{
> > > +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> > > +	struct iommufd_fault *fault = filep->private_data;
> > > +	struct iommu_hwpt_pgfault data;
> > > +	struct iommufd_device *idev;
> > > +	struct iopf_group *group;
> > > +	struct iopf_fault *iopf;
> > > +	size_t done = 0;
> > > +	int rc = 0;
> > > +
> > > +	if (*ppos || count % fault_size)
> > > +		return -ESPIPE;
> >
> > the man page says:
> >
> > "If count is zero, read() returns zero and has no  other  results."
> 
> The above does that? 0 % X == 0

but if *ppos is non-zero then an error will be returned even if count==0.

but seems I looked at an old man page on an old system. A newer one
says:

       If count is zero, read() may detect the errors described below.
       In the absence of any errors, or if read() does not check for
       errors, a read() with a count of 0 returns zero and has no other
       effects.

so above should be fine. 😊

> > > +				xa_erase(&fault->response, group->cookie);
> > > +				break;
> > > +			}
> > > +			done += fault_size;
> > > +		}
> > > +
> > > +		list_del(&group->node);
> > > +	}
> > > +	mutex_unlock(&fault->mutex);
> > > +
> > > +	return done == 0 ? rc : done;
> >
> > again this doesn't match the manual:
> >
> > "On error, -1 is returned, and errno is set appropriately. "
> >
> > it doesn't matter whether 'done' is 0.
> 
> It is setup so that once the list_del() below happens it is guarenteed
> that the system call will return a positive result so that the
> list_del'd items are always returned to userspace.
> 
> If we hit any fault here on the Nth item we should still return the
> prior items and ignore the fault.
> 
> If we hit a fault on the first item then we should return the fault.
> 

yes that does make sense. I just didn't find such statement in the
man page which simply said to return -1 on error. The number of
bytes  is returned only on success.

RETURN VALUE
       On success, the number of bytes read is returned (zero indicates
       end of file), and the file position is advanced by this number.
       It is not an error if this number is smaller than the number of
       bytes requested; this may happen for example because fewer bytes
       are actually available right now (maybe because we were close to
       end-of-file, or because we are reading from a pipe, or from a
       terminal), or because read() was interrupted by a signal.  See
       also NOTES.

       On error, -1 is returned, and errno is set to indicate the error.
       In this case, it is left unspecified whether the file position
       (if any) changes.
Re: [PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Baolu Lu 1 year, 8 months ago
On 6/7/24 5:17 PM, Tian, Kevin wrote:
>> From: Lu Baolu <baolu.lu@linux.intel.com>
>> Sent: Monday, May 27, 2024 12:05 PM
>>
>> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
>> +				       size_t count, loff_t *ppos)
>> +{
>> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
>> +	struct iommufd_fault *fault = filep->private_data;
>> +	struct iommu_hwpt_pgfault data;
>> +	struct iommufd_device *idev;
>> +	struct iopf_group *group;
>> +	struct iopf_fault *iopf;
>> +	size_t done = 0;
>> +	int rc = 0;
>> +
>> +	if (*ppos || count % fault_size)
>> +		return -ESPIPE;
> 
> the man page says:
> 
> "If count is zero, read() returns zero and has no  other  results."

My understanding is that reading zero bytes is likely to check if a file
descriptor is valid and ready for reading without actually taking any
data from it.

In this code, it just returns 0 and it's compatible with the man page.
Or, I overlooked anything?

> 
>> +
>> +	mutex_lock(&fault->mutex);
>> +	while (!list_empty(&fault->deliver) && count > done) {
>> +		group = list_first_entry(&fault->deliver,
>> +					 struct iopf_group, node);
>> +
>> +		if (group->fault_count * fault_size > count - done)
>> +			break;
>> +
>> +		rc = xa_alloc(&fault->response, &group->cookie, group,
>> +			      xa_limit_32b, GFP_KERNEL);
>> +		if (rc)
>> +			break;
>> +
>> +		idev = to_iommufd_handle(group->attach_handle)->idev;
>> +		list_for_each_entry(iopf, &group->faults, list) {
>> +			iommufd_compose_fault_message(&iopf->fault,
>> +						      &data, idev,
>> +						      group->cookie);
>> +			rc = copy_to_user(buf + done, &data, fault_size);
>> +			if (rc) {
> 
> 'rc' should be converted to -EFAULT.

Yes. I will make it like this:

        if (copy_to_user(buf + done, &data, fault_size)) {
                xa_erase(&fault->response, group->cookie);
                rc = -EFAULT;
                break;
        }

> 
>> +				xa_erase(&fault->response, group->cookie);
>> +				break;
>> +			}
>> +			done += fault_size;
>> +		}
>> +
>> +		list_del(&group->node);
>> +	}
>> +	mutex_unlock(&fault->mutex);
>> +
>> +	return done == 0 ? rc : done;
> 
> again this doesn't match the manual:
> 
> "On error, -1 is returned, and errno is set appropriately."
> 
> it doesn't matter whether 'done' is 0.

I don't quite follow here. The code is doing the following:

- If done == 0, it means nothing has been copied to user space. This
   could be due to two reasons:

   1) the user read with a count set to 0, or
   2) a failure case.

   The code returns 0 for the first case and an error number for the
   second.

- If done != 0, some data has been copied to user space. In this case,
   the code returns the number of data copied regardless of the value of
   rc.

> 
>> +
>> +static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
>> +{
>> +	struct iommufd_fault *fault = filep->private_data;
>> +
>> +	iommufd_ctx_put(fault->ictx);
>> +	refcount_dec(&fault->obj.users);
>> +	return 0;
>> +}
> 
> hmm this doesn't sound correct. the context and refcount are
> acquired in iommufd_fault_alloc() but here they are reverted when
> the fd is closed...

These two refcounts were requested when the fault object was installed
to the fault FD.

        filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
                                    fault, O_RDWR);
         if (IS_ERR(filep)) {
                 rc = PTR_ERR(filep);
                 goto out_abort;
         }

         refcount_inc(&fault->obj.users);
         iommufd_ctx_get(fault->ictx);
         fault->filep = filep;

These refcounts must then be released when the FD is released.

>> +
>> +	filep = anon_inode_getfile("[iommufd-pgfault]",
>> &iommufd_fault_fops,
>> +				   fault, O_RDWR);
>> +	if (IS_ERR(filep)) {
>> +		rc = PTR_ERR(filep);
>> +		goto out_abort;
>> +	}
>> +
>> +	refcount_inc(&fault->obj.users);
>> +	iommufd_ctx_get(fault->ictx);
>> +	fault->filep = filep;
> 
> those 3 lines can be moved after below fdno get. It's reads slightly
> clearer to put file related work together before getting to the last piece
> of intiailzation.

The filep is allocated and initialized together.

>> +
>> +	fdno = get_unused_fd_flags(O_CLOEXEC);
>> +	if (fdno < 0) {
>> +		rc = fdno;
>> +		goto out_fput;
>> +	}
>> +
>> @@ -332,6 +332,7 @@ union ucmd_buffer {
>>   	struct iommu_ioas_unmap unmap;
>>   	struct iommu_option option;
>>   	struct iommu_vfio_ioas vfio_ioas;
>> +	struct iommu_fault_alloc fault;
> 
> alphabetic
> 
>> @@ -381,6 +382,8 @@ static const struct iommufd_ioctl_op
>> iommufd_ioctl_ops[] = {
>>   		 val64),
>>   	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct
>> iommu_vfio_ioas,
>>   		 __reserved),
>> +	IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc,
>> struct iommu_fault_alloc,
>> +		 out_fault_fd),
> 
> ditto

Yes, sure. I wasn't aware of the order.

Best regards,
baolu
Re: [PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Jason Gunthorpe 1 year, 8 months ago
On Sat, Jun 08, 2024 at 05:58:34PM +0800, Baolu Lu wrote:

> > > +static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
> > > +{
> > > +	struct iommufd_fault *fault = filep->private_data;
> > > +
> > > +	iommufd_ctx_put(fault->ictx);
> > > +	refcount_dec(&fault->obj.users);
> > > +	return 0;


This is in the wrong order, dec users before ctx_put.

> > hmm this doesn't sound correct. the context and refcount are
> > acquired in iommufd_fault_alloc() but here they are reverted when
> > the fd is closed...
> 
> These two refcounts were requested when the fault object was installed
> to the fault FD.
> 
>        filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
>                                    fault, O_RDWR);
>         if (IS_ERR(filep)) {
>                 rc = PTR_ERR(filep);
>                 goto out_abort;
>         }
> 
>         refcount_inc(&fault->obj.users);
>         iommufd_ctx_get(fault->ictx);
>         fault->filep = filep;
> 
> These refcounts must then be released when the FD is released.

Yes

The ctx refcount is to avoid destroying the ctx FD, which can't work,
while the fault FD has an object pinned. This is also why the above
order is backwards.

Jason
Re: [PATCH v6 06/10] iommufd: Add iommufd fault object
Posted by Baolu Lu 1 year, 8 months ago
On 6/12/24 9:25 PM, Jason Gunthorpe wrote:
> On Sat, Jun 08, 2024 at 05:58:34PM +0800, Baolu Lu wrote:
> 
>>>> +static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
>>>> +{
>>>> +	struct iommufd_fault *fault = filep->private_data;
>>>> +
>>>> +	iommufd_ctx_put(fault->ictx);
>>>> +	refcount_dec(&fault->obj.users);
>>>> +	return 0;
> 
> This is in the wrong order, dec users before ctx_put.

Done.

Best regards,
baolu