An iommufd fault object provides an interface for delivering I/O page
faults to user space. These objects are created and destroyed by user
space, and they can be associated with or dissociated from hardware page
table objects during page table allocation or destruction.
User space interacts with the fault object through a file interface. This
interface offers a straightforward and efficient way for user space to
handle page faults. It allows user space to read fault messages
sequentially and respond to them by writing to the same file. The file
interface supports polling for new fault messages, and it is recommended that
user space applications use io_uring to enhance read and write efficiency.
A fault object can be associated with any iopf-capable iommufd_hw_pgtable
during the pgtable's allocation. All I/O page faults triggered by devices
when accessing the I/O addresses of an iommufd_hw_pgtable are routed
through the fault object to user space. Similarly, user space's responses
to these page faults are routed back to the iommu device driver through
the same fault object.
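For illustration, a minimal sketch of the expected user space flow (not part
of the patch itself): the fault fd is assumed to come from the
IOMMU_FAULT_ALLOC ioctl below, error handling is omitted, and the "success"
response code value is an assumption.

	#include <poll.h>
	#include <unistd.h>
	#include <linux/iommufd.h>

	static void handle_one_fault(int fault_fd)
	{
		struct iommu_hwpt_pgfault req;
		struct iommu_hwpt_page_response resp = {};
		struct pollfd pfd = { .fd = fault_fd, .events = POLLIN };

		/* Block until at least one fault message is queued. */
		poll(&pfd, 1, -1);

		/* read() only ever returns whole fault messages. */
		if (read(fault_fd, &req, sizeof(req)) != sizeof(req))
			return;

		/* ... resolve the fault for req.addr in the I/O page table ... */

		resp.size = sizeof(resp);
		resp.dev_id = req.dev_id;
		resp.cookie = req.cookie;
		resp.code = 0;	/* assumed "success" response code */
		write(fault_fd, &resp, sizeof(resp));
	}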
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
include/linux/iommu.h | 3 +
drivers/iommu/iommu-priv.h | 4 +
drivers/iommu/iommufd/iommufd_private.h | 24 +++
include/uapi/linux/iommufd.h | 18 ++
drivers/iommu/iommufd/device.c | 1 +
drivers/iommu/iommufd/fault.c | 238 ++++++++++++++++++++++++
drivers/iommu/iommufd/main.c | 6 +
drivers/iommu/iommufd/Makefile | 1 +
8 files changed, 295 insertions(+)
create mode 100644 drivers/iommu/iommufd/fault.c
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4ff68d50ae39..a7301b27e01f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -131,6 +131,9 @@ struct iopf_group {
struct iommu_attach_handle *attach_handle;
/* The device's fault data parameter. */
struct iommu_fault_param *fault_param;
+ /* Used by handler provider to hook the group on its own lists. */
+ struct list_head node;
+ u32 cookie;
};
/**
diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
index ae65e0b85d69..1a0450a83bd0 100644
--- a/drivers/iommu/iommu-priv.h
+++ b/drivers/iommu/iommu-priv.h
@@ -36,6 +36,10 @@ struct iommu_attach_handle {
struct device *dev;
refcount_t users;
};
+ /* attach data for IOMMUFD */
+ struct {
+ void *idev;
+ };
};
};
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..acb89bbf9f8e 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -128,6 +128,7 @@ enum iommufd_object_type {
IOMMUFD_OBJ_HWPT_NESTED,
IOMMUFD_OBJ_IOAS,
IOMMUFD_OBJ_ACCESS,
+ IOMMUFD_OBJ_FAULT,
#ifdef CONFIG_IOMMUFD_TEST
IOMMUFD_OBJ_SELFTEST,
#endif
@@ -395,6 +396,8 @@ struct iommufd_device {
/* always the physical device */
struct device *dev;
bool enforce_cache_coherency;
+ /* outstanding faults awaiting response indexed by fault group id */
+ struct xarray faults;
};
static inline struct iommufd_device *
@@ -426,6 +429,27 @@ void iopt_remove_access(struct io_pagetable *iopt,
u32 iopt_access_list_id);
void iommufd_access_destroy_object(struct iommufd_object *obj);
+/*
+ * An iommufd_fault object represents an interface to deliver I/O page faults
+ * to the user space. These objects are created/destroyed by the user space and
+ * associated with hardware page table objects during page-table allocation.
+ */
+struct iommufd_fault {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct file *filep;
+
+ /* The lists of outstanding faults protected by below mutex. */
+ struct mutex mutex;
+ struct list_head deliver;
+ struct list_head response;
+
+ struct wait_queue_head wait_queue;
+};
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_fault_destroy(struct iommufd_object *obj);
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 83b45dce94a4..1819b28e9e6b 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -50,6 +50,7 @@ enum {
IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
IOMMUFD_CMD_HWPT_INVALIDATE,
+ IOMMUFD_CMD_FAULT_ALLOC,
};
/**
@@ -788,4 +789,21 @@ struct iommu_hwpt_page_response {
__u32 cookie;
__u32 reserved;
};
+
+/**
+ * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_ALLOC)
+ * @size: sizeof(struct iommu_fault_alloc)
+ * @flags: Must be 0
+ * @out_fault_id: The ID of the new FAULT
+ * @out_fault_fd: The fd of the new FAULT
+ *
+ * Explicitly allocate a fault handling object.
+ */
+struct iommu_fault_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 out_fault_id;
+ __u32 out_fault_fd;
+};
+#define IOMMU_FAULT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_ALLOC)
#endif
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 873630c111c1..22e22969363a 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -215,6 +215,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
refcount_inc(&idev->obj.users);
/* igroup refcount moves into iommufd_device */
idev->igroup = igroup;
+ xa_init_flags(&idev->faults, XA_FLAGS_ALLOC1);
/*
* If the caller fails after this success it must call
diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
new file mode 100644
index 000000000000..13125c0feecb
--- /dev/null
+++ b/drivers/iommu/iommufd/fault.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/iommufd.h>
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/iommufd.h>
+
+#include "../iommu-priv.h"
+#include "iommufd_private.h"
+
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj);
+ struct iopf_group *group, *next;
+
+ /*
+ * The iommufd object's reference count is zero at this point.
+ * We can be confident that no other threads are currently
+ * accessing this pointer. Therefore, acquiring the mutex here
+ * is unnecessary.
+ */
+ list_for_each_entry_safe(group, next, &fault->deliver, node) {
+ list_del(&group->node);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+}
+
+static void iommufd_compose_fault_message(struct iommu_fault *fault,
+ struct iommu_hwpt_pgfault *hwpt_fault,
+ struct iommufd_device *idev,
+ u32 cookie)
+{
+ hwpt_fault->size = sizeof(*hwpt_fault);
+ hwpt_fault->flags = fault->prm.flags;
+ hwpt_fault->dev_id = idev->obj.id;
+ hwpt_fault->pasid = fault->prm.pasid;
+ hwpt_fault->grpid = fault->prm.grpid;
+ hwpt_fault->perm = fault->prm.perm;
+ hwpt_fault->addr = fault->prm.addr;
+ hwpt_fault->length = 0;
+ hwpt_fault->cookie = cookie;
+}
+
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+ struct iommufd_fault *fault = filep->private_data;
+ struct iommu_hwpt_pgfault data;
+ struct iommufd_device *idev;
+ struct iopf_group *group;
+ struct iopf_fault *iopf;
+ size_t done = 0;
+ int rc;
+
+ if (*ppos || count % fault_size)
+ return -ESPIPE;
+
+ mutex_lock(&fault->mutex);
+ while (!list_empty(&fault->deliver) && count > done) {
+ group = list_first_entry(&fault->deliver,
+ struct iopf_group, node);
+
+ if (list_count_nodes(&group->faults) * fault_size > count - done)
+ break;
+
+ idev = group->attach_handle->idev;
+ if (!idev)
+ break;
+
+ rc = xa_alloc(&idev->faults, &group->cookie, group,
+ xa_limit_32b, GFP_KERNEL);
+ if (rc)
+ break;
+
+ list_for_each_entry(iopf, &group->faults, list) {
+ iommufd_compose_fault_message(&iopf->fault,
+ &data, idev,
+ group->cookie);
+ rc = copy_to_user(buf + done, &data, fault_size);
+ if (rc) {
+ xa_erase(&idev->faults, group->cookie);
+ break;
+ }
+ done += fault_size;
+ }
+
+ list_del(&group->node);
+ }
+ mutex_unlock(&fault->mutex);
+
+ return done;
+}
+
+static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ size_t response_size = sizeof(struct iommu_hwpt_page_response);
+ struct iommufd_fault *fault = filep->private_data;
+ struct iommu_hwpt_page_response response;
+ struct iommufd_device *idev = NULL;
+ struct iopf_group *group;
+ size_t done = 0;
+ int rc;
+
+ if (*ppos || count % response_size)
+ return -ESPIPE;
+
+ mutex_lock(&fault->mutex);
+ while (count > done) {
+ rc = copy_from_user(&response, buf + done, response_size);
+ if (rc)
+ break;
+
+ if (!idev || idev->obj.id != response.dev_id)
+ idev = container_of(iommufd_get_object(fault->ictx,
+ response.dev_id,
+ IOMMUFD_OBJ_DEVICE),
+ struct iommufd_device, obj);
+ if (IS_ERR(idev))
+ break;
+
+ group = xa_erase(&idev->faults, response.cookie);
+ if (!group)
+ break;
+
+ iopf_group_response(group, response.code);
+ iopf_free_group(group);
+ done += response_size;
+
+ iommufd_put_object(fault->ictx, &idev->obj);
+ }
+ mutex_unlock(&fault->mutex);
+
+ return done;
+}
+
+static __poll_t iommufd_fault_fops_poll(struct file *filep,
+ struct poll_table_struct *wait)
+{
+ struct iommufd_fault *fault = filep->private_data;
+ __poll_t pollflags = EPOLLOUT;
+
+ poll_wait(filep, &fault->wait_queue, wait);
+ mutex_lock(&fault->mutex);
+ if (!list_empty(&fault->deliver))
+ pollflags |= EPOLLIN | EPOLLRDNORM;
+ mutex_unlock(&fault->mutex);
+
+ return pollflags;
+}
+
+static int iommufd_fault_fops_release(struct inode *inode, struct file *filep)
+{
+ struct iommufd_fault *fault = filep->private_data;
+
+ iommufd_ctx_put(fault->ictx);
+ refcount_dec(&fault->obj.users);
+ return 0;
+}
+
+static const struct file_operations iommufd_fault_fops = {
+ .owner = THIS_MODULE,
+ .open = nonseekable_open,
+ .read = iommufd_fault_fops_read,
+ .write = iommufd_fault_fops_write,
+ .poll = iommufd_fault_fops_poll,
+ .release = iommufd_fault_fops_release,
+ .llseek = no_llseek,
+};
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_fault_alloc *cmd = ucmd->cmd;
+ struct iommufd_fault *fault;
+ struct file *filep;
+ int fdno;
+ int rc;
+
+ if (cmd->flags)
+ return -EOPNOTSUPP;
+
+ fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT);
+ if (IS_ERR(fault))
+ return PTR_ERR(fault);
+
+ fault->ictx = ucmd->ictx;
+ INIT_LIST_HEAD(&fault->deliver);
+ mutex_init(&fault->mutex);
+ init_waitqueue_head(&fault->wait_queue);
+
+ filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops,
+ fault, O_RDWR);
+ if (IS_ERR(filep)) {
+ rc = PTR_ERR(filep);
+ goto out_abort;
+ }
+
+ refcount_inc(&fault->obj.users);
+ iommufd_ctx_get(fault->ictx);
+ fault->filep = filep;
+
+ fdno = get_unused_fd_flags(O_CLOEXEC);
+ if (fdno < 0) {
+ rc = fdno;
+ goto out_fput;
+ }
+
+ cmd->out_fault_id = fault->obj.id;
+ cmd->out_fault_fd = fdno;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put_fdno;
+ iommufd_object_finalize(ucmd->ictx, &fault->obj);
+
+ fd_install(fdno, fault->filep);
+
+ return 0;
+out_put_fdno:
+ put_unused_fd(fdno);
+out_fput:
+ fput(filep);
+ refcount_dec(&fault->obj.users);
+ iommufd_ctx_put(fault->ictx);
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj);
+
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b32932c61e..792db077d33e 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -332,6 +332,7 @@ union ucmd_buffer {
struct iommu_ioas_unmap unmap;
struct iommu_option option;
struct iommu_vfio_ioas vfio_ioas;
+ struct iommu_fault_alloc fault;
#ifdef CONFIG_IOMMUFD_TEST
struct iommu_test_cmd test;
#endif
@@ -381,6 +382,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
val64),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
+ IOCTL_OP(IOMMU_FAULT_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc,
+ out_fault_fd),
#ifdef CONFIG_IOMMUFD_TEST
IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
@@ -513,6 +516,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
.destroy = iommufd_hwpt_nested_destroy,
.abort = iommufd_hwpt_nested_abort,
},
+ [IOMMUFD_OBJ_FAULT] = {
+ .destroy = iommufd_fault_destroy,
+ },
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 34b446146961..b94a74366eed 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -6,6 +6,7 @@ iommufd-y := \
ioas.o \
main.o \
pages.o \
+ fault.o \
vfio_compat.o
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
--
2.34.1
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Tuesday, April 30, 2024 10:57 PM
>
> @@ -131,6 +131,9 @@ struct iopf_group {
> struct iommu_attach_handle *attach_handle;
> /* The device's fault data parameter. */
> struct iommu_fault_param *fault_param;
> + /* Used by handler provider to hook the group on its own lists. */
> + struct list_head node;
> + u32 cookie;
better put together with attach_handle.
rename 'node' to 'handle_node'
> @@ -128,6 +128,7 @@ enum iommufd_object_type {
> IOMMUFD_OBJ_HWPT_NESTED,
> IOMMUFD_OBJ_IOAS,
> IOMMUFD_OBJ_ACCESS,
> + IOMMUFD_OBJ_FAULT,
Agree with Jason that 'FAULT_QUEUE' sounds a clearer object name.
> @@ -395,6 +396,8 @@ struct iommufd_device {
> /* always the physical device */
> struct device *dev;
> bool enforce_cache_coherency;
> + /* outstanding faults awaiting response indexed by fault group id */
> + struct xarray faults;
this...
> +struct iommufd_fault {
> + struct iommufd_object obj;
> + struct iommufd_ctx *ictx;
> + struct file *filep;
> +
> + /* The lists of outstanding faults protected by below mutex. */
> + struct mutex mutex;
> + struct list_head deliver;
> + struct list_head response;
...and here worth a discussion.
First the response list is not used. If continuing the choice of queueing
faults per device it should be removed.
But I wonder whether it makes more sense to keep this response
queue per fault object. sounds simpler to me.
Also it's unclear why we need the response message to carry the
same info as the request while only id/code/cookie are used.
+struct iommu_hwpt_page_response {
+ __u32 size;
+ __u32 flags;
+ __u32 dev_id;
+ __u32 pasid;
+ __u32 grpid;
+ __u32 code;
+ __u32 cookie;
+ __u32 reserved;
+};
If we keep the response queue in the fault object, the response message
only needs to carry size/flags/code/cookie and cookie can identify the
pending message uniquely in the response queue.
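As a sketch only (not code from the posted series), such a reduced response
message could look like:

	/* Hypothetical layout: with a per-fault-object response queue the
	 * cookie alone identifies the pending fault, so the device/PASID/
	 * group fields become unnecessary.
	 */
	struct iommu_hwpt_page_response {
		__u32 size;
		__u32 flags;
		__u32 code;
		__u32 cookie;
	};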
> +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user
> *buf,
> + size_t count, loff_t *ppos)
> +{
> + size_t response_size = sizeof(struct iommu_hwpt_page_response);
> + struct iommufd_fault *fault = filep->private_data;
> + struct iommu_hwpt_page_response response;
> + struct iommufd_device *idev = NULL;
> + struct iopf_group *group;
> + size_t done = 0;
> + int rc;
> +
> + if (*ppos || count % response_size)
> + return -ESPIPE;
> +
> + mutex_lock(&fault->mutex);
> + while (count > done) {
> + rc = copy_from_user(&response, buf + done, response_size);
> + if (rc)
> + break;
> +
> + if (!idev || idev->obj.id != response.dev_id)
> + idev = container_of(iommufd_get_object(fault->ictx,
> + response.dev_id,
> +
> IOMMUFD_OBJ_DEVICE),
> + struct iommufd_device, obj);
> + if (IS_ERR(idev))
> + break;
> +
> + group = xa_erase(&idev->faults, response.cookie);
> + if (!group)
> + break;
is 'continue' better?
> +
> + iopf_group_response(group, response.code);
PCIe spec states that a response failure disables the PRI interface. For SR-IOV
it'd be dangerous allowing user to trigger such code to VF to close the entire
shared PRI interface.
Just another example lacking of coordination for shared capabilities between
PF/VF. But exposing such gap to userspace makes it worse.
I guess we don't want to make this work depending on that cleanup. The
minimal correct thing is to disallow attaching VF to a fault-capable hwpt
with a note here that once we turn on support for VF the response failure
code should not be forwarded to the hardware. Instead it's an indication
that the user cannot serve more requests and such situation waits for
a vPRI reset to recover.
> + iopf_free_group(group);
> + done += response_size;
> +
> + iommufd_put_object(fault->ictx, &idev->obj);
get/put is unpaired:
if (!idev || idev->obj.id != response.dev_id)
idev = iommufd_get_object();
...
iommufd_put_object(idev);
The intention might be reusing idev if multiple fault responses are
for a same idev. But idev is always put in each iteration then following
messages will access the idev w/o holding the reference.
On 5/15/24 4:37 PM, Tian, Kevin wrote:
>> +		iopf_free_group(group);
>> +		done += response_size;
>> +
>> +		iommufd_put_object(fault->ictx, &idev->obj);
> get/put is unpaired:
>
> 	if (!idev || idev->obj.id != response.dev_id)
> 		idev = iommufd_get_object();
>
> 	...
>
> 	iommufd_put_object(idev);
>
> The intention might be reusing idev if multiple fault responses are
> for a same idev. But idev is always put in each iteration then following
> messages will access the idev w/o holding the reference.

Good catch. Let me fix it by putting the response queue in the fault
object.

Best regards,
baolu
On 5/15/24 4:37 PM, Tian, Kevin wrote:
>> +
>> +		iopf_group_response(group, response.code);
> PCIe spec states that a response failure disables the PRI interface. For SR-IOV
> it'd be dangerous allowing user to trigger such code to VF to close the entire
> shared PRI interface.
>
> Just another example lacking of coordination for shared capabilities between
> PF/VF. But exposing such gap to userspace makes it worse.

Yes. You are right.

> I guess we don't want to make this work depending on that cleanup. The
> minimal correct thing is to disallow attaching VF to a fault-capable hwpt
> with a note here that once we turn on support for VF the response failure
> code should not be forwarded to the hardware. Instead it's an indication
> that the user cannot serve more requests and such situation waits for
> a vPRI reset to recover.

Is it the same thing to disallow PRI for VF in IOMMUFD?

Best regards,
baolu
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Monday, May 20, 2024 9:34 AM
>
> On 5/15/24 4:37 PM, Tian, Kevin wrote:
> >> +
> >> +		iopf_group_response(group, response.code);
> > PCIe spec states that a response failure disables the PRI interface. For SR-IOV
> > it'd be dangerous allowing user to trigger such code to VF to close the entire
> > shared PRI interface.
> >
> > Just another example lacking of coordination for shared capabilities between
> > PF/VF. But exposing such gap to userspace makes it worse.
>
> Yes. You are right.
>
> > I guess we don't want to make this work depending on that cleanup. The
> > minimal correct thing is to disallow attaching VF to a fault-capable hwpt
> > with a note here that once we turn on support for VF the response failure
> > code should not be forwarded to the hardware. Instead it's an indication
> > that the user cannot serve more requests and such situation waits for
> > a vPRI reset to recover.
>
> Is it the same thing to disallow PRI for VF in IOMMUFD?
>

yes
On 5/15/24 4:37 PM, Tian, Kevin wrote:
>> +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user
>> *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + size_t response_size = sizeof(struct iommu_hwpt_page_response);
>> + struct iommufd_fault *fault = filep->private_data;
>> + struct iommu_hwpt_page_response response;
>> + struct iommufd_device *idev = NULL;
>> + struct iopf_group *group;
>> + size_t done = 0;
>> + int rc;
>> +
>> + if (*ppos || count % response_size)
>> + return -ESPIPE;
>> +
>> + mutex_lock(&fault->mutex);
>> + while (count > done) {
>> + rc = copy_from_user(&response, buf + done, response_size);
>> + if (rc)
>> + break;
>> +
>> + if (!idev || idev->obj.id != response.dev_id)
>> + idev = container_of(iommufd_get_object(fault->ictx,
>> + response.dev_id,
>> +
>> IOMMUFD_OBJ_DEVICE),
>> + struct iommufd_device, obj);
>> + if (IS_ERR(idev))
>> + break;
>> +
>> + group = xa_erase(&idev->faults, response.cookie);
>> + if (!group)
>> + break;
> is 'continue' better?
If we can't find a matched iopf group here, it means userspace provided
something wrong. The current logic is that we stop here and tell
userspace that only part of the faults have been responded to and it
should retry the remaining responses with the right message.
Best regards,
baolu
On Mon, May 20, 2024 at 09:24:09AM +0800, Baolu Lu wrote:
> On 5/15/24 4:37 PM, Tian, Kevin wrote:
> > > +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user
> > > *buf,
> > > + size_t count, loff_t *ppos)
> > > +{
> > > + size_t response_size = sizeof(struct iommu_hwpt_page_response);
> > > + struct iommufd_fault *fault = filep->private_data;
> > > + struct iommu_hwpt_page_response response;
> > > + struct iommufd_device *idev = NULL;
> > > + struct iopf_group *group;
> > > + size_t done = 0;
> > > + int rc;
> > > +
> > > + if (*ppos || count % response_size)
> > > + return -ESPIPE;
> > > +
> > > + mutex_lock(&fault->mutex);
> > > + while (count > done) {
> > > + rc = copy_from_user(&response, buf + done, response_size);
> > > + if (rc)
> > > + break;
> > > +
> > > + if (!idev || idev->obj.id != response.dev_id)
> > > + idev = container_of(iommufd_get_object(fault->ictx,
> > > + response.dev_id,
> > > +
> > > IOMMUFD_OBJ_DEVICE),
> > > + struct iommufd_device, obj);
> > > + if (IS_ERR(idev))
> > > + break;
> > > +
> > > + group = xa_erase(&idev->faults, response.cookie);
> > > + if (!group)
> > > + break;
> > is 'continue' better?
>
> If we can't find a matched iopf group here, it means userspace provided
> something wrong. The current logic is that we stop here and tell
> userspace that only part of the faults have been responded to and it
> should retry the remaining responses with the right message.
The usual fd-ish error handling here should be to return a short write
(success) and then userspace will retry with the failing entry at the
start of the buffer and collect the errno.
Jason
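For illustration, a user space write loop following the short-write
convention described above might look like the sketch below (a hypothetical
helper, not from the series; error handling is minimal).

	#include <errno.h>
	#include <unistd.h>
	#include <linux/iommufd.h>

	/*
	 * write() consumes whole responses only, so a retry can start at the
	 * first unconsumed entry and the remaining count stays a multiple of
	 * the response size; the errno is only meaningful when nothing at
	 * all was consumed.
	 */
	static int send_responses(int fault_fd,
				  struct iommu_hwpt_page_response *resp,
				  size_t nresp)
	{
		size_t total = nresp * sizeof(*resp);
		size_t done = 0;

		while (done < total) {
			ssize_t ret = write(fault_fd, (char *)resp + done,
					    total - done);

			if (ret > 0) {
				done += ret;	/* partial progress, retry the rest */
				continue;
			}
			if (ret < 0 && errno == EINTR)
				continue;
			return ret < 0 ? -errno : -EIO;	/* nothing consumed */
		}
		return 0;
	}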
On 5/15/24 4:37 PM, Tian, Kevin wrote:
>> @@ -395,6 +396,8 @@ struct iommufd_device {
>> /* always the physical device */
>> struct device *dev;
>> bool enforce_cache_coherency;
>> + /* outstanding faults awaiting response indexed by fault group id */
>> + struct xarray faults;
> this...
>
>> +struct iommufd_fault {
>> + struct iommufd_object obj;
>> + struct iommufd_ctx *ictx;
>> + struct file *filep;
>> +
>> + /* The lists of outstanding faults protected by below mutex. */
>> + struct mutex mutex;
>> + struct list_head deliver;
>> + struct list_head response;
> ...and here worth a discussion.
>
> First the response list is not used. If continuing the choice of queueing
> faults per device it should be removed.
You are right. I have removed the response list in the new version.
>
> But I wonder whether it makes more sense to keep this response
> queue per fault object. sounds simpler to me.
>
> Also it's unclear why we need the response message to carry the
> same info as the request while only id/code/cookie are used.
>
> +struct iommu_hwpt_page_response {
> + __u32 size;
> + __u32 flags;
> + __u32 dev_id;
> + __u32 pasid;
> + __u32 grpid;
> + __u32 code;
> + __u32 cookie;
> + __u32 reserved;
> +};
>
> If we keep the response queue in the fault object, the response message
> only needs to carry size/flags/code/cookie and cookie can identify the
> pending message uniquely in the response queue.
It seems fine from the code's point of view. Let's wait and see whether
there are any concerns from the uAPI's perspective.
Best regards,
baolu
On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> + struct iommufd_fault *fault = filep->private_data;
> + struct iommu_hwpt_pgfault data;
> + struct iommufd_device *idev;
> + struct iopf_group *group;
> + struct iopf_fault *iopf;
> + size_t done = 0;
> + int rc;
> +
> + if (*ppos || count % fault_size)
> + return -ESPIPE;
> +
> + mutex_lock(&fault->mutex);
> + while (!list_empty(&fault->deliver) && count > done) {
> + group = list_first_entry(&fault->deliver,
> + struct iopf_group, node);
> +
> + if (list_count_nodes(&group->faults) * fault_size > count - done)
> + break;
Can this list_count be precomputed when we build the fault group?
> +
> + idev = group->attach_handle->idev;
> + if (!idev)
> + break;
This check should be done before adding the fault to the linked
list. See my other note about the race.
> +
> + rc = xa_alloc(&idev->faults, &group->cookie, group,
> + xa_limit_32b, GFP_KERNEL);
> + if (rc)
> + break;
This error handling is not quite right, if done == 0 then this should
return rc.
> +
> + list_for_each_entry(iopf, &group->faults, list) {
> + iommufd_compose_fault_message(&iopf->fault,
> + &data, idev,
> + group->cookie);
> + rc = copy_to_user(buf + done, &data, fault_size);
> + if (rc) {
> + xa_erase(&idev->faults, group->cookie);
> + break;
Same here
(same comment on the write side too)
Jason
On 2024/5/8 8:22, Jason Gunthorpe wrote:
> On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
>> +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
>> + struct iommufd_fault *fault = filep->private_data;
>> + struct iommu_hwpt_pgfault data;
>> + struct iommufd_device *idev;
>> + struct iopf_group *group;
>> + struct iopf_fault *iopf;
>> + size_t done = 0;
>> + int rc;
>> +
>> + if (*ppos || count % fault_size)
>> + return -ESPIPE;
>> +
>> + mutex_lock(&fault->mutex);
>> + while (!list_empty(&fault->deliver) && count > done) {
>> + group = list_first_entry(&fault->deliver,
>> + struct iopf_group, node);
>> +
>> + if (list_count_nodes(&group->faults) * fault_size > count - done)
>> + break;
>
> Can this list_count be precomputed when we build the fault group?
Yes. Done.
>
>> +
>> + idev = group->attach_handle->idev;
>> + if (!idev)
>> + break;
>
> This check should be done before adding the fault to the linked
> list. See my other note about the race.
Done.
>
>> +
>> + rc = xa_alloc(&idev->faults, &group->cookie, group,
>> + xa_limit_32b, GFP_KERNEL);
>> + if (rc)
>> + break;
>
> This error handling is not quite right, if done == 0 then this should
> return rc.
>
>
>> +
>> + list_for_each_entry(iopf, &group->faults, list) {
>> + iommufd_compose_fault_message(&iopf->fault,
>> + &data, idev,
>> + group->cookie);
>> + rc = copy_to_user(buf + done, &data, fault_size);
>> + if (rc) {
>> + xa_erase(&idev->faults, group->cookie);
>> + break;
>
> Same here
>
> (same comment on the write side too)
All fixed. Thank you!
Best regards,
baolu
On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
> index ae65e0b85d69..1a0450a83bd0 100644
> --- a/drivers/iommu/iommu-priv.h
> +++ b/drivers/iommu/iommu-priv.h
> @@ -36,6 +36,10 @@ struct iommu_attach_handle {
> struct device *dev;
> refcount_t users;
> };
> + /* attach data for IOMMUFD */
> + struct {
> + void *idev;
> + };
We can use a proper type here, just forward declare it.
But this sequence in the other patch:
+ ret = iommu_attach_group(hwpt->domain, idev->igroup->group);
+ if (ret) {
+ iommufd_fault_iopf_disable(idev);
+ return ret;
+ }
+
+ handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0);
+ handle->idev = idev;
Is why I was imagining the caller would allocate, because now we have
the issue that a fault capable domain was installed into the IOMMU
before it's handle could be fully setup, so we have a race where a
fault could come in right between those things. Then what happens?
I suppose we can retry the fault and by the time it comes back the
race should resolve. A bit ugly I suppose.
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 83b45dce94a4..1819b28e9e6b 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -50,6 +50,7 @@ enum {
> IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
> IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
> IOMMUFD_CMD_HWPT_INVALIDATE,
> + IOMMUFD_CMD_FAULT_ALLOC,
> };
I think I'd call this a CMD_FAULT_QUEUE_ALLOC - does that make sense?
Jason
On 2024/5/8 8:11, Jason Gunthorpe wrote:
> On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
>> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
>> index ae65e0b85d69..1a0450a83bd0 100644
>> --- a/drivers/iommu/iommu-priv.h
>> +++ b/drivers/iommu/iommu-priv.h
>> @@ -36,6 +36,10 @@ struct iommu_attach_handle {
>> struct device *dev;
>> refcount_t users;
>> };
>> + /* attach data for IOMMUFD */
>> + struct {
>> + void *idev;
>> + };
> We can use a proper type here, just forward declare it.
>
> But this sequence in the other patch:
>
> + ret = iommu_attach_group(hwpt->domain, idev->igroup->group);
> + if (ret) {
> + iommufd_fault_iopf_disable(idev);
> + return ret;
> + }
> +
> + handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0);
> + handle->idev = idev;
>
> Is why I was imagining the caller would allocate, because now we have
> the issue that a fault capable domain was installed into the IOMMU
> before it's handle could be fully setup, so we have a race where a
> fault could come in right between those things. Then what happens?
> I suppose we can retry the fault and by the time it comes back the
> race should resolve. A bit ugly I suppose.
You are right. It makes more sense if the attached data is allocated and
managed by the caller. I will go in this direction and update my series.
I will also consider other review comments you have given in other
places.
Best regards,
baolu
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Wednesday, May 8, 2024 6:05 PM
>
> On 2024/5/8 8:11, Jason Gunthorpe wrote:
> > On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
> >> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
> >> index ae65e0b85d69..1a0450a83bd0 100644
> >> --- a/drivers/iommu/iommu-priv.h
> >> +++ b/drivers/iommu/iommu-priv.h
> >> @@ -36,6 +36,10 @@ struct iommu_attach_handle {
> >> struct device *dev;
> >> refcount_t users;
> >> };
> >> + /* attach data for IOMMUFD */
> >> + struct {
> >> + void *idev;
> >> + };
> > We can use a proper type here, just forward declare it.
> >
> > But this sequence in the other patch:
> >
> > + ret = iommu_attach_group(hwpt->domain, idev->igroup->group);
> > + if (ret) {
> > + iommufd_fault_iopf_disable(idev);
> > + return ret;
> > + }
> > +
> > + handle = iommu_attach_handle_get(idev->igroup->group,
> IOMMU_NO_PASID, 0);
> > + handle->idev = idev;
> >
> > Is why I was imagining the caller would allocate, because now we have
> > the issue that a fault capable domain was installed into the IOMMU
> > before it's handle could be fully setup, so we have a race where a
> > fault could come in right between those things. Then what happens?
> > I suppose we can retry the fault and by the time it comes back the
> > race should resolve. A bit ugly I suppose.
>
> You are right. It makes more sense if the attached data is allocated and
> managed by the caller. I will go in this direction and update my series.
> I will also consider other review comments you have given in other
> places.
>
Does this direction imply a new iommu_attach_group_handle() helper
to pass in the caller-allocated handle pointer or exposing a new
iommu_group_set_handle() to set the handle to the group pasid_array
and then having iomm_attach_group() to update the domain info in
the handle?
On 5/15/24 3:57 PM, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Wednesday, May 8, 2024 6:05 PM
>>
>> On 2024/5/8 8:11, Jason Gunthorpe wrote:
>>> On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
>>>> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
>>>> index ae65e0b85d69..1a0450a83bd0 100644
>>>> --- a/drivers/iommu/iommu-priv.h
>>>> +++ b/drivers/iommu/iommu-priv.h
>>>> @@ -36,6 +36,10 @@ struct iommu_attach_handle {
>>>> struct device *dev;
>>>> refcount_t users;
>>>> };
>>>> + /* attach data for IOMMUFD */
>>>> + struct {
>>>> + void *idev;
>>>> + };
>>> We can use a proper type here, just forward declare it.
>>>
>>> But this sequence in the other patch:
>>>
>>> + ret = iommu_attach_group(hwpt->domain, idev->igroup->group);
>>> + if (ret) {
>>> + iommufd_fault_iopf_disable(idev);
>>> + return ret;
>>> + }
>>> +
>>> + handle = iommu_attach_handle_get(idev->igroup->group,
>> IOMMU_NO_PASID, 0);
>>> + handle->idev = idev;
>>>
>>> Is why I was imagining the caller would allocate, because now we have
>>> the issue that a fault capable domain was installed into the IOMMU
>>> before it's handle could be fully setup, so we have a race where a
>>> fault could come in right between those things. Then what happens?
>>> I suppose we can retry the fault and by the time it comes back the
>>> race should resolve. A bit ugly I suppose.
>>
>> You are right. It makes more sense if the attached data is allocated and
>> managed by the caller. I will go in this direction and update my series.
>> I will also consider other review comments you have given in other
>> places.
>>
>
> Does this direction imply a new iommu_attach_group_handle() helper
> to pass in the caller-allocated handle pointer or exposing a new
> iommu_group_set_handle() to set the handle to the group pasid_array
> and then having iomm_attach_group() to update the domain info in
> the handle?
I will add new iommu_attach/replace/detach_group_handle() helpers. Like
below:
+/**
+ * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU group
+ * @domain: IOMMU domain to attach
+ * @group: IOMMU group that will be attached
+ * @handle: attach handle
+ *
+ * Returns 0 on success and error code on failure.
+ *
+ * This is a variant of iommu_attach_group(). It allows the caller to
provide
+ * an attach handle and use it when the domain is attached. This is
currently
+ * only designed for IOMMUFD to deliver the I/O page faults.
+ */
+int iommu_attach_group_handle(struct iommu_domain *domain,
+ struct iommu_group *group,
+ struct iommu_attach_handle *handle)
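As an illustration of how a caller-allocated handle closes the race
discussed above, an attach path could look roughly like the sketch below.
This is only a sketch under assumed names (everything except
iommu_attach_group_handle() is hypothetical); the posted series may differ.

	/*
	 * The private data (idev) is populated before the domain is
	 * installed, so a fault arriving right after the attach always
	 * finds a fully initialized handle.
	 */
	struct iommufd_fault_attach {
		struct iommu_attach_handle handle;	/* passed to the core */
		struct iommufd_device *idev;
	};

	static int example_fault_attach(struct iommufd_hw_pagetable *hwpt,
					struct iommufd_device *idev)
	{
		struct iommufd_fault_attach *attach;
		int rc;

		attach = kzalloc(sizeof(*attach), GFP_KERNEL);
		if (!attach)
			return -ENOMEM;

		attach->idev = idev;	/* filled in before the domain is live */
		rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group,
					       &attach->handle);
		if (rc)
			kfree(attach);
		return rc;
	}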
Best regards,
baolu
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Monday, May 20, 2024 8:41 AM
>
> On 5/15/24 3:57 PM, Tian, Kevin wrote:
> >> From: Baolu Lu <baolu.lu@linux.intel.com>
> >> Sent: Wednesday, May 8, 2024 6:05 PM
> >>
> >> On 2024/5/8 8:11, Jason Gunthorpe wrote:
> >>> On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
> >>>> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-
> priv.h
> >>>> index ae65e0b85d69..1a0450a83bd0 100644
> >>>> --- a/drivers/iommu/iommu-priv.h
> >>>> +++ b/drivers/iommu/iommu-priv.h
> >>>> @@ -36,6 +36,10 @@ struct iommu_attach_handle {
> >>>> struct device *dev;
> >>>> refcount_t users;
> >>>> };
> >>>> + /* attach data for IOMMUFD */
> >>>> + struct {
> >>>> + void *idev;
> >>>> + };
> >>> We can use a proper type here, just forward declare it.
> >>>
> >>> But this sequence in the other patch:
> >>>
> >>> + ret = iommu_attach_group(hwpt->domain, idev->igroup->group);
> >>> + if (ret) {
> >>> + iommufd_fault_iopf_disable(idev);
> >>> + return ret;
> >>> + }
> >>> +
> >>> + handle = iommu_attach_handle_get(idev->igroup->group,
> >> IOMMU_NO_PASID, 0);
> >>> + handle->idev = idev;
> >>>
> >>> Is why I was imagining the caller would allocate, because now we have
> >>> the issue that a fault capable domain was installed into the IOMMU
> >>> before it's handle could be fully setup, so we have a race where a
> >>> fault could come in right between those things. Then what happens?
> >>> I suppose we can retry the fault and by the time it comes back the
> >>> race should resolve. A bit ugly I suppose.
> >>
> >> You are right. It makes more sense if the attached data is allocated and
> >> managed by the caller. I will go in this direction and update my series.
> >> I will also consider other review comments you have given in other
> >> places.
> >>
> >
> > Does this direction imply a new iommu_attach_group_handle() helper
> > to pass in the caller-allocated handle pointer or exposing a new
> > iommu_group_set_handle() to set the handle to the group pasid_array
> > and then having iomm_attach_group() to update the domain info in
> > the handle?
>
> I will add new iommu_attach/replace/detach_group_handle() helpers. Like
> below:
>
> +/**
> + * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU
> group
> + * @domain: IOMMU domain to attach
> + * @group: IOMMU group that will be attached
> + * @handle: attach handle
> + *
> + * Returns 0 on success and error code on failure.
> + *
> + * This is a variant of iommu_attach_group(). It allows the caller to
> provide
> + * an attach handle and use it when the domain is attached. This is
> currently
> + * only designed for IOMMUFD to deliver the I/O page faults.
> + */
> +int iommu_attach_group_handle(struct iommu_domain *domain,
> + struct iommu_group *group,
> + struct iommu_attach_handle *handle)
>
"currently only designed for IOMMUFD" doesn't sound correct.
design-wise this can be used by anyone which relies on the handle.
There is nothing tied to IOMMUFD.
s/designed for/used by/ is more accurate.
On 5/20/24 11:26 AM, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Monday, May 20, 2024 8:41 AM
>>
>> On 5/15/24 3:57 PM, Tian, Kevin wrote:
>>>> From: Baolu Lu <baolu.lu@linux.intel.com>
>>>> Sent: Wednesday, May 8, 2024 6:05 PM
>>>>
>>>> On 2024/5/8 8:11, Jason Gunthorpe wrote:
>>>>> On Tue, Apr 30, 2024 at 10:57:06PM +0800, Lu Baolu wrote:
>>>>>> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-
>> priv.h
>>>>>> index ae65e0b85d69..1a0450a83bd0 100644
>>>>>> --- a/drivers/iommu/iommu-priv.h
>>>>>> +++ b/drivers/iommu/iommu-priv.h
>>>>>> @@ -36,6 +36,10 @@ struct iommu_attach_handle {
>>>>>> struct device *dev;
>>>>>> refcount_t users;
>>>>>> };
>>>>>> + /* attach data for IOMMUFD */
>>>>>> + struct {
>>>>>> + void *idev;
>>>>>> + };
>>>>> We can use a proper type here, just forward declare it.
>>>>>
>>>>> But this sequence in the other patch:
>>>>>
>>>>> + ret = iommu_attach_group(hwpt->domain, idev->igroup->group);
>>>>> + if (ret) {
>>>>> + iommufd_fault_iopf_disable(idev);
>>>>> + return ret;
>>>>> + }
>>>>> +
>>>>> + handle = iommu_attach_handle_get(idev->igroup->group,
>>>> IOMMU_NO_PASID, 0);
>>>>> + handle->idev = idev;
>>>>>
>>>>> Is why I was imagining the caller would allocate, because now we have
>>>>> the issue that a fault capable domain was installed into the IOMMU
>>>>> before it's handle could be fully setup, so we have a race where a
>>>>> fault could come in right between those things. Then what happens?
>>>>> I suppose we can retry the fault and by the time it comes back the
>>>>> race should resolve. A bit ugly I suppose.
>>>>
>>>> You are right. It makes more sense if the attached data is allocated and
>>>> managed by the caller. I will go in this direction and update my series.
>>>> I will also consider other review comments you have given in other
>>>> places.
>>>>
>>>
>>> Does this direction imply a new iommu_attach_group_handle() helper
>>> to pass in the caller-allocated handle pointer or exposing a new
>>> iommu_group_set_handle() to set the handle to the group pasid_array
>>> and then having iomm_attach_group() to update the domain info in
>>> the handle?
>>
>> I will add new iommu_attach/replace/detach_group_handle() helpers. Like
>> below:
>>
>> +/**
>> + * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU
>> group
>> + * @domain: IOMMU domain to attach
>> + * @group: IOMMU group that will be attached
>> + * @handle: attach handle
>> + *
>> + * Returns 0 on success and error code on failure.
>> + *
>> + * This is a variant of iommu_attach_group(). It allows the caller to
>> provide
>> + * an attach handle and use it when the domain is attached. This is
>> currently
>> + * only designed for IOMMUFD to deliver the I/O page faults.
>> + */
>> +int iommu_attach_group_handle(struct iommu_domain *domain,
>> + struct iommu_group *group,
>> + struct iommu_attach_handle *handle)
>>
>
> "currently only designed for IOMMUFD" doesn't sound correct.
>
> design-wise this can be used by anyone which relies on the handle.
> There is nothing tied to IOMMUFD.
>
> s/designed for/used by/ is more accurate.
Done.
Best regards,
baolu