Introduce a new IOMMUFD_OBJ_VEVENTQ object for the vIOMMU Event Queue, which
provides user space (VMM) with another FD to read vIOMMU events.
Allow a vIOMMU object to allocate vEVENTQs, with the restriction that each
vIOMMU can have only a single vEVENTQ per type.
Add iommufd_veventq_alloc() with iommufd_veventq_fops for the new ioctl.
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/iommufd/iommufd_private.h | 88 ++++++++++
include/linux/iommufd.h | 3 +
include/uapi/linux/iommufd.h | 85 ++++++++++
drivers/iommu/iommufd/eventq.c | 206 +++++++++++++++++++++++-
drivers/iommu/iommufd/main.c | 7 +
drivers/iommu/iommufd/viommu.c | 2 +
6 files changed, 390 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index ee365c85dda9..7a8feedcea2e 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -519,6 +519,80 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
}
+/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */
+struct iommufd_vevent {
+ struct iommufd_vevent_header header;
+ struct list_head node; /* for iommufd_eventq::deliver */
+ bool on_list;
+ ssize_t data_len;
+ u64 event_data[] __counted_by(data_len);
+};
+
+#define vevent_for_overflow(vevent) \
+ (vevent->header.flags & IOMMU_VEVENTQ_FLAG_OVERFLOW)
+
+/*
+ * An iommufd_veventq object represents an interface to deliver vIOMMU events to
+ * the user space. It is created/destroyed by the user space and associated with
+ * vIOMMU object(s) during the allocations.
+ */
+struct iommufd_veventq {
+ struct iommufd_eventq common;
+ struct iommufd_viommu *viommu;
+ struct list_head node; /* for iommufd_viommu::veventqs */
+ struct iommufd_vevent overflow; /* pre-allocated overflow node */
+
+ unsigned int type;
+ unsigned int depth;
+
+ atomic_t num_events;
+ atomic_t sequence;
+};
+
+static inline struct iommufd_veventq *
+eventq_to_veventq(struct iommufd_eventq *eventq)
+{
+ return container_of(eventq, struct iommufd_veventq, common);
+}
+
+static inline struct iommufd_veventq *
+iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_VEVENTQ),
+ struct iommufd_veventq, common.obj);
+}
+
+int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_veventq_destroy(struct iommufd_object *obj);
+void iommufd_veventq_abort(struct iommufd_object *obj);
+
+static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq,
+ struct iommufd_vevent *vevent)
+{
+ struct iommufd_eventq *eventq = &veventq->common;
+
+ /*
+ * Remove the overflow node and add the new node at the same time. Note
+ * it is possible that vevent == &veventq->overflow for sequence update
+ */
+ spin_lock(&eventq->lock);
+ if (veventq->overflow.on_list) {
+ list_del(&veventq->overflow.node);
+ veventq->overflow.on_list = false;
+ }
+ list_add_tail(&vevent->node, &eventq->deliver);
+ vevent->on_list = true;
+ vevent->header.sequence = atomic_read(&veventq->sequence);
+ if (atomic_read(&veventq->sequence) == INT_MAX)
+ atomic_set(&veventq->sequence, 0);
+ else
+ atomic_inc(&veventq->sequence);
+ spin_unlock(&eventq->lock);
+
+ wake_up_interruptible(&eventq->wait_queue);
+}
+
static inline struct iommufd_viommu *
iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
{
@@ -527,6 +601,20 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
struct iommufd_viommu, obj);
}
+static inline struct iommufd_veventq *
+iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type)
+{
+ struct iommufd_veventq *veventq, *next;
+
+ lockdep_assert_held(&viommu->veventqs_rwsem);
+
+ list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) {
+ if (veventq->type == type)
+ return veventq;
+ }
+ return NULL;
+}
+
int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
void iommufd_viommu_destroy(struct iommufd_object *obj);
int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
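For context, a rough sketch of how a kernel-side producer is expected to feed
this queue. The actual reporting helper (which allocates the node and enforces
@depth) is added by a later patch, so the function name and the exact overflow
policy below are illustrative assumptions only:

/*
 * Illustrative sketch only (not part of this patch): the intended producer
 * flow around iommufd_vevent_handler() and the num_events/depth bounding.
 */
static int example_viommu_report_vevent(struct iommufd_veventq *veventq,
					const void *data, size_t data_len)
{
	struct iommufd_vevent *vevent;

	if (atomic_read(&veventq->num_events) >= veventq->depth) {
		/* Queue is full: log the pre-allocated overflow node instead */
		iommufd_vevent_handler(veventq, &veventq->overflow);
		return -EOVERFLOW;
	}

	vevent = kzalloc(sizeof(*vevent) + data_len, GFP_ATOMIC);
	if (!vevent) {
		iommufd_vevent_handler(veventq, &veventq->overflow);
		return -ENOMEM;
	}

	vevent->header.flags = 0;
	vevent->data_len = data_len;
	memcpy(vevent->event_data, data, data_len);
	atomic_inc(&veventq->num_events);
	iommufd_vevent_handler(veventq, vevent); /* assigns header.sequence */
	return 0;
}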
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 11110c749200..8948b1836940 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -34,6 +34,7 @@ enum iommufd_object_type {
IOMMUFD_OBJ_FAULT,
IOMMUFD_OBJ_VIOMMU,
IOMMUFD_OBJ_VDEVICE,
+ IOMMUFD_OBJ_VEVENTQ,
#ifdef CONFIG_IOMMUFD_TEST
IOMMUFD_OBJ_SELFTEST,
#endif
@@ -93,6 +94,8 @@ struct iommufd_viommu {
const struct iommufd_viommu_ops *ops;
struct xarray vdevs;
+ struct list_head veventqs;
+ struct rw_semaphore veventqs_rwsem;
unsigned int type;
};
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 78747b24bd0f..08cbc6bc3725 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -55,6 +55,7 @@ enum {
IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
+ IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
};
/**
@@ -1014,4 +1015,88 @@ struct iommu_ioas_change_process {
#define IOMMU_IOAS_CHANGE_PROCESS \
_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+/**
+ * enum iommu_veventq_flag - flag for struct iommufd_vevent_header
+ * @IOMMU_VEVENTQ_FLAG_OVERFLOW: vEVENTQ is overflowed
+ */
+enum iommu_veventq_flag {
+ IOMMU_VEVENTQ_FLAG_OVERFLOW = (1ULL << 0),
+};
+
+/**
+ * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
+ * @flags: Combination of enum iommu_veventq_flag
+ * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of
+ * [0, INT_MAX] where the following index of INT_MAX is 0
+ * @__reserved: Must be 0
+ *
+ * Each iommufd_vevent_header reports a sequence index of the following vEVENT:
+ * ---------------------------------------------------------------------------
+ * || header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN ||
+ * ---------------------------------------------------------------------------
+ * And this sequence index is expected to be monotonic to the sequence index of
+ * the previous vEVENT. If two adjacent sequence indexes have a delta larger than
+ * 1, it indicates that an overflow occurred in the vEVENTQ and that delta - 1
+ * vEVENTs were lost due to the overflow (e.g. two lost vEVENTs):
+ * ---------------------------------------------------------------------------
+ * || ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... ||
+ * ---------------------------------------------------------------------------
+ * If an overflow occurred to the tail of the vEVENTQ and there is no following
+ * vEVENT providing the next sequence index, a special overflow header would be
+ * added to the tail of the vEVENTQ, where there would be no more type-specific
+ * data following the vEVENTQ:
+ * ---------------------------------------------------------------------------
+ * ||...| header3 {sequence=3} | data3 | header4 {flags=OVERFLOW, sequence=4} ||
+ * ---------------------------------------------------------------------------
+ */
+struct iommufd_vevent_header {
+ __aligned_u64 flags;
+ __u32 sequence;
+ __u32 __reserved;
+};
+
+/**
+ * enum iommu_veventq_type - Virtual Event Queue Type
+ * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_veventq_type {
+ IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
+ * @size: sizeof(struct iommu_veventq_alloc)
+ * @flags: Must be 0
+ * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with
+ * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type
+ * @veventq_depth: Maximum number of events in the vEVENTQ
+ * @out_veventq_id: The ID of the new vEVENTQ
+ * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the
+ * successfully returned fd after using it
+ * @__reserved: Must be 0
+ *
+ * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
+ * can have multiple FDs for different types, but is confined to one per @type.
+ * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ,
+ * if there are vEVENTs available. A vEVENTQ will overflow if the number of
+ * vEVENTs hits @veventq_depth.
+ *
+ * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by
+ * a type-specific data structure, in a normal case:
+ * -------------------------------------------------------------
+ * || header0 | data0 | header1 | data1 | ... | headerN | dataN ||
+ * -------------------------------------------------------------
+ * unless a tailing overflow is logged (refer to struct iommufd_vevent_header).
+ */
+struct iommu_veventq_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 viommu_id;
+ __u32 type;
+ __u32 veventq_depth;
+ __u32 out_veventq_id;
+ __u32 out_veventq_fd;
+ __u32 __reserved;
+};
+#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
#endif
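The sequence/overflow semantics above are easiest to see from the consumer
side. A minimal userspace sketch; the per-type payload size is an assumption
here, since this patch defines no concrete event type yet:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Allocate a vEVENTQ on an existing vIOMMU and return its read-only fd */
static int veventq_alloc(int iommufd, uint32_t viommu_id, uint32_t type)
{
	struct iommu_veventq_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		.type = type,		/* a driver-specific type; DEFAULT is rejected */
		.veventq_depth = 128,	/* arbitrary example depth */
	};

	if (ioctl(iommufd, IOMMU_VEVENTQ_ALLOC, &cmd))
		return -1;
	return cmd.out_veventq_fd;	/* must be close()d by the caller */
}

/* Drain the queue once, reporting gaps in the sequence as lost vEVENTs */
static void veventq_drain(int veventq_fd, size_t data_len, int *last_seq)
{
	char buf[4096];
	ssize_t len = read(veventq_fd, buf, sizeof(buf));
	size_t pos = 0;

	while (len > 0 && pos + sizeof(struct iommufd_vevent_header) <= (size_t)len) {
		struct iommufd_vevent_header hdr;

		memcpy(&hdr, buf + pos, sizeof(hdr));
		pos += sizeof(hdr);

		if (*last_seq >= 0 &&
		    hdr.sequence != (uint32_t)((*last_seq + 1) & INT32_MAX))
			fprintf(stderr, "lost vEVENT(s) before sequence %u\n",
				hdr.sequence);
		*last_seq = hdr.sequence;

		if (hdr.flags & IOMMU_VEVENTQ_FLAG_OVERFLOW)
			continue;	/* the overflow header carries no payload */
		pos += data_len;	/* skip the type-specific data */
	}
}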
diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c
index 0da39c3dfcdb..a08c8ebaea62 100644
--- a/drivers/iommu/iommufd/eventq.c
+++ b/drivers/iommu/iommufd/eventq.c
@@ -382,13 +382,141 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b
return done == 0 ? rc : done;
}
+/* IOMMUFD_OBJ_VEVENTQ Functions */
+
+void iommufd_veventq_abort(struct iommufd_object *obj)
+{
+ struct iommufd_eventq *eventq =
+ container_of(obj, struct iommufd_eventq, obj);
+ struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+ struct iommufd_viommu *viommu = veventq->viommu;
+ struct iommufd_vevent *cur, *next;
+
+ lockdep_assert_held_write(&viommu->veventqs_rwsem);
+
+ list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
+ list_del(&cur->node);
+ kfree(cur);
+ }
+
+ refcount_dec(&viommu->obj.users);
+ list_del(&veventq->node);
+}
+
+void iommufd_veventq_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_veventq *veventq = eventq_to_veventq(
+ container_of(obj, struct iommufd_eventq, obj));
+
+ down_write(&veventq->viommu->veventqs_rwsem);
+ iommufd_veventq_abort(obj);
+ up_write(&veventq->viommu->veventqs_rwsem);
+}
+
+static struct iommufd_vevent *
+iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
+{
+ struct iommufd_eventq *eventq = &veventq->common;
+ struct list_head *list = &eventq->deliver;
+ struct iommufd_vevent *vevent = NULL;
+
+ spin_lock(&eventq->lock);
+ if (!list_empty(list)) {
+ vevent = list_first_entry(list, struct iommufd_vevent, node);
+ list_del(&vevent->node);
+ vevent->on_list = false;
+ }
+ /* Make a copy of the overflow node for copy_to_user */
+ if (vevent == &veventq->overflow) {
+ vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
+ if (vevent)
+ memcpy(vevent, &veventq->overflow, sizeof(*vevent));
+ }
+ spin_unlock(&eventq->lock);
+ return vevent;
+}
+
+static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq,
+ struct iommufd_vevent *vevent)
+{
+ struct iommufd_eventq *eventq = &veventq->common;
+ struct list_head *list = &eventq->deliver;
+
+ spin_lock(&eventq->lock);
+ if (vevent_for_overflow(vevent)) {
+ /* Remove the copy of the overflow node */
+ kfree(vevent);
+ vevent = NULL;
+ /* An empty list needs the overflow node back */
+ if (list_empty(list))
+ vevent = &veventq->overflow;
+ }
+ if (vevent) {
+ list_add(&vevent->node, list);
+ vevent->on_list = true;
+ }
+ spin_unlock(&eventq->lock);
+}
+
+static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct iommufd_eventq *eventq = filep->private_data;
+ struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+ struct iommufd_vevent_header *hdr;
+ struct iommufd_vevent *cur;
+ size_t done = 0;
+ int rc = 0;
+
+ if (*ppos)
+ return -ESPIPE;
+
+ while ((cur = iommufd_veventq_deliver_fetch(veventq))) {
+ /* Validate the remaining bytes against the header size */
+ if (done >= count || sizeof(*hdr) > count - done) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ break;
+ }
+ hdr = &cur->header;
+
+ /* If being a normal vEVENT, validate against the full size */
+ if (!vevent_for_overflow(cur) &&
+ sizeof(*hdr) + cur->data_len > count - done) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ break;
+ }
+
+ if (copy_to_user(buf + done, hdr, sizeof(*hdr))) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ rc = -EFAULT;
+ break;
+ }
+ done += sizeof(*hdr);
+
+ if (cur->data_len &&
+ copy_to_user(buf + done, cur->event_data, cur->data_len)) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ rc = -EFAULT;
+ break;
+ }
+ atomic_dec(&veventq->num_events);
+ done += cur->data_len;
+ kfree(cur);
+ }
+
+ return done == 0 ? rc : done;
+}
+
/* Common Event Queue Functions */
static __poll_t iommufd_eventq_fops_poll(struct file *filep,
struct poll_table_struct *wait)
{
struct iommufd_eventq *eventq = filep->private_data;
- __poll_t pollflags = EPOLLOUT;
+ __poll_t pollflags = 0;
+
+ if (eventq->obj.type == IOMMUFD_OBJ_FAULT)
+ pollflags |= EPOLLOUT;
poll_wait(filep, &eventq->wait_queue, wait);
spin_lock(&eventq->lock);
@@ -403,6 +531,10 @@ static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep)
{
struct iommufd_eventq *eventq = filep->private_data;
+ if (eventq->obj.type == IOMMUFD_OBJ_VEVENTQ) {
+ atomic_set(&eventq_to_veventq(eventq)->sequence, 0);
+ atomic_set(&eventq_to_veventq(eventq)->num_events, 0);
+ }
refcount_dec(&eventq->obj.users);
iommufd_ctx_put(eventq->ictx);
return 0;
@@ -508,3 +640,75 @@ int iommufd_fault_iopf_handler(struct iopf_group *group)
return 0;
}
+
+static const struct file_operations iommufd_veventq_fops =
+ INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL);
+
+int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_veventq_alloc *cmd = ucmd->cmd;
+ struct iommufd_veventq *veventq;
+ struct iommufd_viommu *viommu;
+ int fdno;
+ int rc;
+
+ if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+ if (!cmd->veventq_depth)
+ return -EINVAL;
+
+ viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+ if (IS_ERR(viommu))
+ return PTR_ERR(viommu);
+
+ down_write(&viommu->veventqs_rwsem);
+
+ if (iommufd_viommu_find_veventq(viommu, cmd->type)) {
+ rc = -EEXIST;
+ goto out_unlock_veventqs;
+ }
+
+ veventq = __iommufd_object_alloc(ucmd->ictx, veventq,
+ IOMMUFD_OBJ_VEVENTQ, common.obj);
+ if (IS_ERR(veventq)) {
+ rc = PTR_ERR(veventq);
+ goto out_unlock_veventqs;
+ }
+
+ veventq->type = cmd->type;
+ veventq->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ atomic_set(&veventq->sequence, 0);
+ atomic_set(&veventq->num_events, 0);
+ veventq->depth = cmd->veventq_depth;
+ list_add_tail(&veventq->node, &viommu->veventqs);
+ veventq->overflow.header.flags = IOMMU_VEVENTQ_FLAG_OVERFLOW;
+
+ fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]",
+ ucmd->ictx, &iommufd_veventq_fops);
+ if (fdno < 0) {
+ rc = fdno;
+ goto out_abort;
+ }
+
+ cmd->out_veventq_id = veventq->common.obj.id;
+ cmd->out_veventq_fd = fdno;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put_fdno;
+
+ iommufd_object_finalize(ucmd->ictx, &veventq->common.obj);
+ fd_install(fdno, veventq->common.filep);
+ goto out_unlock_veventqs;
+
+out_put_fdno:
+ put_unused_fd(fdno);
+ fput(veventq->common.filep);
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj);
+out_unlock_veventqs:
+ up_write(&viommu->veventqs_rwsem);
+ iommufd_put_object(ucmd->ictx, &viommu->obj);
+ return rc;
+}
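With the poll change in this hunk, a vEVENTQ FD only ever signals EPOLLIN; it
is read-only, unlike the fault queue, which is also writable for responses. A
small consumer loop as a sketch, with an assumed handle_events() parser:

#include <poll.h>
#include <unistd.h>

static void handle_events(const char *buf, ssize_t len)
{
	/* Parse the iommufd_vevent_header stream here (see the uAPI sketch) */
}

static void veventq_poll_loop(int veventq_fd)
{
	struct pollfd pfd = { .fd = veventq_fd, .events = POLLIN };
	char buf[4096];

	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		if (pfd.revents & POLLIN) {
			ssize_t len = read(veventq_fd, buf, sizeof(buf));

			if (len > 0)
				handle_events(buf, len);
		}
	}
}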
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index a11e9cfd790f..0d451601fb9a 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -308,6 +308,7 @@ union ucmd_buffer {
struct iommu_ioas_unmap unmap;
struct iommu_option option;
struct iommu_vdevice_alloc vdev;
+ struct iommu_veventq_alloc veventq;
struct iommu_vfio_ioas vfio_ioas;
struct iommu_viommu_alloc viommu;
#ifdef CONFIG_IOMMUFD_TEST
@@ -363,6 +364,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64),
IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
struct iommu_vdevice_alloc, virt_id),
+ IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc,
+ struct iommu_veventq_alloc, out_veventq_fd),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
@@ -505,6 +508,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_VDEVICE] = {
.destroy = iommufd_vdevice_destroy,
},
+ [IOMMUFD_OBJ_VEVENTQ] = {
+ .destroy = iommufd_veventq_destroy,
+ .abort = iommufd_veventq_abort,
+ },
[IOMMUFD_OBJ_VIOMMU] = {
.destroy = iommufd_viommu_destroy,
},
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 69b88e8c7c26..01df2b985f02 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -59,6 +59,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
viommu->ictx = ucmd->ictx;
viommu->hwpt = hwpt_paging;
refcount_inc(&viommu->hwpt->common.obj.users);
+ INIT_LIST_HEAD(&viommu->veventqs);
+ init_rwsem(&viommu->veventqs_rwsem);
/*
* It is the most likely case that a physical IOMMU is unpluggable. A
* pluggable IOMMU instance (if exists) is responsible for refcounting
--
2.43.0
On Fri, Jan 24, 2025 at 04:30:34PM -0800, Nicolin Chen wrote:
> + list_add_tail(&vevent->node, &eventq->deliver);
> + vevent->on_list = true;
> + vevent->header.sequence = atomic_read(&veventq->sequence);
> + if (atomic_read(&veventq->sequence) == INT_MAX)
> + atomic_set(&veventq->sequence, 0);
> + else
> + atomic_inc(&veventq->sequence);
> + spin_unlock(&eventq->lock);
This is all locked, we don't need veventq->sequence to be an atomic?
The bounding can be done with some simple math:
veventq->sequence = (veventq->sequence + 1) & INT_MAX;
> +static struct iommufd_vevent *
> +iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
> +{
> + struct iommufd_eventq *eventq = &veventq->common;
> + struct list_head *list = &eventq->deliver;
> + struct iommufd_vevent *vevent = NULL;
> +
> + spin_lock(&eventq->lock);
> + if (!list_empty(list)) {
> + vevent = list_first_entry(list, struct iommufd_vevent, node);
> + list_del(&vevent->node);
> + vevent->on_list = false;
> + }
> + /* Make a copy of the overflow node for copy_to_user */
> + if (vevent == &veventq->overflow) {
> + vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
> + if (vevent)
> + memcpy(vevent, &veventq->overflow, sizeof(*vevent));
> + }
This error handling is wonky: if we can't allocate, then we shouldn't
have done the list_del. Just return NULL, which will cause
iommufd_veventq_fops_read() to exit, and userspace will try again.
> @@ -403,6 +531,10 @@ static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep)
> {
> struct iommufd_eventq *eventq = filep->private_data;
>
> + if (eventq->obj.type == IOMMUFD_OBJ_VEVENTQ) {
> + atomic_set(&eventq_to_veventq(eventq)->sequence, 0);
> + atomic_set(&eventq_to_veventq(eventq)->num_events, 0);
> + }
Why? We are about to free the memory?
> +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> +{
> + struct iommu_veventq_alloc *cmd = ucmd->cmd;
> + struct iommufd_veventq *veventq;
> + struct iommufd_viommu *viommu;
> + int fdno;
> + int rc;
> +
> + if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> + return -EOPNOTSUPP;
> + if (!cmd->veventq_depth)
> + return -EINVAL;
Check __reserved for 0 too
Jason
On Tue, Feb 18, 2025 at 11:29:59AM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 24, 2025 at 04:30:34PM -0800, Nicolin Chen wrote:
> > + list_add_tail(&vevent->node, &eventq->deliver);
> > + vevent->on_list = true;
> > + vevent->header.sequence = atomic_read(&veventq->sequence);
> > + if (atomic_read(&veventq->sequence) == INT_MAX)
> > + atomic_set(&veventq->sequence, 0);
> > + else
> > + atomic_inc(&veventq->sequence);
> > + spin_unlock(&eventq->lock);
>
> This is all locked, we don't need veventq->sequence to be an atomic?
>
> The bounding can be done with some simple math:
>
> veventq->sequence = (veventq->sequence + 1) & INT_MAX;
Ack. Perhaps we can reuse eventq->lock to fence @num_events too.
> > +static struct iommufd_vevent *
> > +iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
> > +{
> > + struct iommufd_eventq *eventq = &veventq->common;
> > + struct list_head *list = &eventq->deliver;
> > + struct iommufd_vevent *vevent = NULL;
> > +
> > + spin_lock(&eventq->lock);
> > + if (!list_empty(list)) {
> > + vevent = list_first_entry(list, struct iommufd_vevent, node);
> > + list_del(&vevent->node);
> > + vevent->on_list = false;
> > + }
> > + /* Make a copy of the overflow node for copy_to_user */
> > + if (vevent == &veventq->overflow) {
> > + vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
> > + if (vevent)
> > + memcpy(vevent, &veventq->overflow, sizeof(*vevent));
> > + }
>
> This error handling is wonky: if we can't allocate, then we shouldn't
> have done the list_del. Just return NULL, which will cause
> iommufd_veventq_fops_read() to exit, and userspace will try again.
OK.
We have two cases to support here:
1) Normal vevent node -- list_del and return the node.
2) Overflow node -- list_del and return a copy.
I think we can do:
if (!list_empty(list)) {
struct iommufd_vevent *next;
next = list_first_entry(list, struct iommufd_vevent, node);
if (next == &veventq->overflow) {
/* Make a copy of the overflow node for copy_to_user */
vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
if (!vevent)
goto out_unlock;
}
list_del(&next->node);
if (vevent)
memcpy(vevent, next, sizeof(*vevent));
else
vevent = next;
}
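Folded back into the helper, that would look roughly like this sketch: on a
kzalloc failure the list is left untouched, so the reader simply returns and
retries. (The on_list bookkeeping is kept to match this version of the patch;
it goes away if the field is dropped as suggested elsewhere in the thread.)

static struct iommufd_vevent *
iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
{
	struct iommufd_eventq *eventq = &veventq->common;
	struct list_head *list = &eventq->deliver;
	struct iommufd_vevent *vevent = NULL;

	spin_lock(&eventq->lock);
	if (!list_empty(list)) {
		struct iommufd_vevent *next;

		next = list_first_entry(list, struct iommufd_vevent, node);
		if (next == &veventq->overflow) {
			/* Make a copy of the overflow node for copy_to_user */
			vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
			if (!vevent)
				goto out_unlock;
		}
		list_del(&next->node);
		next->on_list = false;
		if (vevent)
			memcpy(vevent, next, sizeof(*vevent));
		else
			vevent = next;
	}
out_unlock:
	spin_unlock(&eventq->lock);
	return vevent;
}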
> > @@ -403,6 +531,10 @@ static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep)
> > {
> > struct iommufd_eventq *eventq = filep->private_data;
> >
> > + if (eventq->obj.type == IOMMUFD_OBJ_VEVENTQ) {
> > + atomic_set(&eventq_to_veventq(eventq)->sequence, 0);
> > + atomic_set(&eventq_to_veventq(eventq)->num_events, 0);
> > + }
>
> Why? We are about to free the memory?
Ack. I thought about a re-entry of an open(). But release() does
lose the event_fd completely, and user space wouldn't be able to
open the same fd again.
> > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > +{
> > + struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > + struct iommufd_veventq *veventq;
> > + struct iommufd_viommu *viommu;
> > + int fdno;
> > + int rc;
> > +
> > + if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > + return -EOPNOTSUPP;
> > + if (!cmd->veventq_depth)
> > + return -EINVAL;
>
> Check __reserved for 0 too
Kevin is suggesting a 32-bit flag field, so I think we can drop
the __reserved in that case.
Thanks
Nicolin
On Tue, Feb 18, 2025 at 09:47:50AM -0800, Nicolin Chen wrote:
> I think we can do:
> if (!list_empty(list)) {
> struct iommufd_vevent *next;
>
> next = list_first_entry(list, struct iommufd_vevent, node);
> if (next == &veventq->overflow) {
> /* Make a copy of the overflow node for copy_to_user */
> vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
> if (!vevent)
> goto out_unlock;
> }
> list_del(&next->node);
> if (vevent)
> memcpy(vevent, next, sizeof(*vevent));
> else
> vevent = next;
> }
That looks right
> > > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > > +{
> > > + struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > > + struct iommufd_veventq *veventq;
> > > + struct iommufd_viommu *viommu;
> > > + int fdno;
> > > + int rc;
> > > +
> > > + if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > > + return -EOPNOTSUPP;
> > > + if (!cmd->veventq_depth)
> > > + return -EINVAL;
> >
> > Check __reserved for 0 too
>
> Kevin is suggesting a 32-bit flag field, so I think we can drop
> the __reserved in that case.
Those are different structs?
Jason
On Tue, Feb 18, 2025 at 02:08:05PM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 18, 2025 at 09:47:50AM -0800, Nicolin Chen wrote:
> > > > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > > > +{
> > > > + struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > > > + struct iommufd_veventq *veventq;
> > > > + struct iommufd_viommu *viommu;
> > > > + int fdno;
> > > > + int rc;
> > > > +
> > > > + if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > > > + return -EOPNOTSUPP;
> > > > + if (!cmd->veventq_depth)
> > > > + return -EINVAL;
> > >
> > > Check __reserved for 0 too
> >
> > Kevin is suggesting a 32-bit flag field, so I think we can drop
> > the __reserved in that case.
>
> Those are different structs?
Oops. Right. I will check the __reserved.
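For instance, the entry checks could become something like this sketch
(whether __reserved belongs with -EINVAL or -EOPNOTSUPP is a guess here):

	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
		return -EOPNOTSUPP;
	if (!cmd->veventq_depth || cmd->__reserved)
		return -EINVAL;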
Thanks
Nicolin
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, January 25, 2025 8:31 AM
> +
> +/*
> + * An iommufd_veventq object represents an interface to deliver vIOMMU events to
> + * the user space. It is created/destroyed by the user space and associated with
> + * vIOMMU object(s) during the allocations.
s/object(s)/object/, given the eventq cannot be shared between vIOMMUs.
> +static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq,
> + struct iommufd_vevent *vevent)
> +{
> + struct iommufd_eventq *eventq = &veventq->common;
> +
> + /*
> + * Remove the overflow node and add the new node at the same time. Note
> + * it is possible that vevent == &veventq->overflow for sequence update
> + */
> + spin_lock(&eventq->lock);
> + if (veventq->overflow.on_list) {
> + list_del(&veventq->overflow.node);
> + veventq->overflow.on_list = false;
> + }
We can save one field 'on_list' in every entry by:
if (list_is_last(&veventq->overflow.node, &eventq->deliver))
list_del(&veventq->overflow.node);
> +
> +/**
> + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
> + * @flags: Combination of enum iommu_veventq_flag
> + * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of
> + * [0, INT_MAX] where the following index of INT_MAX is 0
> + * @__reserved: Must be 0
> + *
> + * Each iommufd_vevent_header reports a sequence index of the following vEVENT:
> + * ---------------------------------------------------------------------------
> + * || header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN ||
> + * ---------------------------------------------------------------------------
> + * And this sequence index is expected to be monotonic to the sequence index of
> + * the previous vEVENT. If two adjacent sequence indexes have a delta larger than
> + * 1, it indicates that an overflow occurred in the vEVENTQ and that delta - 1
> + * vEVENTs were lost due to the overflow (e.g. two lost vEVENTs):
> + * ---------------------------------------------------------------------------
> + * || ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... ||
> + * ---------------------------------------------------------------------------
> + * If an overflow occurred to the tail of the vEVENTQ and there is no following
> + * vEVENT providing the next sequence index, a special overflow header would be
> + * added to the tail of the vEVENTQ, where there would be no more type-specific
> + * data following the vEVENTQ:
> + * ---------------------------------------------------------------------------
> + * ||...| header3 {sequence=3} | data3 | header4 {flags=OVERFLOW, sequence=4} ||
> + * ---------------------------------------------------------------------------
> + */
> +struct iommufd_vevent_header {
> + __aligned_u64 flags;
> + __u32 sequence;
> + __u32 __reserved;
> +};
Is there a reason that flags must be u64? At a glance all flags fields
(except the one in iommu_hwpt_vtd_s1) in iommufd uAPIs are u32
which can cut the size of the header by half...
> +void iommufd_veventq_abort(struct iommufd_object *obj)
> +{
> + struct iommufd_eventq *eventq =
> + container_of(obj, struct iommufd_eventq, obj);
> + struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
> + struct iommufd_viommu *viommu = veventq->viommu;
> + struct iommufd_vevent *cur, *next;
> +
> + lockdep_assert_held_write(&viommu->veventqs_rwsem);
> +
> + list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
> + list_del(&cur->node);
> + kfree(cur);
kfree() doesn't apply to the overflow node.
otherwise it looks good to me:
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
On Tue, Feb 18, 2025 at 05:13:47AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Saturday, January 25, 2025 8:31 AM
> > +
> > +/*
> > + * An iommufd_veventq object represents an interface to deliver vIOMMU events to
> > + * the user space. It is created/destroyed by the user space and associated with
> > + * vIOMMU object(s) during the allocations.
>
> s/object(s)/object/, given the eventq cannot be shared between vIOMMUs.
Done. Adding an "a" too.
> > +static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq,
> > + struct iommufd_vevent *vevent)
> > +{
> > + struct iommufd_eventq *eventq = &veventq->common;
> > +
> > + /*
> > + * Remove the overflow node and add the new node at the same time. Note
> > + * it is possible that vevent == &veventq->overflow for sequence update
> > + */
> > + spin_lock(&eventq->lock);
> > + if (veventq->overflow.on_list) {
> > + list_del(&veventq->overflow.node);
> > + veventq->overflow.on_list = false;
> > + }
>
> We can save one field 'on_list' in every entry by:
>
> if (list_is_last(&veventq->overflow.node, &eventq->deliver))
> list_del(&veventq->overflow.node);
Hmm. Given that the overflow node, when it is on the list, should
always be the last one... yes!
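Pulling that together with the plain-integer sequence agreed earlier in the
thread, the handler could end up roughly as below; a sketch only, assuming
veventq->sequence becomes a plain unsigned int and 'on_list' is removed:

static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq,
					  struct iommufd_vevent *vevent)
{
	struct iommufd_eventq *eventq = &veventq->common;

	/*
	 * Remove the overflow node and add the new node at the same time. Note
	 * it is possible that vevent == &veventq->overflow for sequence update
	 */
	spin_lock(&eventq->lock);
	if (list_is_last(&veventq->overflow.node, &eventq->deliver))
		list_del(&veventq->overflow.node);
	list_add_tail(&vevent->node, &eventq->deliver);
	vevent->header.sequence = veventq->sequence;
	veventq->sequence = (veventq->sequence + 1) & INT_MAX;
	spin_unlock(&eventq->lock);

	wake_up_interruptible(&eventq->wait_queue);
}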
> > +struct iommufd_vevent_header {
> > + __aligned_u64 flags;
> > + __u32 sequence;
> > + __u32 __reserved;
> > +};
>
> Is there a reason that flags must be u64? At a glance all flags fields
> (except the one in iommu_hwpt_vtd_s1) in iommufd uAPIs are u32
> which can cut the size of the header by half...
No particular reason yet. I just thought that a 64-bit field could
make the uAPI more expandable. It's true that u32 would be cleaner.
I will make the change.
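The reworked header would then shrink from 16 to 8 bytes; a sketch of the
planned layout, assuming the header's __reserved padding is dropped along
with the flag-width change:

struct iommufd_vevent_header {
	__u32 flags;		/* combination of enum iommu_veventq_flag */
	__u32 sequence;		/* [0, INT_MAX]; wraps back to 0 after INT_MAX */
};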
>
> > +void iommufd_veventq_abort(struct iommufd_object *obj)
> > +{
> > + struct iommufd_eventq *eventq =
> > + container_of(obj, struct iommufd_eventq, obj);
> > + struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
> > + struct iommufd_viommu *viommu = veventq->viommu;
> > + struct iommufd_vevent *cur, *next;
> > +
> > + lockdep_assert_held_write(&viommu->veventqs_rwsem);
> > +
> > + list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
> > + list_del(&cur->node);
> > + kfree(cur);
>
> kfree() doesn't apply to the overflow node.
Oh right, that's missed.
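A sketch of the fixed cleanup loop in iommufd_veventq_abort(), skipping the
embedded overflow node:

	list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
		list_del(&cur->node);
		if (cur != &veventq->overflow)	/* embedded in the veventq, not kmalloc'ed */
			kfree(cur);
	}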
> otherwise it looks good to me:
>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Thanks!
Nicolin