[PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC

Posted by Nicolin Chen 11 months, 2 weeks ago
Introduce a new IOMMUFD_OBJ_VEVENTQ object for the vIOMMU Event Queue, which
provides user space (VMM) another FD to read vIOMMU events.

Allow a vIOMMU object to allocate vEVENTQs, with the restriction that each
vIOMMU can only have a single vEVENTQ per type.

Add iommufd_veventq_alloc() with iommufd_veventq_ops for the new ioctl.

Also add a supports_veventq viommu op for drivers to help the core code
validate the input vEVENTQ type.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  58 ++++++++++
 include/linux/iommufd.h                 |   5 +
 include/uapi/linux/iommufd.h            |  31 ++++++
 drivers/iommu/iommufd/eventq.c          | 137 ++++++++++++++++++++++++
 drivers/iommu/iommufd/main.c            |   7 ++
 drivers/iommu/iommufd/viommu.c          |   2 +
 6 files changed, 240 insertions(+)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index dfbc5cfbd164..3c0374154a94 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -547,6 +547,50 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+/*
+ * An iommufd_veventq object represents an interface to deliver vIOMMU events
+ * to user space. It is created and destroyed by user space, and gets
+ * associated with a vIOMMU object during allocation.
+ */
+struct iommufd_veventq {
+	struct iommufd_eventq common;
+	struct iommufd_viommu *viommu;
+	struct list_head node; /* for iommufd_viommu::veventqs */
+
+	unsigned int type;
+};
+
+static inline struct iommufd_veventq *
+eventq_to_veventq(struct iommufd_eventq *eventq)
+{
+	return container_of(eventq, struct iommufd_veventq, common);
+}
+
+static inline struct iommufd_veventq *
+iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_VEVENTQ),
+			    struct iommufd_veventq, common.obj);
+}
+
+int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_veventq_destroy(struct iommufd_object *obj);
+void iommufd_veventq_abort(struct iommufd_object *obj);
+
+/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */
+struct iommufd_vevent {
+	struct list_head node; /* for iommufd_eventq::deliver */
+	ssize_t data_len;
+	u64 event_data[] __counted_by(data_len);
+};
+
+static inline int iommufd_vevent_handler(struct iommufd_veventq *veventq,
+					 struct iommufd_vevent *vevent)
+{
+	return iommufd_eventq_notify(&veventq->common, &vevent->node);
+}
+
 static inline struct iommufd_viommu *
 iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
 {
@@ -555,6 +599,20 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
 			    struct iommufd_viommu, obj);
 }
 
+static inline struct iommufd_veventq *
+iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type)
+{
+	struct iommufd_veventq *veventq;
+
+	lockdep_assert_held(&viommu->veventqs_rwsem);
+
+	list_for_each_entry(veventq, &viommu->veventqs, node) {
+		if (veventq->type == type)
+			return veventq;
+	}
+	return NULL;
+}
+
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 11110c749200..941f2ed29914 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -34,6 +34,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_FAULT,
 	IOMMUFD_OBJ_VIOMMU,
 	IOMMUFD_OBJ_VDEVICE,
+	IOMMUFD_OBJ_VEVENTQ,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -93,6 +94,8 @@ struct iommufd_viommu {
 	const struct iommufd_viommu_ops *ops;
 
 	struct xarray vdevs;
+	struct list_head veventqs;
+	struct rw_semaphore veventqs_rwsem;
 
 	unsigned int type;
 };
@@ -113,6 +116,7 @@ struct iommufd_viommu {
  *                    array->entry_num to report the number of handled requests.
  *                    The data structure of the array entry must be defined in
  *                    include/uapi/linux/iommufd.h
+ * @supports_veventq: Whether the vIOMMU supports a given vEVENTQ type
  */
 struct iommufd_viommu_ops {
 	void (*destroy)(struct iommufd_viommu *viommu);
@@ -121,6 +125,7 @@ struct iommufd_viommu_ops {
 		const struct iommu_user_data *user_data);
 	int (*cache_invalidate)(struct iommufd_viommu *viommu,
 				struct iommu_user_data_array *array);
+	bool (*supports_veventq)(unsigned int type);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 34810f6ae2b5..0a08aa82e7cc 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -55,6 +55,7 @@ enum {
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
 	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
+	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
 };
 
 /**
@@ -1012,4 +1013,34 @@ struct iommu_ioas_change_process {
 #define IOMMU_IOAS_CHANGE_PROCESS \
 	_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
 
+/**
+ * enum iommu_veventq_type - Virtual Event Queue Type
+ * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_veventq_type {
+	IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
+ * @size: sizeof(struct iommu_veventq_alloc)
+ * @flags: Must be 0
+ * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with
+ * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type
+ * @out_veventq_id: The ID of the new vEVENTQ
+ * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the
+ *                  successfully returned fd after using it
+ *
+ * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
+ * can have multiple FDs for different types, but is confined to one per @type.
+ */
+struct iommu_veventq_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 out_veventq_id;
+	__u32 out_veventq_fd;
+};
+#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c
index e386b6c3e6ab..b5be629f38ed 100644
--- a/drivers/iommu/iommufd/eventq.c
+++ b/drivers/iommu/iommufd/eventq.c
@@ -346,6 +346,73 @@ static const struct iommufd_eventq_ops iommufd_fault_ops = {
 	.write = &iommufd_fault_fops_write,
 };
 
+/* IOMMUFD_OBJ_VEVENTQ Functions */
+
+void iommufd_veventq_abort(struct iommufd_object *obj)
+{
+	struct iommufd_eventq *eventq =
+		container_of(obj, struct iommufd_eventq, obj);
+	struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+	struct iommufd_viommu *viommu = veventq->viommu;
+	struct iommufd_vevent *cur, *next;
+
+	lockdep_assert_held_write(&viommu->veventqs_rwsem);
+
+	list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
+		list_del(&cur->node);
+		kfree(cur);
+	}
+
+	refcount_dec(&viommu->obj.users);
+	mutex_destroy(&eventq->mutex);
+	list_del(&veventq->node);
+}
+
+void iommufd_veventq_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_veventq *veventq = eventq_to_veventq(
+		container_of(obj, struct iommufd_eventq, obj));
+
+	down_write(&veventq->viommu->veventqs_rwsem);
+	iommufd_veventq_abort(obj);
+	up_write(&veventq->viommu->veventqs_rwsem);
+}
+
+static ssize_t iommufd_veventq_fops_read(struct iommufd_eventq *eventq,
+					 char __user *buf, size_t count,
+					 loff_t *ppos)
+{
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos)
+		return -ESPIPE;
+
+	mutex_lock(&eventq->mutex);
+	while (!list_empty(&eventq->deliver) && count > done) {
+		struct iommufd_vevent *cur = list_first_entry(
+			&eventq->deliver, struct iommufd_vevent, node);
+
+		if (cur->data_len > count - done)
+			break;
+
+		if (copy_to_user(buf + done, cur->event_data, cur->data_len)) {
+			rc = -EFAULT;
+			break;
+		}
+		done += cur->data_len;
+		list_del(&cur->node);
+		kfree(cur);
+	}
+	mutex_unlock(&eventq->mutex);
+
+	return done == 0 ? rc : done;
+}
+
+static const struct iommufd_eventq_ops iommufd_veventq_ops = {
+	.read = &iommufd_veventq_fops_read,
+};
+
 /* Common Event Queue Functions */
 
 static ssize_t iommufd_eventq_fops_read(struct file *filep, char __user *buf,
@@ -473,3 +540,73 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
 
 	return rc;
 }
+
+int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_veventq_alloc *cmd = ucmd->cmd;
+	struct iommufd_veventq *veventq;
+	struct iommufd_viommu *viommu;
+	int fdno;
+	int rc;
+
+	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
+		return -EOPNOTSUPP;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	if (!viommu->ops || !viommu->ops->supports_veventq ||
+	    !viommu->ops->supports_veventq(cmd->type)) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	down_write(&viommu->veventqs_rwsem);
+
+	if (iommufd_viommu_find_veventq(viommu, cmd->type)) {
+		rc = -EEXIST;
+		goto out_unlock_veventqs;
+	}
+
+	veventq = __iommufd_object_alloc(ucmd->ictx, veventq,
+					 IOMMUFD_OBJ_VEVENTQ, common.obj);
+	if (IS_ERR(veventq)) {
+		rc = PTR_ERR(veventq);
+		goto out_unlock_veventqs;
+	}
+
+	veventq->type = cmd->type;
+	veventq->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+	list_add_tail(&veventq->node, &viommu->veventqs);
+
+	fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]",
+				   ucmd->ictx, &iommufd_veventq_ops);
+	if (fdno < 0) {
+		rc = fdno;
+		goto out_abort;
+	}
+
+	cmd->out_veventq_id = veventq->common.obj.id;
+	cmd->out_veventq_fd = fdno;
+
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_put_fdno;
+
+	iommufd_object_finalize(ucmd->ictx, &veventq->common.obj);
+	fd_install(fdno, veventq->common.filep);
+	goto out_unlock_veventqs;
+
+out_put_fdno:
+	put_unused_fd(fdno);
+	fput(veventq->common.filep);
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj);
+out_unlock_veventqs:
+	up_write(&viommu->veventqs_rwsem);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index a11e9cfd790f..0d451601fb9a 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -308,6 +308,7 @@ union ucmd_buffer {
 	struct iommu_ioas_unmap unmap;
 	struct iommu_option option;
 	struct iommu_vdevice_alloc vdev;
+	struct iommu_veventq_alloc veventq;
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
 #ifdef CONFIG_IOMMUFD_TEST
@@ -363,6 +364,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64),
 	IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
 		 struct iommu_vdevice_alloc, virt_id),
+	IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc,
+		 struct iommu_veventq_alloc, out_veventq_fd),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
 	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
@@ -505,6 +508,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_VDEVICE] = {
 		.destroy = iommufd_vdevice_destroy,
 	},
+	[IOMMUFD_OBJ_VEVENTQ] = {
+		.destroy = iommufd_veventq_destroy,
+		.abort = iommufd_veventq_abort,
+	},
 	[IOMMUFD_OBJ_VIOMMU] = {
 		.destroy = iommufd_viommu_destroy,
 	},
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 69b88e8c7c26..01df2b985f02 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -59,6 +59,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	viommu->ictx = ucmd->ictx;
 	viommu->hwpt = hwpt_paging;
 	refcount_inc(&viommu->hwpt->common.obj.users);
+	INIT_LIST_HEAD(&viommu->veventqs);
+	init_rwsem(&viommu->veventqs_rwsem);
 	/*
 	 * It is the most likely case that a physical IOMMU is unpluggable. A
 	 * pluggable IOMMU instance (if exists) is responsible for refcounting
-- 
2.43.0
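
For reference, the user space side of the flow described above could look
roughly like the sketch below, based only on the UAPI additions in this
patch. The veventq_alloc_and_read() helper name, the buffer size, and the
caller-provided iommufd/viommu_id/type values are illustrative assumptions,
not part of the series:

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Hypothetical helper: "iommufd" is an open /dev/iommu FD and "viommu_id"
 * a previously allocated vIOMMU object ID. "type" must be a type that the
 * driver supports (IOMMU_VEVENTQ_TYPE_DEFAULT is reserved and rejected).
 */
static int veventq_alloc_and_read(int iommufd, __u32 viommu_id, __u32 type)
{
	struct iommu_veventq_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		.type = type,
	};
	char buf[4096];
	ssize_t len;

	if (ioctl(iommufd, IOMMU_VEVENTQ_ALLOC, &cmd))
		return -1;

	/* Each read() returns whole events only, up to the buffer size */
	len = read(cmd.out_veventq_fd, buf, sizeof(buf));
	if (len > 0)
		printf("read %zd bytes of vIOMMU events\n", len);

	/* User space must close the returned FD after using it */
	close(cmd.out_veventq_fd);
	return 0;
}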
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Jason Gunthorpe 11 months, 1 week ago
On Tue, Jan 07, 2025 at 09:10:09AM -0800, Nicolin Chen wrote:

> +static ssize_t iommufd_veventq_fops_read(struct iommufd_eventq *eventq,
> +					 char __user *buf, size_t count,
> +					 loff_t *ppos)
> +{
> +	size_t done = 0;
> +	int rc = 0;
> +
> +	if (*ppos)
> +		return -ESPIPE;
> +
> +	mutex_lock(&eventq->mutex);
> +	while (!list_empty(&eventq->deliver) && count > done) {
> +		struct iommufd_vevent *cur = list_first_entry(
> +			&eventq->deliver, struct iommufd_vevent, node);
> +
> +		if (cur->data_len > count - done)
> +			break;
> +
> +		if (copy_to_user(buf + done, cur->event_data, cur->data_len)) {
> +			rc = -EFAULT;
> +			break;
> +		}

Now that I look at this more closely, the fault path this is copied
from is not great.

This copy_to_user() can block while waiting on a page fault, possibly
for a long time. While blocked, the mutex is held and we can't add more
entries to the list.

That will cause the shared IRQ handler in the iommu driver to back up,
which would cause a global DOS.

This probably wants to be organized to look more like

while (itm = eventq_get_next_item(eventq)) {
   if (..) {
       eventq_restore_failed_item(eventq);
       return -1;
   }
}

Where the next_item would just be a simple spinlock across the linked
list manipulation.

Jason
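
For concreteness, one possible shape of this rework, as a sketch only: the
helper names follow the pseudocode above, a spinlock_t "lock" is assumed to
be added to struct iommufd_eventq (initialized in iommufd_eventq_init()),
and the report path is assumed to run in process or threaded-IRQ context
(otherwise the _irqsave variants would be needed):

static struct iommufd_vevent *
eventq_get_next_item(struct iommufd_eventq *eventq)
{
	struct iommufd_vevent *cur = NULL;

	/* Only the list manipulation is serialized against the report path */
	spin_lock(&eventq->lock);
	if (!list_empty(&eventq->deliver)) {
		cur = list_first_entry(&eventq->deliver,
				       struct iommufd_vevent, node);
		list_del(&cur->node);
	}
	spin_unlock(&eventq->lock);
	return cur;
}

static void eventq_restore_failed_item(struct iommufd_eventq *eventq,
				       struct iommufd_vevent *cur)
{
	/* Put the unconsumed event back at the head of the queue */
	spin_lock(&eventq->lock);
	list_add(&cur->node, &eventq->deliver);
	spin_unlock(&eventq->lock);
}

The read loop (reusing the locals of the current fops_read) then never
holds a lock across copy_to_user():

	while ((cur = eventq_get_next_item(eventq))) {
		if (cur->data_len > count - done) {
			eventq_restore_failed_item(eventq, cur);
			break;
		}
		if (copy_to_user(buf + done, cur->event_data,
				 cur->data_len)) {
			eventq_restore_failed_item(eventq, cur);
			rc = -EFAULT;
			break;
		}
		done += cur->data_len;
		kfree(cur);
	}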
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Nicolin Chen 11 months, 1 week ago
On Fri, Jan 10, 2025 at 01:48:42PM -0400, Jason Gunthorpe wrote:
> On Tue, Jan 07, 2025 at 09:10:09AM -0800, Nicolin Chen wrote:
> 
> > +static ssize_t iommufd_veventq_fops_read(struct iommufd_eventq *eventq,
> > +					 char __user *buf, size_t count,
> > +					 loff_t *ppos)
> > +{
> > +	size_t done = 0;
> > +	int rc = 0;
> > +
> > +	if (*ppos)
> > +		return -ESPIPE;
> > +
> > +	mutex_lock(&eventq->mutex);
> > +	while (!list_empty(&eventq->deliver) && count > done) {
> > +		struct iommufd_vevent *cur = list_first_entry(
> > +			&eventq->deliver, struct iommufd_vevent, node);
> > +
> > +		if (cur->data_len > count - done)
> > +			break;
> > +
> > +		if (copy_to_user(buf + done, cur->event_data, cur->data_len)) {
> > +			rc = -EFAULT;
> > +			break;
> > +		}
> 
> Now that I look at this more closely, the fault path this is copied
> from is not great.
> 
> This copy_to_user() can block while waiting on a page fault, possibly
> for a long time. While blocked, the mutex is held and we can't add more
> entries to the list.
>
> That will cause the shared IRQ handler in the iommu driver to back up,
> which would cause a global DOS.
>
> This probably wants to be organized to look more like
> 
> while (itm = eventq_get_next_item(eventq)) {
>    if (..) {
>        eventq_restore_failed_item(eventq);
>        return -1;
>    }
> }
> 
> Where the next_item would just be a simple spinlock across the linked
> list manipulation.

Would it be simpler to just limit one node per read(), i.e.
no "while (!list_empty)" and no blocking?

The report() adds one node at a time, and wakes up the poll()
each time a node is added. And user space could read one event
at a time too?

Thanks
Nicolin
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Jason Gunthorpe 11 months, 1 week ago
On Fri, Jan 10, 2025 at 11:27:53AM -0800, Nicolin Chen wrote:
> On Fri, Jan 10, 2025 at 01:48:42PM -0400, Jason Gunthorpe wrote:
> > On Tue, Jan 07, 2025 at 09:10:09AM -0800, Nicolin Chen wrote:
> > 
> > > +static ssize_t iommufd_veventq_fops_read(struct iommufd_eventq *eventq,
> > > +					 char __user *buf, size_t count,
> > > +					 loff_t *ppos)
> > > +{
> > > +	size_t done = 0;
> > > +	int rc = 0;
> > > +
> > > +	if (*ppos)
> > > +		return -ESPIPE;
> > > +
> > > +	mutex_lock(&eventq->mutex);
> > > +	while (!list_empty(&eventq->deliver) && count > done) {
> > > +		struct iommufd_vevent *cur = list_first_entry(
> > > +			&eventq->deliver, struct iommufd_vevent, node);
> > > +
> > > +		if (cur->data_len > count - done)
> > > +			break;
> > > +
> > > +		if (copy_to_user(buf + done, cur->event_data, cur->data_len)) {
> > > +			rc = -EFAULT;
> > > +			break;
> > > +		}
> > 
> > Now that I look at this more closely, the fault path this is copied
> > from is not great.
> > 
> > This copy_to_user() can block while waiting on a page fault, possibly
> > for a long time. While blocked, the mutex is held and we can't add more
> > entries to the list.
> >
> > That will cause the shared IRQ handler in the iommu driver to back up,
> > which would cause a global DOS.
> >
> > This probably wants to be organized to look more like
> > 
> > while (itm = eventq_get_next_item(eventq)) {
> >    if (..) {
> >        eventq_restore_failed_item(eventq);
> >        return -1;
> >    }
> > }
> > 
> > Where the next_item would just be a simple spinlock across the linked
> > list manipulation.
> 
> Would it be simpler to just limit one node per read(), i.e.
> no "while (!list_empty)" and no blocking?
> 
> The report() adds one node at a time, and wakes up the poll()
> each time a node is added. And user space could read one event
> at a time too?

That doesn't really help, the issue is it holds the lock over the
copy_to_user(), which it is doing because it doesn't want to pull the item off
the list and then try to handle the failure and put it back.

Jason
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Nicolin Chen 11 months, 1 week ago
On Fri, Jan 10, 2025 at 03:49:50PM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 10, 2025 at 11:27:53AM -0800, Nicolin Chen wrote:
> > On Fri, Jan 10, 2025 at 01:48:42PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Jan 07, 2025 at 09:10:09AM -0800, Nicolin Chen wrote:
> > > 
> > > > +static ssize_t iommufd_veventq_fops_read(struct iommufd_eventq *eventq,
> > > > +					 char __user *buf, size_t count,
> > > > +					 loff_t *ppos)
> > > > +{
> > > > +	size_t done = 0;
> > > > +	int rc = 0;
> > > > +
> > > > +	if (*ppos)
> > > > +		return -ESPIPE;
> > > > +
> > > > +	mutex_lock(&eventq->mutex);
> > > > +	while (!list_empty(&eventq->deliver) && count > done) {
> > > > +		struct iommufd_vevent *cur = list_first_entry(
> > > > +			&eventq->deliver, struct iommufd_vevent, node);
> > > > +
> > > > +		if (cur->data_len > count - done)
> > > > +			break;
> > > > +
> > > > +		if (copy_to_user(buf + done, cur->event_data, cur->data_len)) {
> > > > +			rc = -EFAULT;
> > > > +			break;
> > > > +		}
> > > 
> > > Now that I look at this more closely, the fault path this is copied
> > > from is not great.
> > > 
> > > This copy_to_user() can block while waiting on a page fault, possibly
> > > for a long time. While blocked, the mutex is held and we can't add more
> > > entries to the list.
> > >
> > > That will cause the shared IRQ handler in the iommu driver to back up,
> > > which would cause a global DOS.
> > >
> > > This probably wants to be organized to look more like
> > > 
> > > while (itm = eventq_get_next_item(eventq)) {
> > >    if (..) {
> > >        eventq_restore_failed_item(eventq);
> > >        return -1;
> > >    }
> > > }
> > > 
> > > Where the next_item would just be a simple spinlock across the linked
> > > list manipulation.
> > 
> > Would it be simpler to just limit one node per read(), i.e.
> > no "while (!list_empty)" and no blocking?
> > 
> > The report() adds one node at a time, and wakes up the poll()
> > each time a node is added. And user space could read one event
> > at a time too?
> 
> That doesn't really help, the issue is it holds the lock over the
> copy_to_user(), which it is doing because it doesn't want to pull the item off
> the list and then try to handle the failure and put it back.

Hmm, it seems that I haven't got your first narrative straight..

Would you mind elaborating on "copy_to_user() can block while waiting
on a page fault"? When would this happen?

Thanks
Nicolin
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Jason Gunthorpe 11 months, 1 week ago
On Fri, Jan 10, 2025 at 01:58:21PM -0800, Nicolin Chen wrote:
> Hmm, it seems that I haven't got your first narrative straight..
> 
> Would you mind elaborating on "copy_to_user() can block while waiting
> on a page fault"? When would this happen?

copy_to_user() is a sleeping function that sleeps if the user memory
is non-present. So userspace can cause copy_to_user to copy to
anything, including memory that is non-present and will take a long
time to page fault in. Eg perhaps by abusing userfaultfd.

We should not allow userspace to globally DOS the iommu driver this
way.

So do not hold locks that are also held by the HW event path across
copy_to_user().

Jason
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Nicolin Chen 11 months, 1 week ago
On Mon, Jan 13, 2025 at 03:12:25PM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 10, 2025 at 01:58:21PM -0800, Nicolin Chen wrote:
> > Hmm, it seems that I haven't got your first narrative straight..
> > 
> > Would you mind elaborating on "copy_to_user() can block while waiting
> > on a page fault"? When would this happen?
> 
> copy_to_user() is a sleeping function that sleeps if the user memory
> is non-present. So userspace can cause copy_to_user to copy to
> anything, including memory that is non-present and will take a long
> time to page fault in. Eg perhaps by abusing userfaultfd.
> 
> We should not allow userspace to globally DOS the iommu driver this
> way.
> 
> So do not hold locks that are also held by the HW event path across
> copy_to_user().

I see. Thanks for explaining. I will add a patch fixing the fault
read() and change the veventq read() accordingly.

Nicolin
RE: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Tian, Kevin 11 months, 1 week ago
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, January 8, 2025 1:10 AM
> +
> +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_veventq_alloc *cmd = ucmd->cmd;
> +	struct iommufd_veventq *veventq;
> +	struct iommufd_viommu *viommu;
> +	int fdno;
> +	int rc;
> +
> +	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> +		return -EOPNOTSUPP;
> +
> +	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> +	if (IS_ERR(viommu))
> +		return PTR_ERR(viommu);
> +
> +	if (!viommu->ops || !viommu->ops->supports_veventq ||
> +	    !viommu->ops->supports_veventq(cmd->type)) {
> +		rc = -EOPNOTSUPP;
> +		goto out_put_viommu;
> +	}
> +

I'm not sure about the necessity of the above check. The event queue
is just a software struct with a user-specified format for the iommu
driver to report viommu events. The struct itself is not constrained
by the hardware capability, though I'm not sure of a real usage in
which an smmu driver would want to report a vtd event. But legitimately
a user can create any type of event queue which might just never
be used.

It sounds clearer to do the check when IOPF cap is actually enabled
on a device contained in the viommu. At that point check whether 
a required type eventqueue has been created. If not then fail the
iopf enabling.

Then it probably reveals another todo in this series. Seems you still
let the smmu driver statically enable iopf when probing the device.
Sounds like iommufd_viommu_alloc_hwpt_nested() may accept
IOMMU_HWPT_FAULT_ID_VALID to refer to an event queue and
later dynamically enable/disable iopf when attaching a device to the
hwpt and check the event queue type there. Just like how the fault
object is handled.
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Nicolin Chen 11 months, 1 week ago
On Fri, Jan 10, 2025 at 07:06:49AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, January 8, 2025 1:10 AM
> > +
> > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > +{
> > +	struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > +	struct iommufd_veventq *veventq;
> > +	struct iommufd_viommu *viommu;
> > +	int fdno;
> > +	int rc;
> > +
> > +	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > +		return -EOPNOTSUPP;
> > +
> > +	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> > +	if (IS_ERR(viommu))
> > +		return PTR_ERR(viommu);
> > +
> > +	if (!viommu->ops || !viommu->ops->supports_veventq ||
> > +	    !viommu->ops->supports_veventq(cmd->type)) {
> > +		rc = -EOPNOTSUPP;
> > +		goto out_put_viommu;
> > +	}
> > +
> 
> I'm not sure about the necessity of the above check. The event queue
> is just a software struct with a user-specified format for the iommu
> driver to report viommu events. The struct itself is not constrained
> by the hardware capability, though I'm not sure of a real usage in
> which an smmu driver would want to report a vtd event. But legitimately
> a user can create any type of event queue which might just never
> be used.

Allowing a random type that a driver will never use for reporting
doesn't seem to make a lot of sense to me...

That being said, yea..I guess we could drop the limit here, since
it isn't going to break anything?

> It sounds clearer to do the check when IOPF cap is actually enabled
> on a device contained in the viommu. At that point check whether 
> a required type eventqueue has been created. If not then fail the
> iopf enabling.

Hmm, isn't IOPF a different channel?

And a vEVENTQ is per vIOMMU, not necessarily per vDEVICE/device..

> Then it probably reveals another todo in this series. Seems you still
> let the smmu driver statically enable iopf when probing the device.
> Sounds like iommufd_viommu_alloc_hwpt_nested() may accept
> IOMMU_HWPT_FAULT_ID_VALID to refer to an event queue and
> later dynamically enable/disable iopf when attaching a device to the
> hwpt and check the event queue type there. Just like how the fault
> object is handled.

You've lost me here :-/

Thanks
Nicolin
RE: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Tian, Kevin 11 months, 1 week ago
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, January 11, 2025 5:29 AM
> 
> On Fri, Jan 10, 2025 at 07:06:49AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Wednesday, January 8, 2025 1:10 AM
> > > +
> > > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > > +{
> > > +	struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > > +	struct iommufd_veventq *veventq;
> > > +	struct iommufd_viommu *viommu;
> > > +	int fdno;
> > > +	int rc;
> > > +
> > > +	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> > > +	if (IS_ERR(viommu))
> > > +		return PTR_ERR(viommu);
> > > +
> > > +	if (!viommu->ops || !viommu->ops->supports_veventq ||
> > > +	    !viommu->ops->supports_veventq(cmd->type)) {
> > > +		rc = -EOPNOTSUPP;
> > > +		goto out_put_viommu;
> > > +	}
> > > +
> >
> > I'm not sure about the necessity of the above check. The event queue
> > is just a software struct with a user-specified format for the iommu
> > driver to report viommu events. The struct itself is not constrained
> > by the hardware capability, though I'm not sure of a real usage in
> > which an smmu driver would want to report a vtd event. But legitimately
> > a user can create any type of event queue which might just never
> > be used.
> 
> Allowing a random type that a driver will never use for reporting
> doesn't seem to make a lot of sense to me...
> 
> That being said, yea..I guess we could drop the limit here, since
> it isn't going to break anything?
> 
> > It sounds clearer to do the check when IOPF cap is actually enabled
> > on a device contained in the viommu. At that point check whether
> > a required type eventqueue has been created. If not then fail the
> > iopf enabling.
> 
> Hmm, isn't IOPF a different channel?

We have a fault queue for delivering IOPF on hwpt, when vIOMMU is
not involved.

Now with vIOMMU my understanding was that all events including
IOPF are delivered via the event queue in the vIOMMU. Just echoed
by the documentation patch:

+- IOMMUFD_OBJ_VEVENTQ, representing a software queue for a vIOMMU to report its
+  events such as translation faults occurred to a nested stage-1 and HW-specific
+  events.

> 
> And a vEVENTQ is per vIOMMU, not necessarily per vDEVICE/device..

Yes. My point was to verify whether the vEVENTQ type is compatible when
a nested faultable hwpt is created with vIOMMU as the parent. Then when
attaching a device to the nested hwpt we dynamically turn on PRI on the
device just like how it's handled in the fault queue path.

> 
> > Then it probably reveals another todo in this series. Seems you still
> > let the smmu driver statically enable iopf when probing the device.
> > Sounds like iommufd_viommu_alloc_hwpt_nested() may accept
> > IOMMU_HWPT_FAULT_ID_VALID to refer to an event queue and
> > later dynamically enable/disable iopf when attaching a device to the
> > hwpt and check the event queue type there. Just like how the fault
> > object is handled.
> 
> You've lost me here :-/
> 

Hope the above explanation makes my point clearer. Then for a nested
hwpt created within a vIOMMU there is an open question whether we want
a per-hwpt option to mark whether it allows fault, or assume that
every nested hwpt (and the devices attached to it) must be faultable
once any vEVENTQ is created in the vIOMMU.
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Nicolin Chen 11 months, 1 week ago
On Mon, Jan 13, 2025 at 02:52:32AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Saturday, January 11, 2025 5:29 AM
> > 
> > On Fri, Jan 10, 2025 at 07:06:49AM +0000, Tian, Kevin wrote:
> > > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > > Sent: Wednesday, January 8, 2025 1:10 AM
> > > > +
> > > > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > > > +{
> > > > +	struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > > > +	struct iommufd_veventq *veventq;
> > > > +	struct iommufd_viommu *viommu;
> > > > +	int fdno;
> > > > +	int rc;
> > > > +
> > > > +	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> > > > +	if (IS_ERR(viommu))
> > > > +		return PTR_ERR(viommu);
> > > > +
> > > > +	if (!viommu->ops || !viommu->ops->supports_veventq ||
> > > > +	    !viommu->ops->supports_veventq(cmd->type)) {
> > > > +		rc = -EOPNOTSUPP;
> > > > +		goto out_put_viommu;
> > > > +	}
> > > > +
> > >
> > > I'm not sure about the necessity of the above check. The event queue
> > > is just a software struct with a user-specified format for the iommu
> > > driver to report viommu events. The struct itself is not constrained
> > > by the hardware capability, though I'm not sure of a real usage in
> > > which an smmu driver would want to report a vtd event. But legitimately
> > > a user can create any type of event queue which might just never
> > > be used.
> > 
> > Allowing a random type that a driver will never use for reporting
> > doesn't seem to make a lot of sense to me...
> > 
> > That being said, yea..I guess we could drop the limit here, since
> > it isn't going to break anything?
> > 
> > > It sounds clearer to do the check when IOPF cap is actually enabled
> > > on a device contained in the viommu. At that point check whether
> > > a required type eventqueue has been created. If not then fail the
> > > iopf enabling.
> > 
> > Hmm, isn't IOPF a different channel?
> 
> We have a fault queue for delivering IOPF on hwpt, when vIOMMU is
> not involved.
> 
> Now with vIOMMU my understanding was that all events including
> IOPF are delivered via the event queue in the vIOMMU. Just echoed
> by the documentation patch:
> 
> +- IOMMUFD_OBJ_VEVENTQ, representing a software queue for a vIOMMU to report its
> +  events such as translation faults occurred to a nested stage-1 and HW-specific
> +  events.

Oh, looks like that line misled you.. It should be a non-PRI type
of fault, e.g. a stage-1 DMA translation error should be forwarded
to the guest. I can make it clearer.

> > 
> > And a vEVENTQ is per vIOMMU, not necessarily per vDEVICE/device..
> 
> Yes. My point was to verify whether the vEVENTQ type is compatible when
> a nested faultable hwpt is created with vIOMMU as the parent. Then when
> attaching a device to the nested hwpt we dynamically turn on PRI on the
> device just like how it's handled in the fault queue path.

We will still have the fault queue:
	if (error is handled by PRI)
		report via fault queue; // need response
	else if (error is handled by vEVENTQ)
		report via vEVENTQ; // no need of response
	else
		dump unhandled faults;

> > > Then it probably reveals another todo in this series. Seems you still
> > > let the smmu driver statically enable iopf when probing the device.
> > > Sounds like iommufd_viommu_alloc_hwpt_nested() may accept
> > > IOMMU_HWPT_FAULT_ID_VALID to refer to an event queue and
> > > later dynamically enable/disable iopf when attaching a device to the
> > > hwpt and check the event queue type there. Just like how the fault
> > > object is handled.
> > 
> > You've lost me here :-/
> > 
> 
> Hope the above explanation makes my point clearer. Then for a nested
> hwpt created within a vIOMMU there is an open question whether we want
> a per-hwpt option to mark whether it allows fault, or assume that
> every nested hwpt (and the devices attached to it) must be faultable
> once any vEVENTQ is created in the vIOMMU.

A vIOMMU-based nested HWPT should still enable IOPF via the flag
IOMMU_HWPT_FAULT_ID_VALID.

Thanks
Nicolin
Re: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Jason Gunthorpe 11 months, 1 week ago
On Sun, Jan 12, 2025 at 08:51:25PM -0800, Nicolin Chen wrote:

> We will still have the fault queue:
> 	if (error is handled by PRI)
> 		report via fault queue; // need response

This is an important point: we can't generate a response for the PRI
unless the fault queue is used, so even though the vEVENTQ could carry
the PRI event, it must not be done. If there is no fault handler
available, the kernel must NAK the PRI event internally and never send it
to the vEVENTQ.

Jason
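
Combining this constraint with the dispatch pseudocode earlier in the
thread, the reporting path would behave roughly as below; every helper
name here is hypothetical and exists only for illustration:

static void iommufd_dispatch_event(struct device *dev,
				   struct iommu_fault *fault)
{
	if (fault->type == IOMMU_FAULT_PAGE_REQ) {
		/* PRI needs a response, so it may only use the fault queue */
		if (device_has_fault_queue(dev))
			report_via_fault_queue(dev, fault);
		else
			nak_pri_event(dev, fault); /* never to the vEVENTQ */
	} else if (viommu_has_veventq(dev)) {
		/* Non-PRI events need no response */
		report_via_veventq(dev, fault);
	} else {
		dev_dbg_ratelimited(dev, "unhandled IOMMU event\n");
	}
}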
RE: [PATCH v5 06/14] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC
Posted by Tian, Kevin 11 months, 1 week ago
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Monday, January 13, 2025 12:51 PM
> 
> On Mon, Jan 13, 2025 at 02:52:32AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Saturday, January 11, 2025 5:29 AM
> > >
> > > On Fri, Jan 10, 2025 at 07:06:49AM +0000, Tian, Kevin wrote:
> > > > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > > > Sent: Wednesday, January 8, 2025 1:10 AM
> > > > > +
> > > > > +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
> > > > > +{
> > > > > +	struct iommu_veventq_alloc *cmd = ucmd->cmd;
> > > > > +	struct iommufd_veventq *veventq;
> > > > > +	struct iommufd_viommu *viommu;
> > > > > +	int fdno;
> > > > > +	int rc;
> > > > > +
> > > > > +	if (cmd->flags || cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
> > > > > +		return -EOPNOTSUPP;
> > > > > +
> > > > > +	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> > > > > +	if (IS_ERR(viommu))
> > > > > +		return PTR_ERR(viommu);
> > > > > +
> > > > > +	if (!viommu->ops || !viommu->ops->supports_veventq ||
> > > > > +	    !viommu->ops->supports_veventq(cmd->type)) {
> > > > > +		rc = -EOPNOTSUPP;
> > > > > +		goto out_put_viommu;
> > > > > +	}
> > > > > +
> > > >
> > > > I'm not sure about the necessity of the above check. The event queue
> > > > is just a software struct with a user-specified format for the iommu
> > > > driver to report viommu events. The struct itself is not constrained
> > > > by the hardware capability, though I'm not sure of a real usage in
> > > > which an smmu driver would want to report a vtd event. But legitimately
> > > > a user can create any type of event queue which might just never
> > > > be used.
> > >
> > > Allowing a random type that a driver will never use for reporting
> > > doesn't seem to make a lot of sense to me...
> > >
> > > That being said, yea..I guess we could drop the limit here, since
> > > it isn't going to break anything?
> > >
> > > > It sounds clearer to do the check when IOPF cap is actually enabled
> > > > on a device contained in the viommu. At that point check whether
> > > > a required type eventqueue has been created. If not then fail the
> > > > iopf enabling.
> > >
> > > Hmm, isn't IOPF a different channel?
> >
> > We have a fault queue for delivering IOPF on hwpt, when vIOMMU is
> > not involved.
> >
> > Now with vIOMMU my understanding was that all events including
> > IOPF are delivered via the event queue in the vIOMMU. Just echoed
> > by the documentation patch:
> >
> > +- IOMMUFD_OBJ_VEVENTQ, representing a software queue for a vIOMMU to report its
> > +  events such as translation faults occurred to a nested stage-1 and HW-specific
> > +  events.
> 
> Oh, looks like that line misled you.. It should be a non-PRI type
> of fault, e.g. a stage-1 DMA translation error should be forwarded
> to the guest. I can make it clearer.
> 
> > >
> > > And a vEVENTQ is per vIOMMU, not necessarily per vDEVICE/device..
> >
> > Yes. My point was to verify whether the vEVENTQ type is compatible when
> > a nested faultable hwpt is created with vIOMMU as the parent. Then when
> > attaching a device to the nested hwpt we dynamically turn on PRI on the
> > device just like how it's handled in the fault queue path.
> 
> We will still have the fault queue:
> 	if (error is handled by PRI)
> 		report via fault queue; // need response
> 	else if (error is handled by vEVENTQ)
> 		report via vEVENTQ; // no need of response
> 	else
> 		dump unhandled faults;
> 
> > > > Then it probably reveals another todo in this series. Seems you still
> > > > let the smmu driver statically enable iopf when probing the device.
> > > > Sounds like iommufd_viommu_alloc_hwpt_nested() may accept
> > > > IOMMU_HWPT_FAULT_ID_VALID to refer to an event queue and
> > > > later dynamically enable/disable iopf when attaching a device to the
> > > > hwpt and check the event queue type there. Just like how the fault
> > > > object is handled.
> > >
> > > You've lost me here :-/
> > >
> >
> > Hope the above explanation makes my point clearer. Then for a nested
> > hwpt created within a vIOMMU there is an open question whether we want
> > a per-hwpt option to mark whether it allows fault, or assume that
> > every nested hwpt (and the devices attached to it) must be faultable
> > once any vEVENTQ is created in the vIOMMU.
> 
> A vIOMMU-based nested HWPT should still enable IOPF via the flag
> IOMMU_HWPT_FAULT_ID_VALID.
> 

Thanks for the clarification. Clearly I got it messed up. 😊