Add a new ioctl for user space to do a vIOMMU allocation. It must be based
on a nesting parent HWPT, so take its refcount.
IOMMU driver wanting to support vIOMMUs must define its IOMMU_VIOMMU_TYPE_
in the uAPI header and implement a viommu_alloc op in its iommu_ops.
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/iommufd/Makefile | 3 +-
drivers/iommu/iommufd/iommufd_private.h | 3 +
include/uapi/linux/iommufd.h | 40 +++++++++++
drivers/iommu/iommufd/main.c | 6 ++
drivers/iommu/iommufd/viommu.c | 89 +++++++++++++++++++++++++
5 files changed, 140 insertions(+), 1 deletion(-)
create mode 100644 drivers/iommu/iommufd/viommu.c
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 435124a8e1f1..7c207c5f1eb6 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -7,7 +7,8 @@ iommufd-y := \
ioas.o \
main.o \
pages.o \
- vfio_compat.o
+ vfio_compat.o \
+ viommu.o
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index d53c1ca75532..9adf8d616796 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -504,6 +504,9 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
}
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index cd4920886ad0..3d320d069654 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -51,6 +51,7 @@ enum {
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
+ IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f,
};
/**
@@ -852,4 +853,43 @@ struct iommu_fault_alloc {
__u32 out_fault_fd;
};
#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_viommu_type {
+ IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
+ * virtualization support that is a security-isolated slice of the real IOMMU HW
+ * that is unique to a specific VM. Operations global to the IOMMU are connected
+ * to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Non-affiliated event reporting (e.g. an invalidation queue error)
+ * - Virtualization of various platforms IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ */
+struct iommu_viommu_alloc {
+ __u32 size;
+ __u32 flags;
+ __u32 type;
+ __u32 dev_id;
+ __u32 hwpt_id;
+ __u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
#endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index e244fed1b7ab..ab5ee325d809 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -321,6 +321,7 @@ union ucmd_buffer {
struct iommu_ioas_unmap unmap;
struct iommu_option option;
struct iommu_vfio_ioas vfio_ioas;
+ struct iommu_viommu_alloc viommu;
#ifdef CONFIG_IOMMUFD_TEST
struct iommu_test_cmd test;
#endif
@@ -372,6 +373,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
val64),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
+ IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+ struct iommu_viommu_alloc, out_viommu_id),
#ifdef CONFIG_IOMMUFD_TEST
IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
@@ -507,6 +510,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_FAULT] = {
.destroy = iommufd_fault_destroy,
},
+ [IOMMUFD_OBJ_VIOMMU] = {
+ .destroy = iommufd_viommu_destroy,
+ },
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..eb41e15ebab1
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_viommu *viommu =
+ container_of(obj, struct iommufd_viommu, obj);
+
+ if (viommu->ops && viommu->ops->free)
+ viommu->ops->free(viommu);
+ refcount_dec(&viommu->hwpt->common.obj.users);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_viommu_alloc *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ const struct iommu_ops *ops;
+ int rc;
+
+ if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ ops = dev_iommu_ops(idev->dev);
+ if (!ops->viommu_alloc) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_put_idev;
+ }
+
+ if (!hwpt_paging->nest_parent) {
+ rc = -EINVAL;
+ goto out_put_hwpt;
+ }
+
+ viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain,
+ ucmd->ictx, cmd->type);
+ if (IS_ERR(viommu)) {
+ rc = PTR_ERR(viommu);
+ goto out_put_hwpt;
+ }
+
+ rc = iommufd_verify_unfinalized_object(ucmd->ictx, &viommu->obj);
+ if (rc) {
+ kfree(viommu);
+ goto out_put_hwpt;
+ }
+
+ viommu->type = cmd->type;
+ viommu->ictx = ucmd->ictx;
+ viommu->hwpt = hwpt_paging;
+ /*
+ * It is the most likely case that a physical IOMMU is unpluggable. A
+ * pluggable IOMMU instance (if exists) is responsible for refcounting
+ * on its own.
+ */
+ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
+
+ refcount_inc(&viommu->hwpt->common.obj.users);
+
+ cmd->out_viommu_id = viommu->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+ goto out_put_hwpt;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
+out_put_hwpt:
+ iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
--
2.43.0
On Fri, Oct 25, 2024 at 04:49:44PM -0700, Nicolin Chen wrote: > +void iommufd_viommu_destroy(struct iommufd_object *obj) > +{ > + struct iommufd_viommu *viommu = > + container_of(obj, struct iommufd_viommu, obj); > + > + if (viommu->ops && viommu->ops->free) > + viommu->ops->free(viommu); Ops can't be null and free can't be null, that would mean there is a memory leak. > + refcount_dec(&viommu->hwpt->common.obj.users); Don't touch viommu after freeing it Did you run selftests with kasn? > +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) > +{ > + struct iommu_viommu_alloc *cmd = ucmd->cmd; > + struct iommufd_hwpt_paging *hwpt_paging; > + struct iommufd_viommu *viommu; > + struct iommufd_device *idev; > + const struct iommu_ops *ops; > + int rc; > + > + if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) > + return -EOPNOTSUPP; > + > + idev = iommufd_get_device(ucmd, cmd->dev_id); > + if (IS_ERR(idev)) > + return PTR_ERR(idev); > + > + ops = dev_iommu_ops(idev->dev); > + if (!ops->viommu_alloc) { > + rc = -EOPNOTSUPP; > + goto out_put_idev; > + } > + > + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); > + if (IS_ERR(hwpt_paging)) { > + rc = PTR_ERR(hwpt_paging); > + goto out_put_idev; > + } > + > + if (!hwpt_paging->nest_parent) { > + rc = -EINVAL; > + goto out_put_hwpt; > + } > + > + viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain, > + ucmd->ictx, cmd->type); > + if (IS_ERR(viommu)) { > + rc = PTR_ERR(viommu); > + goto out_put_hwpt; > + } Check that ops and ops->free are valid here with a WARN_ON > + rc = iommufd_verify_unfinalized_object(ucmd->ictx, &viommu->obj); > + if (rc) { > + kfree(viommu); > + goto out_put_hwpt; > + } > + > + viommu->type = cmd->type; > + viommu->ictx = ucmd->ictx; > + viommu->hwpt = hwpt_paging; > + /* > + * It is the most likely case that a physical IOMMU is unpluggable. A > + * pluggable IOMMU instance (if exists) is responsible for refcounting > + * on its own. > + */ > + viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); > + > + refcount_inc(&viommu->hwpt->common.obj.users); Put this line right after the one storing to viommu_>hwpt Jason
On Tue, Oct 29, 2024 at 11:54:36AM -0300, Jason Gunthorpe wrote: > On Fri, Oct 25, 2024 at 04:49:44PM -0700, Nicolin Chen wrote: > > +void iommufd_viommu_destroy(struct iommufd_object *obj) > > +{ > > + struct iommufd_viommu *viommu = > > + container_of(obj, struct iommufd_viommu, obj); > > + > > + if (viommu->ops && viommu->ops->free) > > + viommu->ops->free(viommu); > > Ops can't be null and free can't be null, that would mean there is a > memory leak. What if a driver doesn't have anything to free? You're suggesting to force the driver to have an empty free function, right? We can do that, if it is preferable: void arm_vsmmu_free(struct iommufd_viommu *viommu) { } > > + refcount_dec(&viommu->hwpt->common.obj.users); > > Don't touch viommu after freeing it Drivers should only free their own stuff without touching the core: "* @free: Free all driver-specific parts of an iommufd_viommu. The memory of the * vIOMMU will be free-ed by iommufd core after calling this free op." Then, viommu object is freed by the core after ->destroy(), right? > Did you run selftests with kasn? Yea, all passed with no WARN_ON. Thanks Nicolin
On Tue, Oct 29, 2024 at 11:54:36AM -0300, Jason Gunthorpe wrote: > On Fri, Oct 25, 2024 at 04:49:44PM -0700, Nicolin Chen wrote: > > +void iommufd_viommu_destroy(struct iommufd_object *obj) > > +{ > > + struct iommufd_viommu *viommu = > > + container_of(obj, struct iommufd_viommu, obj); > > + > > + if (viommu->ops && viommu->ops->free) > > + viommu->ops->free(viommu); > > Ops can't be null and free can't be null, that would mean there is a > memory leak. Actually, it is just named wrong, it should be called destroy like this op, it doesn't free any memory.. Jason
On Tue, Oct 29, 2024 at 12:36:24PM -0300, Jason Gunthorpe wrote: > On Tue, Oct 29, 2024 at 11:54:36AM -0300, Jason Gunthorpe wrote: > > On Fri, Oct 25, 2024 at 04:49:44PM -0700, Nicolin Chen wrote: > > > +void iommufd_viommu_destroy(struct iommufd_object *obj) > > > +{ > > > + struct iommufd_viommu *viommu = > > > + container_of(obj, struct iommufd_viommu, obj); > > > + > > > + if (viommu->ops && viommu->ops->free) > > > + viommu->ops->free(viommu); > > > > Ops can't be null and free can't be null, that would mean there is a > > memory leak. > > Actually, it is just named wrong, it should be called destroy like > this op, it doesn't free any memory.. Well, it frees if driver allocated something in its top structure. Yet, "destroy" does sound less confusing. Let's rename it, assuming it can still remain to be optional as we have here. Thanks Nicolin
On Tue, Oct 29, 2024 at 08:46:40AM -0700, Nicolin Chen wrote: > On Tue, Oct 29, 2024 at 12:36:24PM -0300, Jason Gunthorpe wrote: > > On Tue, Oct 29, 2024 at 11:54:36AM -0300, Jason Gunthorpe wrote: > > > On Fri, Oct 25, 2024 at 04:49:44PM -0700, Nicolin Chen wrote: > > > > +void iommufd_viommu_destroy(struct iommufd_object *obj) > > > > +{ > > > > + struct iommufd_viommu *viommu = > > > > + container_of(obj, struct iommufd_viommu, obj); > > > > + > > > > + if (viommu->ops && viommu->ops->free) > > > > + viommu->ops->free(viommu); > > > > > > Ops can't be null and free can't be null, that would mean there is a > > > memory leak. > > > > Actually, it is just named wrong, it should be called destroy like > > this op, it doesn't free any memory.. > > Well, it frees if driver allocated something in its top structure. > Yet, "destroy" does sound less confusing. Let's rename it, assuming > it can still remain to be optional as we have here. Yes, optional is right, I was confused by the name. The core code uses destroy so I'd stick with that. Jason
On Tue, Oct 29, 2024 at 12:59:35PM -0300, Jason Gunthorpe wrote: > On Tue, Oct 29, 2024 at 08:46:40AM -0700, Nicolin Chen wrote: > > On Tue, Oct 29, 2024 at 12:36:24PM -0300, Jason Gunthorpe wrote: > > > On Tue, Oct 29, 2024 at 11:54:36AM -0300, Jason Gunthorpe wrote: > > > > On Fri, Oct 25, 2024 at 04:49:44PM -0700, Nicolin Chen wrote: > > > > > +void iommufd_viommu_destroy(struct iommufd_object *obj) > > > > > +{ > > > > > + struct iommufd_viommu *viommu = > > > > > + container_of(obj, struct iommufd_viommu, obj); > > > > > + > > > > > + if (viommu->ops && viommu->ops->free) > > > > > + viommu->ops->free(viommu); > > > > > > > > Ops can't be null and free can't be null, that would mean there is a > > > > memory leak. > > > > > > Actually, it is just named wrong, it should be called destroy like > > > this op, it doesn't free any memory.. > > > > Well, it frees if driver allocated something in its top structure. > > Yet, "destroy" does sound less confusing. Let's rename it, assuming > > it can still remain to be optional as we have here. > > Yes, optional is right, I was confused by the name. The core code uses > destroy so I'd stick with that. Ack. Nicolin
> From: Nicolin Chen <nicolinc@nvidia.com> > Sent: Saturday, October 26, 2024 7:50 AM > > Add a new ioctl for user space to do a vIOMMU allocation. It must be based > on a nesting parent HWPT, so take its refcount. > > IOMMU driver wanting to support vIOMMUs must define its > IOMMU_VIOMMU_TYPE_ > in the uAPI header and implement a viommu_alloc op in its iommu_ops. > > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
© 2016 - 2024 Red Hat, Inc.