[PATCH v3 11/15] iommu/amd: Add support for nested domain allocation

Suravee Suthikulpanit posted 15 patches 2 months, 1 week ago
There is a newer version of this series
[PATCH v3 11/15] iommu/amd: Add support for nested domain allocation
Posted by Suravee Suthikulpanit 2 months, 1 week ago
The nested domain is allocated with IOMMU_DOMAIN_NESTED type to store
stage-1 translation (i.e. GVA->GPA). This includes the GCR3 root pointer
table along with guest page tables. The struct iommu_hwpt_amd_guest
contains this information, and is passed from user-space as a parameter
of the struct iommu_ops.domain_alloc_nested().

Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/Makefile          |  2 +-
 drivers/iommu/amd/amd_iommu.h       |  4 ++
 drivers/iommu/amd/amd_iommu_types.h | 31 +++++++----
 drivers/iommu/amd/nested.c          | 84 +++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 11 deletions(-)
 create mode 100644 drivers/iommu/amd/nested.c

diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 5ae46d99a45b..afa12ca2110e 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
-obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o
+obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o nested.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index d533bb8851ea..3730d8bbe6dc 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -202,4 +202,8 @@ amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry
 	new->data128[1] = 0;
 }
 
+/* NESTED */
+struct iommu_domain *
+amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+			      const struct iommu_user_data *user_data);
 #endif /* AMD_IOMMU_H */
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index c34604cf1811..9374e6f7a19d 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -20,6 +20,8 @@
 #include <linux/irqreturn.h>
 #include <linux/io-pgtable.h>
 
+#include <uapi/linux/iommufd.h>
+
 /*
  * Maximum number of IOMMUs supported
  */
@@ -586,6 +588,25 @@ struct pdom_iommu_info {
 	u32 refcnt;	/* Count of attached dev/pasid per domain/IOMMU */
 };
 
+/*
+ * Structure defining one entry in the device table
+ */
+struct dev_table_entry {
+	union {
+		u64 data[4];
+		u128 data128[2];
+	};
+};
+
+/*
+ * Nested domain is specifically used for nested translation
+ */
+struct nested_domain {
+	struct iommu_domain domain; /* generic domain handle used by iommu core code */
+	u16 id;	                    /* the domain id written to the device table */
+	struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
+};
+
 /*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
@@ -895,16 +916,6 @@ extern struct list_head amd_iommu_pci_seg_list;
  */
 extern struct list_head amd_iommu_list;
 
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
-	union {
-		u64 data[4];
-		u128 data128[2];
-	};
-};
-
 /*
  * Structure defining one entry in the command buffer
  */
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
new file mode 100644
index 000000000000..0ab5d65ec283
--- /dev/null
+++ b/drivers/iommu/amd/nested.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#define dev_fmt(fmt)	"AMD-Vi: " fmt
+
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "amd_iommu.h"
+
+static const struct iommu_domain_ops nested_domain_ops;
+
+static inline struct nested_domain *to_ndomain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct nested_domain, domain);
+}
+
+/*
+ * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
+ * during the call to struct iommu_ops.viommu_init().
+ */
+struct iommu_domain *
+amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+			      const struct iommu_user_data *user_data)
+{
+	int ret;
+	struct iommu_hwpt_amd_guest gdte;
+	struct nested_domain *ndom;
+
+	if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	ret = iommu_copy_struct_from_user(&gdte, user_data,
+					  IOMMU_HWPT_DATA_AMD_GUEST,
+					  dte);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ndom = kzalloc(sizeof(*ndom), GFP_KERNEL);
+	if (!ndom)
+		return ERR_PTR(-ENOMEM);
+
+	ndom->domain.ops = &nested_domain_ops;
+	ndom->domain.type = IOMMU_DOMAIN_NESTED;
+	memcpy(&ndom->gdte, &gdte, sizeof(gdte));
+
+	/*
+	 * Normally, when a guest has multiple pass-through devices,
+	 * the IOMMU driver setup DTEs with the same stage-2 table and
+	 * use the same host domain ID (hDomId). In case of nested translation,
+	 * if the guest setup different stage-1 tables with same PASID,
+	 * IOMMU would use the same TLB tag. This will results in TLB
+	 * aliasing issue.
+	 *
+	 * Workaround the issue by allocating per-device hDomID for nested
+	 * domain (i.e. ndom->id). This require per-device IOMMU TLB invalidation
+	 * with corresponded hDomId on the host side when updating stage-2 table.
+	 */
+	ndom->id = amd_iommu_pdom_id_alloc();
+	if (ndom->id <= 0) {
+		ret = -ENOSPC;
+		goto out_err;
+	}
+
+	return &ndom->domain;
+out_err:
+	kfree(ndom);
+	return ERR_PTR(ret);
+}
+
+static void nested_domain_free(struct iommu_domain *dom)
+{
+	struct nested_domain *ndom = to_ndomain(dom);
+
+	amd_iommu_pdom_id_free(ndom->id);
+	kfree(ndom);
+}
+
+static const struct iommu_domain_ops nested_domain_ops = {
+	.free = nested_domain_free,
+};
+
-- 
2.34.1
Re: [PATCH v3 11/15] iommu/amd: Add support for nested domain allocation
Posted by Jason Gunthorpe 2 months, 1 week ago
On Thu, Oct 09, 2025 at 11:57:51PM +0000, Suravee Suthikulpanit wrote:

> +/*
> + * Structure defining one entry in the device table
> + */
> +struct dev_table_entry {
> +	union {

There is no longer a reason to move this, so these hunks can be dropped?

> +/*
> + * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
> + * during the call to struct iommu_ops.viommu_init().
> + */
> +struct iommu_domain *
> +amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
> +			      const struct iommu_user_data *user_data)
> +{
> +	int ret;
> +	struct iommu_hwpt_amd_guest gdte;
> +	struct nested_domain *ndom;
> +
> +	if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	ret = iommu_copy_struct_from_user(&gdte, user_data,
> +					  IOMMU_HWPT_DATA_AMD_GUEST,
> +					  dte);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	ndom = kzalloc(sizeof(*ndom), GFP_KERNEL);
> +	if (!ndom)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ndom->domain.ops = &nested_domain_ops;
> +	ndom->domain.type = IOMMU_DOMAIN_NESTED;
> +	memcpy(&ndom->gdte, &gdte, sizeof(gdte));

You can move the iommu_copy_struct_from_user() to here and avoid this
memcpy and stack allocation.

> +	/*
> +	 * Normally, when a guest has multiple pass-through devices,
> +	 * the IOMMU driver setup DTEs with the same stage-2 table and
> +	 * use the same host domain ID (hDomId). In case of nested translation,
> +	 * if the guest setup different stage-1 tables with same PASID,
> +	 * IOMMU would use the same TLB tag. This will results in TLB
> +	 * aliasing issue.

Yes, but if the guest does this then the guest is required to use
different gDomID's.

> +	 *
> +	 * Workaround the issue by allocating per-device hDomID for nested
> +	 * domain (i.e. ndom->id). This require per-device IOMMU TLB invalidation
> +	 * with corresponded hDomId on the host side when updating stage-2 table.
> +	 */

This is still missing the point - we cannot work around this with
unique hDomID's because when you implement invalidation there is only
one input gDomID and you need to map it to exactly one hDomID to push
the PASID invalidation.

The only reason it is barely acceptable now is because there is no
invalidation support!

The comment should be:

 The guest is assigning gDomIDs based on its own algorithm for managing
 cache tags of (DomID, PASID). Within a single viommu the S2 is used
 by all DTEs but we need to consistently map the gDomID to a single hDomID.
 This should be done by using an xarray in the viommu to keep track of
 the gDomID mapping. Since there is no invalidation support and no
 viommu yet just always use a unique hDomId for now.

 When the S2 is changed all the hDomIDs in the xarray need to have
 invalidations pushed.

Other than that it looks OK

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason