[PATCH v6 10/13] iommu/amd: Introduce gDomID-to-hDomID Mapping and handle parent domain invalidation

Suravee Suthikulpanit posted 13 patches 3 weeks, 1 day ago
[PATCH v6 10/13] iommu/amd: Introduce gDomID-to-hDomID Mapping and handle parent domain invalidation
Posted by Suravee Suthikulpanit 3 weeks, 1 day ago
Each nested domain is assigned guest domain ID (gDomID), which guest OS
programs into guest Device Table Entry (gDTE). For each gDomID, the driver
assigns a corresponding host domain ID (hDomID), which will be programmed
into the host Device Table Entry (hDTE).

The hDomID is allocated during amd_iommu_alloc_domain_nested(),
and freed during nested_domain_free(). The gDomID-to-hDomID mapping info
(struct guest_domain_mapping_info) is stored in a per-viommu xarray
(struct amd_iommu_viommu.gdomid_array), which is indexed by gDomID.

Note also that the parent domain can be shared among multiple struct
iommufd_viommu instances. Therefore, when the hypervisor invalidates the
nest parent domain, the AMD IOMMU command INVALIDATE_IOMMU_PAGES must be
issued for each hDomID in the gdomid_array. This is handled by
iommu_flush_pages_v1_hdom_ids(), which iterates through struct
protection_domain.viommu_list.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/amd_iommu_types.h |  23 ++++++
 drivers/iommu/amd/iommu.c           |  38 ++++++++++
 drivers/iommu/amd/iommufd.c         |  31 ++++++++
 drivers/iommu/amd/nested.c          | 111 ++++++++++++++++++++++++++++
 4 files changed, 203 insertions(+)

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 487ee6123de5..4a98ac7dca0f 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -503,6 +503,22 @@ struct pdom_iommu_info {
 struct amd_iommu_viommu {
 	struct iommufd_viommu core;
 	struct protection_domain *parent; /* nest parent domain for this viommu */
+	struct list_head pdom_list;	  /* For protection_domain->viommu_list */
+
+	/*
+	 * Per-vIOMMU guest domain ID to host domain ID mapping.
+	 * Indexed by guest domain ID.
+	 */
+	struct xarray gdomid_array;
+};
+
+/*
+ * Contains guest domain ID mapping info,
+ * which is stored in the struct xarray gdomid_array.
+ */
+struct guest_domain_mapping_info {
+	refcount_t users;
+	u32 hdom_id;		/* Host domain ID */
 };
 
 /*
@@ -511,6 +527,7 @@ struct amd_iommu_viommu {
 struct nested_domain {
 	struct iommu_domain domain; /* generic domain handle used by iommu core code */
 	u16 gdom_id;                /* domain ID from gDTE */
+	struct guest_domain_mapping_info *gdom_info;
 	struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
 	struct amd_iommu_viommu *viommu;  /* AMD hw-viommu this nested domain belong to */
 };
@@ -535,6 +552,12 @@ struct protection_domain {
 
 	struct mmu_notifier mn;	/* mmu notifier for the SVA domain */
 	struct list_head dev_data_list; /* List of pdom_dev_data */
+
+	/*
+	 * List of vIOMMUs that use this protection domain. This is used to
+	 * look up host domain IDs when flushing this domain.
+	 */
+	struct list_head viommu_list;
 };
 PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
 PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index ebc96f1f564f..e33076b99aac 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1539,6 +1539,32 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
 	iommu_completion_wait(iommu);
 }
 
+static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
+{
+	int ret = 0;
+	struct amd_iommu_viommu *aviommu;
+
+	list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
+		unsigned long i;
+		struct guest_domain_mapping_info *gdom_info;
+		struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev,
+						       struct amd_iommu, iommu);
+
+		xa_lock(&aviommu->gdomid_array);
+		xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
+			struct iommu_cmd cmd;
+
+			pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__,
+				 iommu->devid, gdom_info->hdom_id);
+			build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id,
+					      IOMMU_NO_PASID, false);
+			ret |= iommu_queue_command(iommu, &cmd);
+		}
+		xa_unlock(&aviommu->gdomid_array);
+	}
+	return ret;
+}
+
 static void amd_iommu_flush_all(struct amd_iommu *iommu)
 {
 	struct iommu_cmd cmd;
@@ -1687,6 +1713,17 @@ static int domain_flush_pages_v1(struct protection_domain *pdom,
 		ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
 	}
 
+	/*
+	 * A domain w/ v1 table can be a nest parent, which can have
+	 * multiple nested domains. Each nested domain has 1:1 mapping
+	 * between gDomID and hDomID. Therefore, flush every hDomID
+	 * associated with this nest parent domain.
+	 *
+	 * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested()
+	 */
+	if (!list_empty(&pdom->viommu_list))
+		ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size);
+
 	return ret;
 }
 
@@ -2504,6 +2541,7 @@ static void protection_domain_init(struct protection_domain *domain)
 	spin_lock_init(&domain->lock);
 	INIT_LIST_HEAD(&domain->dev_list);
 	INIT_LIST_HEAD(&domain->dev_data_list);
+	INIT_LIST_HEAD(&domain->viommu_list);
 	xa_init(&domain->iommu_array);
 }
 
diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c
index eb6119bdcf12..2e50633d9c72 100644
--- a/drivers/iommu/amd/iommufd.c
+++ b/drivers/iommu/amd/iommufd.c
@@ -9,6 +9,8 @@
 #include "amd_iommu.h"
 #include "amd_iommu_types.h"
 
+static const struct iommufd_viommu_ops amd_viommu_ops;
+
 void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type)
 {
 	struct iommu_hw_info_amd *hwinfo;
@@ -38,10 +40,39 @@ size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type vi
 int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent,
 			    const struct iommu_user_data *user_data)
 {
+	unsigned long flags;
 	struct protection_domain *pdom = to_pdomain(parent);
 	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
 
+	xa_init_flags(&aviommu->gdomid_array, XA_FLAGS_ALLOC1);
 	aviommu->parent = pdom;
 
+	viommu->ops = &amd_viommu_ops;
+
+	spin_lock_irqsave(&pdom->lock, flags);
+	list_add(&aviommu->pdom_list, &pdom->viommu_list);
+	spin_unlock_irqrestore(&pdom->lock, flags);
+
 	return 0;
 }
+
+static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu = container_of(viommu->iommu_dev, struct amd_iommu, iommu);
+	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
+	struct protection_domain *pdom = aviommu->parent;
+
+	spin_lock_irqsave(&pdom->lock, flags);
+	list_del(&aviommu->pdom_list);
+	spin_unlock_irqrestore(&pdom->lock, flags);
+	xa_destroy(&aviommu->gdomid_array);
+}
+
+/*
+ * See include/linux/iommufd.h
+ * struct iommufd_viommu_ops - vIOMMU specific operations
+ */
+static const struct iommufd_viommu_ops amd_viommu_ops = {
+	.destroy = amd_iommufd_viommu_destroy,
+};
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index a8c0bb4dd733..8154a773eed8 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -6,6 +6,7 @@
 #define dev_fmt(fmt)	"AMD-Vi: " fmt
 
 #include <linux/iommu.h>
+#include <linux/refcount.h>
 #include <uapi/linux/iommufd.h>
 
 #include "amd_iommu.h"
@@ -58,6 +59,33 @@ static int validate_gdte_nested(struct iommu_hwpt_amd_guest *gdte)
 	return 0;
 }
 
+static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index)
+{
+	struct guest_domain_mapping_info *elm, *res;
+
+	elm = xa_load(xa, index);
+	if (elm)
+		return elm;
+
+	xa_unlock(xa);
+	elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL);
+	xa_lock(xa);
+	if (!elm)
+		return ERR_PTR(-ENOMEM);
+
+	res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+	if (xa_is_err(res))
+		res = ERR_PTR(xa_err(res));
+
+	if (res) {
+		kfree(elm);
+		return res;
+	}
+
+	refcount_set(&elm->users, 0);
+	return elm;
+}
+
 /*
  * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
  * during the call to struct iommu_ops.viommu_init().
@@ -68,6 +96,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 {
 	int ret;
 	struct nested_domain *ndom;
+	struct guest_domain_mapping_info *gdom_info;
 	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
 
 	if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
@@ -92,7 +121,63 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 	ndom->domain.type = IOMMU_DOMAIN_NESTED;
 	ndom->viommu = aviommu;
 
+	/*
+	 * Normally, when a guest has multiple pass-through devices,
+	 * the IOMMU driver sets up DTEs with the same stage-2 table and
+	 * uses the same host domain ID (hDomID). In case of nested translation,
+	 * if the guest sets up different stage-1 tables with the same PASID,
+	 * the IOMMU would use the same TLB tag. This results in a TLB
+	 * aliasing issue.
+	 *
+	 * The guest is assigning gDomIDs based on its own algorithm for managing
+	 * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
+	 * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID
+	 * to a single hDomID. This is done using an xarray in the vIOMMU to
+	 * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
+	 * command must be issued for each hDomID in the xarray.
+	 */
+	xa_lock(&aviommu->gdomid_array);
+
+	gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id);
+	if (IS_ERR(gdom_info)) {
+		xa_unlock(&aviommu->gdomid_array);
+		ret = PTR_ERR(gdom_info);
+		goto out_err;
+	}
+
+	/* Check if gDomID exist */
+	if (refcount_inc_not_zero(&gdom_info->users)) {
+		ndom->gdom_info = gdom_info;
+		xa_unlock(&aviommu->gdomid_array);
+
+		pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
+			  __func__, ndom->gdom_id, gdom_info->hdom_id);
+
+		return &ndom->domain;
+	}
+
+	/* The gDomID does not exist. Allocate a new hdom_id */
+	gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
+	if (gdom_info->hdom_id <= 0) {
+		__xa_cmpxchg(&aviommu->gdomid_array,
+			     ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC);
+		xa_unlock(&aviommu->gdomid_array);
+		ret = -ENOSPC;
+		goto out_err_gdom_info;
+	}
+
+	ndom->gdom_info = gdom_info;
+	refcount_set(&gdom_info->users, 1);
+
+	xa_unlock(&aviommu->gdomid_array);
+
+	pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n",
+		 __func__, ndom->gdom_id, gdom_info->hdom_id);
+
 	return &ndom->domain;
+
+out_err_gdom_info:
+	kfree(gdom_info);
 out_err:
 	kfree(ndom);
 	return ERR_PTR(ret);
@@ -100,8 +185,34 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 
 static void nested_domain_free(struct iommu_domain *dom)
 {
+	struct guest_domain_mapping_info *curr;
 	struct nested_domain *ndom = to_ndomain(dom);
+	struct amd_iommu_viommu *aviommu = ndom->viommu;
+
+	xa_lock(&aviommu->gdomid_array);
+
+	if (!refcount_dec_and_test(&ndom->gdom_info->users)) {
+		xa_unlock(&aviommu->gdomid_array);
+		return;
+	}
+
+	/*
+	 * The refcount for the gdom_id to hdom_id mapping is zero.
+	 * It is now safe to remove the mapping.
+	 */
+	curr = __xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id,
+			    ndom->gdom_info, NULL, GFP_ATOMIC);
+
+	xa_unlock(&aviommu->gdomid_array);
+	if (WARN_ON(!curr || xa_err(curr)))
+		return;
+
+	/* success */
+	pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n",
+		__func__, ndom->gdom_id, curr->hdom_id);
 
+	amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id);
+	kfree(curr);
 	kfree(ndom);
 }
 
-- 
2.34.1
Re: [PATCH v6 10/13] iommu/amd: Introduce gDomID-to-hDomID Mapping and handle parent domain invalidation
Posted by Jason Gunthorpe 2 weeks, 4 days ago
On Thu, Jan 15, 2026 at 06:08:11AM +0000, Suravee Suthikulpanit wrote:
> +static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
> +{
> +	int ret = 0;
> +	struct amd_iommu_viommu *aviommu;
> +
> +	list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
> +		unsigned long i;

You should have some lockdeps here for this list iteration..

> +static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index)
> +{
> +	struct guest_domain_mapping_info *elm, *res;
> +
> +	elm = xa_load(xa, index);
> +	if (elm)
> +		return elm;
> +
> +	xa_unlock(xa);
> +	elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL);
> +	xa_lock(xa);
> +	if (!elm)
> +		return ERR_PTR(-ENOMEM);
> +
> +	res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
> +	if (xa_is_err(res))
> +		res = ERR_PTR(xa_err(res));
> +
> +	if (res) {
> +		kfree(elm);
> +		return res;
> +	}
> +
> +	refcount_set(&elm->users, 0);
> +	return elm;
> +}
> +
>  /*
>   * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
>   * during the call to struct iommu_ops.viommu_init().
> @@ -68,6 +96,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
>  {
>  	int ret;
>  	struct nested_domain *ndom;
> +	struct guest_domain_mapping_info *gdom_info;
>  	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
>  
>  	if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
> @@ -92,7 +121,63 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
>  	ndom->domain.type = IOMMU_DOMAIN_NESTED;
>  	ndom->viommu = aviommu;
>  
> +	/*
> +	 * Normally, when a guest has multiple pass-through devices,
> +	 * the IOMMU driver setup DTEs with the same stage-2 table and
> +	 * use the same host domain ID (hDomId). In case of nested translation,
> +	 * if the guest setup different stage-1 tables with same PASID,
> +	 * IOMMU would use the same TLB tag. This will results in TLB
> +	 * aliasing issue.
> +	 *
> +	 * The guest is assigning gDomIDs based on its own algorithm for managing
> +	 * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
> +	 * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID
> +	 * to a single hDomID. This is done using an xarray in the vIOMMU to
> +	 * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
> +	 * command must be issued for each hDomID in the xarray.
> +	 */
> +	xa_lock(&aviommu->gdomid_array);
> +
> +	gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id);
> +	if (IS_ERR(gdom_info)) {
> +		xa_unlock(&aviommu->gdomid_array);
> +		ret = PTR_ERR(gdom_info);
> +		goto out_err;
> +	}
> +
> +	/* Check if gDomID exist */
> +	if (refcount_inc_not_zero(&gdom_info->users)) {
> +		ndom->gdom_info = gdom_info;
> +		xa_unlock(&aviommu->gdomid_array);

This is pretty tortured, the alloc flow inside
gdom_info_load_or_alloc_locked() should do the
amd_iommu_pdom_id_alloc() and set the refcount to 1 before installing
it in the xarray, then you don't need any of this here.

> +	/* The gDomID does not exist. We allocate new hdom_id */
> +	gdom_info->hdom_id = amd_iommu_pdom_id_alloc();

Then this allocation wouldn't have to be ATOMIC.

But it looks working the way it is so no rush

Jason