Each nested domain is assigned a guest domain ID (gDomID), which the
guest OS programs into the guest Device Table Entry (gDTE). For each
gDomID, the driver assigns a corresponding host domain ID (hDomID),
which is programmed into the host Device Table Entry (hDTE).

The hDomID is allocated during amd_iommu_alloc_domain_nested() and
freed during nested_domain_free(). The gDomID-to-hDomID mapping info
(struct guest_domain_mapping_info) is stored in a per-vIOMMU xarray
(struct amd_iommu_viommu.gdomid_array), which is indexed by gDomID.

Note also that a nest parent domain can be shared among multiple
struct iommufd_viommu instances. Therefore, when the hypervisor
invalidates the nest parent domain, the AMD IOMMU command
INVALIDATE_IOMMU_PAGES must be issued for each hDomID in the
gdomid_array. This is handled by iommu_flush_pages_v1_hdom_ids(),
which iterates through struct protection_domain.viommu_list.
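
Roughly, the flow is as follows (simplified sketch; locking and error
handling omitted, see the actual code in the patch below):

    /* amd_iommu_alloc_domain_nested(): map gDomID to an hDomID */
    gdom_info = xa_load(&aviommu->gdomid_array, ndom->gdom_id);
    if (gdom_info) {
            /* gDomID already seen on this vIOMMU: share its hDomID */
            refcount_inc(&gdom_info->users);
    } else {
            /* First use of this gDomID: allocate a fresh hDomID */
            gdom_info = kzalloc(sizeof(*gdom_info), GFP_KERNEL);
            gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
            refcount_set(&gdom_info->users, 1);
            xa_store(&aviommu->gdomid_array, ndom->gdom_id, gdom_info, GFP_KERNEL);
    }
    ndom->gdom_info = gdom_info;

    /* Nest parent invalidation: flush every hDomID that shares this S2 */
    list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
            xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
                    build_inv_iommu_pages(&cmd, address, size,
                                          gdom_info->hdom_id, IOMMU_NO_PASID, false);
                    iommu_queue_command(iommu, &cmd);
            }
    }
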
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/amd_iommu_types.h | 23 ++++++
drivers/iommu/amd/iommu.c | 38 ++++++++++
drivers/iommu/amd/iommufd.c | 31 ++++++++
drivers/iommu/amd/nested.c | 111 ++++++++++++++++++++++++++++
4 files changed, 203 insertions(+)
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 487ee6123de5..4a98ac7dca0f 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -503,6 +503,22 @@ struct pdom_iommu_info {
struct amd_iommu_viommu {
struct iommufd_viommu core;
struct protection_domain *parent; /* nest parent domain for this viommu */
+ struct list_head pdom_list; /* For protection_domain->viommu_list */
+
+ /*
+ * Per-vIOMMU guest domain ID to host domain ID mapping.
+ * Indexed by guest domain ID.
+ */
+ struct xarray gdomid_array;
+};
+
+/*
+ * Contains guest domain ID mapping info,
+ * which is stored in the struct xarray gdomid_array.
+ */
+struct guest_domain_mapping_info {
+ refcount_t users;
+ u32 hdom_id; /* Host domain ID */
};
/*
@@ -511,6 +527,7 @@ struct amd_iommu_viommu {
struct nested_domain {
struct iommu_domain domain; /* generic domain handle used by iommu core code */
u16 gdom_id; /* domain ID from gDTE */
+ struct guest_domain_mapping_info *gdom_info;
struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
struct amd_iommu_viommu *viommu; /* AMD hw-viommu this nested domain belong to */
};
@@ -535,6 +552,12 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
+
+ /*
+ * List of vIOMMUs that use this protection domain. This is used to
+ * look up the host domain IDs when flushing this domain.
+ */
+ struct list_head viommu_list;
};
PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index ebc96f1f564f..e33076b99aac 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1539,6 +1539,32 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
iommu_completion_wait(iommu);
}
+static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
+{
+ int ret = 0;
+ struct amd_iommu_viommu *aviommu;
+
+ list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
+ unsigned long i;
+ struct guest_domain_mapping_info *gdom_info;
+ struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev,
+ struct amd_iommu, iommu);
+
+ xa_lock(&aviommu->gdomid_array);
+ xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
+ struct iommu_cmd cmd;
+
+ pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__,
+ iommu->devid, gdom_info->hdom_id);
+ build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id,
+ IOMMU_NO_PASID, false);
+ ret |= iommu_queue_command(iommu, &cmd);
+ }
+ xa_unlock(&aviommu->gdomid_array);
+ }
+ return ret;
+}
+
static void amd_iommu_flush_all(struct amd_iommu *iommu)
{
struct iommu_cmd cmd;
@@ -1687,6 +1713,17 @@ static int domain_flush_pages_v1(struct protection_domain *pdom,
ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
}
+ /*
+ * A domain w/ a v1 table can be a nest parent, which can have
+ * multiple nested domains. Each nested domain has a 1:1 mapping
+ * between gDomID and hDomID. Therefore, flush every hDomID
+ * associated with this nest parent domain.
+ *
+ * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested()
+ */
+ if (!list_empty(&pdom->viommu_list))
+ ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size);
+
return ret;
}
@@ -2504,6 +2541,7 @@ static void protection_domain_init(struct protection_domain *domain)
spin_lock_init(&domain->lock);
INIT_LIST_HEAD(&domain->dev_list);
INIT_LIST_HEAD(&domain->dev_data_list);
+ INIT_LIST_HEAD(&domain->viommu_list);
xa_init(&domain->iommu_array);
}
diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c
index eb6119bdcf12..2e50633d9c72 100644
--- a/drivers/iommu/amd/iommufd.c
+++ b/drivers/iommu/amd/iommufd.c
@@ -9,6 +9,8 @@
#include "amd_iommu.h"
#include "amd_iommu_types.h"
+static const struct iommufd_viommu_ops amd_viommu_ops;
+
void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type)
{
struct iommu_hw_info_amd *hwinfo;
@@ -38,10 +40,39 @@ size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type vi
int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent,
const struct iommu_user_data *user_data)
{
+ unsigned long flags;
struct protection_domain *pdom = to_pdomain(parent);
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
+ xa_init_flags(&aviommu->gdomid_array, XA_FLAGS_ALLOC1);
aviommu->parent = pdom;
+ viommu->ops = &amd_viommu_ops;
+
+ spin_lock_irqsave(&pdom->lock, flags);
+ list_add(&aviommu->pdom_list, &pdom->viommu_list);
+ spin_unlock_irqrestore(&pdom->lock, flags);
+
return 0;
}
+
+static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu = container_of(viommu->iommu_dev, struct amd_iommu, iommu);
+ struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
+ struct protection_domain *pdom = aviommu->parent;
+
+ spin_lock_irqsave(&pdom->lock, flags);
+ list_del(&aviommu->pdom_list);
+ spin_unlock_irqrestore(&pdom->lock, flags);
+ xa_destroy(&aviommu->gdomid_array);
+}
+
+/*
+ * See include/linux/iommufd.h
+ * struct iommufd_viommu_ops - vIOMMU specific operations
+ */
+static const struct iommufd_viommu_ops amd_viommu_ops = {
+ .destroy = amd_iommufd_viommu_destroy,
+};
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index a8c0bb4dd733..8154a773eed8 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -6,6 +6,7 @@
#define dev_fmt(fmt) "AMD-Vi: " fmt
#include <linux/iommu.h>
+#include <linux/refcount.h>
#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
@@ -58,6 +59,33 @@ static int validate_gdte_nested(struct iommu_hwpt_amd_guest *gdte)
return 0;
}
+static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index)
+{
+ struct guest_domain_mapping_info *elm, *res;
+
+ elm = xa_load(xa, index);
+ if (elm)
+ return elm;
+
+ xa_unlock(xa);
+ elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL);
+ xa_lock(xa);
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+
+ res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+ if (xa_is_err(res))
+ res = ERR_PTR(xa_err(res));
+
+ if (res) {
+ kfree(elm);
+ return res;
+ }
+
+ refcount_set(&elm->users, 0);
+ return elm;
+}
+
/*
* This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
* during the call to struct iommu_ops.viommu_init().
@@ -68,6 +96,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
{
int ret;
struct nested_domain *ndom;
+ struct guest_domain_mapping_info *gdom_info;
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
@@ -92,7 +121,63 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
ndom->domain.type = IOMMU_DOMAIN_NESTED;
ndom->viommu = aviommu;
+ /*
+ * Normally, when a guest has multiple pass-through devices, the
+ * IOMMU driver sets up their DTEs with the same stage-2 table and
+ * the same host domain ID (hDomID). With nested translation, if the
+ * guest sets up different stage-1 tables with the same PASID, the
+ * IOMMU would use the same TLB tag for both, resulting in TLB aliasing.
+ *
+ * The guest assigns gDomIDs based on its own algorithm for managing
+ * cache tags of (DomID, PASID). Within a single vIOMMU, the nest parent
+ * domain (w/ S2 table) is used by all DTEs, but each gDomID must be
+ * mapped consistently to a single hDomID. This is done using an xarray
+ * in the vIOMMU to keep track of the gDomID mapping. When the S2 is
+ * changed, the INVALIDATE_IOMMU_PAGES command must be issued for each
+ * hDomID in the xarray.
+ */
+ xa_lock(&aviommu->gdomid_array);
+
+ gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id);
+ if (IS_ERR(gdom_info)) {
+ xa_unlock(&aviommu->gdomid_array);
+ ret = PTR_ERR(gdom_info);
+ goto out_err;
+ }
+
+ /* Check if the gDomID already exists */
+ if (refcount_inc_not_zero(&gdom_info->users)) {
+ ndom->gdom_info = gdom_info;
+ xa_unlock(&aviommu->gdomid_array);
+
+ pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
+ __func__, ndom->gdom_id, gdom_info->hdom_id);
+
+ return &ndom->domain;
+ }
+
+ /* The gDomID does not exist. Allocate a new hdom_id. */
+ gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
+ if (gdom_info->hdom_id <= 0) {
+ __xa_cmpxchg(&aviommu->gdomid_array,
+ ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC);
+ xa_unlock(&aviommu->gdomid_array);
+ ret = -ENOSPC;
+ goto out_err_gdom_info;
+ }
+
+ ndom->gdom_info = gdom_info;
+ refcount_set(&gdom_info->users, 1);
+
+ xa_unlock(&aviommu->gdomid_array);
+
+ pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n",
+ __func__, ndom->gdom_id, gdom_info->hdom_id);
+
return &ndom->domain;
+
+out_err_gdom_info:
+ kfree(gdom_info);
out_err:
kfree(ndom);
return ERR_PTR(ret);
@@ -100,8 +185,34 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
static void nested_domain_free(struct iommu_domain *dom)
{
+ struct guest_domain_mapping_info *curr;
struct nested_domain *ndom = to_ndomain(dom);
+ struct amd_iommu_viommu *aviommu = ndom->viommu;
+
+ xa_lock(&aviommu->gdomid_array);
+
+ if (!refcount_dec_and_test(&ndom->gdom_info->users)) {
+ xa_unlock(&aviommu->gdomid_array);
+ return;
+ }
+
+ /*
+ * The refcount for the gdom_id to hdom_id mapping is zero.
+ * It is now safe to remove the mapping.
+ */
+ curr = __xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id,
+ ndom->gdom_info, NULL, GFP_ATOMIC);
+
+ xa_unlock(&aviommu->gdomid_array);
+ if (WARN_ON(!curr || xa_err(curr)))
+ return;
+
+ /* success */
+ pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n",
+ __func__, ndom->gdom_id, curr->hdom_id);
+ amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id);
+ kfree(curr);
kfree(ndom);
}
--
2.34.1
On Thu, Jan 15, 2026 at 06:08:11AM +0000, Suravee Suthikulpanit wrote:
> +static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
> +{
> + int ret = 0;
> + struct amd_iommu_viommu *aviommu;
> +
> + list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
> + unsigned long i;
You should have some lockdeps here for this list iteration..
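e.g. something like this at the top of the loop (assuming the list is
protected by the domain lock like the other pdom lists):

	lockdep_assert_held(&pdom->lock);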
> +static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index)
> +{
> + struct guest_domain_mapping_info *elm, *res;
> +
> + elm = xa_load(xa, index);
> + if (elm)
> + return elm;
> +
> + xa_unlock(xa);
> + elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL);
> + xa_lock(xa);
> + if (!elm)
> + return ERR_PTR(-ENOMEM);
> +
> + res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
> + if (xa_is_err(res))
> + res = ERR_PTR(xa_err(res));
> +
> + if (res) {
> + kfree(elm);
> + return res;
> + }
> +
> + refcount_set(&elm->users, 0);
> + return elm;
> +}
> +
> /*
> * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
> * during the call to struct iommu_ops.viommu_init().
> @@ -68,6 +96,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
> {
> int ret;
> struct nested_domain *ndom;
> + struct guest_domain_mapping_info *gdom_info;
> struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
>
> if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
> @@ -92,7 +121,63 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
> ndom->domain.type = IOMMU_DOMAIN_NESTED;
> ndom->viommu = aviommu;
>
> + /*
> + * Normally, when a guest has multiple pass-through devices,
> + * the IOMMU driver setup DTEs with the same stage-2 table and
> + * use the same host domain ID (hDomId). In case of nested translation,
> + * if the guest setup different stage-1 tables with same PASID,
> + * IOMMU would use the same TLB tag. This will results in TLB
> + * aliasing issue.
> + *
> + * The guest is assigning gDomIDs based on its own algorithm for managing
> + * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
> + * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID
> + * to a single hDomID. This is done using an xarray in the vIOMMU to
> + * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
> + * command must be issued for each hDomID in the xarray.
> + */
> + xa_lock(&aviommu->gdomid_array);
> +
> + gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id);
> + if (IS_ERR(gdom_info)) {
> + xa_unlock(&aviommu->gdomid_array);
> + ret = PTR_ERR(gdom_info);
> + goto out_err;
> + }
> +
> + /* Check if gDomID exist */
> + if (refcount_inc_not_zero(&gdom_info->users)) {
> + ndom->gdom_info = gdom_info;
> + xa_unlock(&aviommu->gdomid_array);
This is pretty tortured, the alloc flow inside
gdom_info_load_or_alloc_locked() should do the
amd_iommu_pdom_id_alloc() and set the refcount to 1 before installing
it in the xarray, then you don't need any of this here.
> + /* The gDomID does not exist. We allocate new hdom_id */
> + gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
Then this allocation wouldn't have to be ATOMIC.
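
Roughly like this (untested sketch, just to illustrate):

	static void *gdom_info_load_or_alloc_locked(struct xarray *xa,
						    unsigned long index)
	{
		struct guest_domain_mapping_info *elm, *res;

		elm = xa_load(xa, index);
		if (elm) {
			refcount_inc(&elm->users);
			return elm;
		}

		/* Do all the allocation outside the lock, no GFP_ATOMIC needed */
		xa_unlock(xa);
		elm = kzalloc(sizeof(*elm), GFP_KERNEL);
		if (elm) {
			elm->hdom_id = amd_iommu_pdom_id_alloc();
			if (!elm->hdom_id) {
				kfree(elm);
				elm = NULL;
			} else {
				refcount_set(&elm->users, 1);
			}
		}
		xa_lock(xa);
		if (!elm)
			return ERR_PTR(-ENOMEM);

		res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
		if (res) {
			/* Raced with another allocation, or xarray error */
			amd_iommu_pdom_id_free(elm->hdom_id);
			kfree(elm);
			if (xa_is_err(res))
				return ERR_PTR(xa_err(res));
			refcount_inc(&res->users);
			return res;
		}
		return elm;
	}

Then the caller just stores whatever it gets back and the exists/new
split in amd_iommu_alloc_domain_nested() goes away.
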
But it looks working the way it is so no rush
Jason