[PATCH v4 13/16] iommu/amd: Track host Domain ID mapping for each guest Domain ID

Posted by Suravee Suthikulpanit 3 months, 3 weeks ago
Each nested domain is assigned a guest domain ID (gDomID), which the guest OS
programs into the guest Device Table Entry (gDTE). For each gDomID, the driver
assigns a corresponding host domain ID (hDomID), which is programmed into the
host Device Table Entry (hDTE).

The gDomID to hDomID 1:1 mapping is stored in the nest parent domain using
an xarray (struct protection_domain.gdomid_array). When invalidating the
nest parent domain, an INVALIDATE_IOMMU_PAGES command must be issued for each
hDomID in the gdomid_array.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/amd_iommu_types.h |  3 ++
 drivers/iommu/amd/iommu.c           | 30 ++++++++++++++++
 drivers/iommu/amd/nested.c          | 53 ++++++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index e0f0cd3d34f2..09952b7a256d 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -602,6 +602,7 @@ struct amd_iommu_viommu {
 struct nested_domain {
 	struct iommu_domain domain; /* generic domain handle used by iommu core code */
 	u16 gdom_id;                /* domain ID from gDTE */
+	u16 hdom_id;	            /* domain ID written to the device table */
 	struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
 	struct amd_iommu_viommu *viommu;  /* AMD hw-viommu this nested domain belong to */
 };
@@ -623,6 +624,8 @@ struct protection_domain {
 
 	struct mmu_notifier mn;	/* mmu notifier for the SVA domain */
 	struct list_head dev_data_list; /* List of pdom_dev_data */
+
+	struct xarray gdomid_array; /* gDomID mapping to this nest parent domain */
 };
 
 /*
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index c4ff18adcf03..e4db6f304599 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1493,6 +1493,23 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
 	amd_iommu_completion_wait(iommu);
 }
 
+static int iommu_flush_hdom_ids(struct amd_iommu *iommu,
+				u64 address, size_t size,
+				struct protection_domain *parent)
+{
+	int ret = 0;
+	unsigned long i;
+	struct iommu_cmd cmd;
+	struct nested_domain *ndom;
+
+	xa_for_each(&parent->gdomid_array, i, ndom) {
+		build_inv_iommu_pages(&cmd, address, size, ndom->hdom_id,
+				      IOMMU_NO_PASID, false);
+		ret |= iommu_queue_command(iommu, &cmd);
+	}
+	return ret;
+}
+
 static void amd_iommu_flush_all(struct amd_iommu *iommu)
 {
 	struct iommu_cmd cmd;
@@ -1639,6 +1656,18 @@ static int domain_flush_pages_v1(struct protection_domain *pdom,
 		 * We need a TLB flush
 		 */
 		ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
+
+		/*
+		 * A domain w/ a v1 table can be a nest parent, which can have
+		 * multiple nested domains. Each nested domain has a 1:1 mapping
+		 * between gDomID and hDomID, which is stored in the gdomid_array.
+		 * Therefore, we also need to flush every hDomID associated with
+		 * this nest parent domain.
+		 *
+		 * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested()
+		 */
+		ret |= iommu_flush_hdom_ids(pdom_iommu_info->iommu, address,
+					    size, pdom);
 	}
 
 	return ret;
@@ -2470,6 +2499,7 @@ static void protection_domain_init(struct protection_domain *domain)
 	INIT_LIST_HEAD(&domain->dev_list);
 	INIT_LIST_HEAD(&domain->dev_data_list);
 	xa_init(&domain->iommu_array);
+	xa_init(&domain->gdomid_array);
 }
 
 struct protection_domain *protection_domain_alloc(void)
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index e7b6f69a9d0c..383a3c7b4a91 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -67,7 +67,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 			      const struct iommu_user_data *user_data)
 {
 	int ret;
-	struct nested_domain *ndom;
+	struct nested_domain *ndom, *curr;
 	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
 
 	if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
@@ -92,6 +92,49 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 	ndom->domain.type = IOMMU_DOMAIN_NESTED;
 	ndom->viommu = aviommu;
 
+	/*
+	 * Normally, when a guest has multiple pass-through devices,
+	 * the IOMMU driver sets up DTEs with the same stage-2 table and
+	 * uses the same host domain ID (hDomID). With nested translation,
+	 * if the guest sets up different stage-1 tables with the same PASID,
+	 * the IOMMU would use the same TLB tag, which results in a TLB
+	 * aliasing issue.
+	 *
+	 * The guest assigns gDomIDs based on its own algorithm for managing
+	 * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
+	 * (w/ S2 table) is used by all DTEs, but each gDomID must be consistently mapped
+	 * to a single hDomID. This is done using an xarray in the nest parent domain to
+	 * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
+	 * command must be issued for each hDomID in the xarray.
+	 *
+	 * Since there is no invalidation support and no viommu yet, just always use a
+	 * unique hDomID for now.
+	 */
+	curr = xa_cmpxchg(&aviommu->parent->gdomid_array,
+			  ndom->gdom_id, NULL, ndom, GFP_ATOMIC);
+	if (curr) {
+		if (xa_err(curr)) {
+			ret = -EINVAL;
+		} else {
+			/* The gDomID already exists */
+			pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
+				 __func__, ndom->gdom_id, ndom->hdom_id);
+			ret = -EBUSY;
+		}
+		goto out_err;
+	}
+
+	ndom->hdom_id = amd_iommu_pdom_id_alloc();
+	if (ndom->hdom_id <= 0) {
+		xa_cmpxchg(&aviommu->parent->gdomid_array,
+			   ndom->gdom_id, ndom, NULL, GFP_ATOMIC);
+		ret = -ENOSPC;
+		goto out_err;
+	}
+
+	pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n",
+		 __func__, ndom->gdom_id, ndom->hdom_id);
+
 	return &ndom->domain;
 out_err:
 	kfree(ndom);
@@ -101,7 +144,15 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 static void nested_domain_free(struct iommu_domain *dom)
 {
 	struct nested_domain *ndom = to_ndomain(dom);
+	struct amd_iommu_viommu *aviommu = ndom->viommu;
+
+	pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n",
+		__func__, ndom->gdom_id, ndom->hdom_id);
+
+	xa_cmpxchg(&aviommu->parent->gdomid_array,
+		   ndom->gdom_id, ndom, NULL, GFP_ATOMIC);
 
+	amd_iommu_pdom_id_free(ndom->hdom_id);
 	kfree(ndom);
 }
 
-- 
2.34.1
Re: [PATCH v4 13/16] iommu/amd: Track host Domain ID mapping for each guest Domain ID
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Tue, Oct 21, 2025 at 01:43:21AM +0000, Suravee Suthikulpanit wrote:
> Each nested domain is assigned guest domain ID (gDomID), which guest OS
> programs into guest Device Table Entry (gDTE). For each gDomID, the driver
> assigns a corresponding host domain ID (hDomID), which will be programmed
> into the host Device Table Entry (hDTE).
> 
> The gDTE to hDTE 1:1 mapping is stored in the nest parent domain using
> an xarray (struct protection_domain.gdomid_array). When invalidate the
> nest parent domain, the INVALIDATE_IOMMU_PAGES must be issued for each
> hDomID in the gdomid_array.

I think this should be stored in the viommu..

It is a small, unrealistic detail, but very pedantically the API allows
creating two VIOMMUs from the same NEST PARENT domain, and if someone
did this then each VIOMMU should have its own private gDomID
number space and its own separate xarray.

Allowing two VIOMMUs to share the same hDomID could be problematic
because we don't know the PASID layout is consistent.

> +static int iommu_flush_hdom_ids(struct amd_iommu *iommu,
> +				u64 address, size_t size,
> +				struct protection_domain *parent)
> +{
> +	int ret = 0;
> +	unsigned long i;
> +	struct iommu_cmd cmd;
> +	struct nested_domain *ndom;
> +
> +	xa_for_each(&parent->gdomid_array, i, ndom) {

This doesn't seem right.. There could be many nested_domains sharing
the same gDomID..

I expect this xarray to have a struct like

struct gdomid {
   refcount_t users;
   u32 hdomid;
};

And each nested_domain will go into the viommu and either allocate a
new gdomid or ++users for the existing one. The inverse happens when
destroying a nested_domain.
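
For illustration, here is a minimal sketch of that refcounting scheme, assuming
the xarray moves into struct amd_iommu_viommu. The gdomid_array field and the
gdomid_get()/gdomid_put() helpers are hypothetical names, not part of this
series, and callers are assumed to serialize against each other (e.g. under a
per-viommu mutex):

struct gdomid {
	refcount_t users;
	u32 hdomid;
};

/* Find the entry for this gDomID, allocating a fresh hDomID on first use. */
static struct gdomid *gdomid_get(struct amd_iommu_viommu *aviommu, u32 gdom_id)
{
	struct gdomid *g;

	g = xa_load(&aviommu->gdomid_array, gdom_id);
	if (g) {
		refcount_inc(&g->users);
		return g;
	}

	g = kzalloc(sizeof(*g), GFP_KERNEL);
	if (!g)
		return ERR_PTR(-ENOMEM);

	g->hdomid = amd_iommu_pdom_id_alloc();
	if (!g->hdomid) {
		kfree(g);
		return ERR_PTR(-ENOSPC);
	}
	refcount_set(&g->users, 1);

	if (xa_err(xa_store(&aviommu->gdomid_array, gdom_id, g, GFP_KERNEL))) {
		amd_iommu_pdom_id_free(g->hdomid);
		kfree(g);
		return ERR_PTR(-ENOMEM);
	}
	return g;
}

/* Drop a reference; release the hDomID once the last user is gone. */
static void gdomid_put(struct amd_iommu_viommu *aviommu, u32 gdom_id,
		       struct gdomid *g)
{
	if (refcount_dec_and_test(&g->users)) {
		xa_erase(&aviommu->gdomid_array, gdom_id);
		amd_iommu_pdom_id_free(g->hdomid);
		kfree(g);
	}
}

amd_iommu_alloc_domain_nested() would then call gdomid_get() and record the
returned hdomid in the nested_domain, and nested_domain_free() would call
gdomid_put().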

> @@ -92,6 +92,49 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
>  	ndom->domain.type = IOMMU_DOMAIN_NESTED;
>  	ndom->viommu = aviommu;
>  
> +	/*
> +	 * Normally, when a guest has multiple pass-through devices,
> +	 * the IOMMU driver setup DTEs with the same stage-2 table and
> +	 * use the same host domain ID (hDomId). In case of nested translation,
> +	 * if the guest setup different stage-1 tables with same PASID,
> +	 * IOMMU would use the same TLB tag. This will results in TLB
> +	 * aliasing issue.
> +	 *
> +	 * The guest is assigning gDomIDs based on its own algorithm for managing
> +	 * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
> +	 * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID
> +	 * to a single hDomID. This is done using an xarray in the nest parent domain to
> +	 * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
> +	 * command must be issued for each hDomID in the xarray.
> +	 *
> +	 * Since there is no invalidation support and no viommu yet, just always use a
> +	 * unique hDomID for now.

It is not "for now" anymore, this is the correct algorithm..

Jason
Re: [PATCH v4 13/16] iommu/amd: Track host Domain ID mapping for each guest Domain ID
Posted by Suthikulpanit, Suravee 3 months ago
Jason

On 10/23/2025 3:08 AM, Jason Gunthorpe wrote:
> On Tue, Oct 21, 2025 at 01:43:21AM +0000, Suravee Suthikulpanit wrote:
>> Each nested domain is assigned guest domain ID (gDomID), which guest OS
>> programs into guest Device Table Entry (gDTE). For each gDomID, the driver
>> assigns a corresponding host domain ID (hDomID), which will be programmed
>> into the host Device Table Entry (hDTE).
>>
>> The gDTE to hDTE 1:1 mapping is stored in the nest parent domain using
>> an xarray (struct protection_domain.gdomid_array). When invalidate the
>> nest parent domain, the INVALIDATE_IOMMU_PAGES must be issued for each
>> hDomID in the gdomid_array.
> 
> I think this should be stored in the viommu..
> 
> It is a small unrealistic detail but very pedantically the API allows
> creating two VIOMMU's from the same NEST PARENT domain and if someone
> did this then each of the VIOMMU should have its own private gDomID
> number space and own separated xarray.

Actually, to support nested translation w/ HW-based vIOMMU support in
the guest w/ two VFIO devices on two different physical IOMMUs, it would
require setting up two iommufd_viommu structures (one for each IOMMU)
that share the same parent domain (single GPA-SPA mapping). Also, the AMD
HW-vIOMMU uses a single domain ID (gDomID-to-hDomID) mapping table per
guest ID. Since the table is indexed using gDomID, it requires a single
gDomID space per guest.

In this case, it makes more sense to store the gDomID-to-hDomID mapping
in the parent domain since:

    1. There is one gDomID space per guest and there is one parent
domain per guest.

    2. When the host issues an invalidation for a parent domain, the IOMMU
driver needs to issue an invalidate command for each hDomID used for the
same parent domain (on each IOMMU). We can't do this if we store the xarray
in the viommu. Otherwise, we would need to store a list of vIOMMUs per
parent domain (see the sketch below).
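
To illustrate the point, with a per-viommu xarray the flush path from this
patch (iommu_flush_hdom_ids()) would need to walk every vIOMMU built on top
of the parent, roughly as sketched below. The viommu_list/parent_node fields
are hypothetical and only shown to make the dependency visible:

static int iommu_flush_hdom_ids(struct amd_iommu *iommu,
				u64 address, size_t size,
				struct protection_domain *parent)
{
	int ret = 0;
	unsigned long i;
	struct iommu_cmd cmd;
	struct nested_domain *ndom;
	struct amd_iommu_viommu *aviommu;

	/* Hypothetical: each nest parent keeps a list of the vIOMMUs using it. */
	list_for_each_entry(aviommu, &parent->viommu_list, parent_node) {
		xa_for_each(&aviommu->gdomid_array, i, ndom) {
			build_inv_iommu_pages(&cmd, address, size, ndom->hdom_id,
					      IOMMU_NO_PASID, false);
			ret |= iommu_queue_command(iommu, &cmd);
		}
	}
	return ret;
}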

> Allowing two VIOMMUs to share the same hDomID could be problematic
> because we don't know the PASID layout is consistent.

Not sure why PASID layout matters here?
>> +static int iommu_flush_hdom_ids(struct amd_iommu *iommu,
>> +				u64 address, size_t size,
>> +				struct protection_domain *parent)
>> +{
>> +	int ret = 0;
>> +	unsigned long i;
>> +	struct iommu_cmd cmd;
>> +	struct nested_domain *ndom;
>> +
>> +	xa_for_each(&parent->gdomid_array, i, ndom) {
> 
> This doesn't seem right.. There could be many nested_domains sharing
> the same gDomID..
> 
> I expect this xarray to have a struct like
> 
> struct gdomid {
>     refcount_t users;
>     u32 hdomid;
> };
> 
> And each nested_domain will go into the viommu and either allocate a
> new gdomid or ++users for the existing one. Inverse when destroying a
> nested_domain.

Got it. I have new code for this and will send it out in v5 soon.

Thanks,
Suravee
Re: [PATCH v4 13/16] iommu/amd: Track host Domain ID mapping for each guest Domain ID
Posted by Jason Gunthorpe 3 months ago
On Wed, Nov 05, 2025 at 05:50:44PM +0700, Suthikulpanit, Suravee wrote:
> Jason
> 
> On 10/23/2025 3:08 AM, Jason Gunthorpe wrote:
> > On Tue, Oct 21, 2025 at 01:43:21AM +0000, Suravee Suthikulpanit wrote:
> > > Each nested domain is assigned guest domain ID (gDomID), which guest OS
> > > programs into guest Device Table Entry (gDTE). For each gDomID, the driver
> > > assigns a corresponding host domain ID (hDomID), which will be programmed
> > > into the host Device Table Entry (hDTE).
> > > 
> > > The gDTE to hDTE 1:1 mapping is stored in the nest parent domain using
> > > an xarray (struct protection_domain.gdomid_array). When invalidate the
> > > nest parent domain, the INVALIDATE_IOMMU_PAGES must be issued for each
> > > hDomID in the gdomid_array.
> > 
> > I think this should be stored in the viommu..
> > 
> > It is a small unrealistic detail but very pedantically the API allows
> > creating two VIOMMU's from the same NEST PARENT domain and if someone
> > did this then each of the VIOMMU should have its own private gDomID
> > number space and own separated xarray.
> 
> Actually, to support nested translation w/ HW-based vIOMMU support in the
> guest w/ two VFIO devices on two different physical IOMMUs, it would require
> setting up two iommufd_viommu structures (one for each IOMMU) and share the
> same parent domain (single GPA-SPA mapping). Also, AMD HW-vIOMMU use a
> single domain ID (gDomID-to-hDomID) mapping table per guest-ID. Since the
> table is indexed using gDomID, it requires single gDomID space per guest.

Yes, this is why the guest ID needs to come from the viommu object,
not the parent domain.

> In this case, it makes more sense to store the gDomID-to-hDomID mapping in
> the parent domain since:
> 
>    1. There is one gDomID-space per guest and there is one parent domain per
> guest.

No. There is one gDomID space *per viommu*. The driver has no concept
of what a guest is; that isn't part of our API.

For AMD hardware it is the viommu object that must hold the gDomID,
and all the other related HW datastructures for a "guest".

One viommu object per what the AMD spec calls a "guest".

>    2. When host issues invalidation for a parent domain, IOMMU driver needs
> to issue an invalidate command for each hDomId used for the same parent
> domain (on each IOMMU). We can't do this if we store xarray in the viommu.
> Otherwise, we would need to store a list of vIOMMUs per parent domain.

Correct, however this is just showing that the driver invalidation
logic is incorrect. See how Nicolin is working to fix this same
problem in the ARM driver. The driver cannot assume a domain has only
a single domain id in a viommu world.

For now you should still store the xarray in the viommu and then
prevent the parent domain from being used with more than one viommu
somehow so it doesn't get into trouble with the limited invalidation
logic.
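
If the xarray stays in the viommu, one way to express that restriction is a
small guard in the viommu allocation path; this is only a sketch, the
owner_viommu field and the helper name are hypothetical, and the lock is
whatever already serializes updates to the parent domain:

/*
 * Record the first vIOMMU created on top of a nest parent and refuse a
 * second one until the invalidation logic can handle multiple gDomID
 * spaces per parent domain.
 */
static int amd_viommu_claim_parent(struct protection_domain *parent,
				   struct amd_iommu_viommu *aviommu)
{
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&parent->lock, flags);
	if (parent->owner_viommu)
		ret = -EBUSY;
	else
		parent->owner_viommu = aviommu;
	spin_unlock_irqrestore(&parent->lock, flags);

	return ret;
}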

Someday maybe we can fix the invalidation logic and allow domain
sharing. But for now the code should be structured around its own
limitations..

> > Allowing two VIOMMUs to share the same hDomID could be problematic
> > because we don't know the PASID layout is consistent.
> 
> Not sure why PASID layout matters here?

I'm saying you cannot share an hDomID across two VIOMMU objects; it is
not allowed.

Jason