[PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE

Suravee Suthikulpanit posted 16 patches 3 months, 3 weeks ago
Only 15 patches received!
There is a newer version of this series
[PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE
Posted by Suravee Suthikulpanit 3 months, 3 weeks ago
Introduce the set_dte_v1() helper function to configure IOMMU host (v1)
page table into DTE.

There is no functional change.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/amd_iommu.h |  3 ++
 drivers/iommu/amd/iommu.c     | 57 ++++++++++++++++++++---------------
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index cfb63de7732a..5e61fdb2c6c0 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -190,6 +190,9 @@ struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
 int amd_iommu_completion_wait(struct amd_iommu *iommu);
 
 /* DTE */
+void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
+			  struct protection_domain *domain,
+			  struct dev_table_entry *new);
 int amd_iommu_device_flush_dte(struct iommu_dev_data *dev_data);
 void amd_iommu_update_dte256(struct amd_iommu *iommu,
 			     struct iommu_dev_data *dev_data,
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index e3330f3b8c14..428008cff06a 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2041,16 +2041,12 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
  * Note:
  * The old value for GCR3 table and GPT have been cleared from caller.
  */
-static void set_dte_gcr3_table(struct amd_iommu *iommu,
-			       struct iommu_dev_data *dev_data,
+static void set_dte_gcr3_table(struct iommu_dev_data *dev_data,
 			       struct dev_table_entry *target)
 {
 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
 	u64 gcr3;
 
-	if (!gcr3_info->gcr3_tbl)
-		return;
-
 	pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
 		 __func__, dev_data->devid, gcr3_info->glx,
 		 (unsigned long long)gcr3_info->gcr3_tbl);
@@ -2071,6 +2067,26 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
 		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
 	else
 		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
+
+	/* Note: PRI is only supported w/ GCR3 table */
+	if (dev_data->ppr)
+		target->data[0] |= 1ULL << DEV_ENTRY_PPR;
+}
+
+void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
+			  struct protection_domain *domain,
+			  struct dev_table_entry *new)
+{
+	u64 htrp;
+
+	new->data[0] |= FIELD_PREP(DTE_MODE_MASK, domain->iop.mode);
+
+	htrp = FIELD_GET(GENMASK_ULL(51, 12), iommu_virt_to_phys(domain->iop.root));
+	new->data[0] |= FIELD_PREP(DTE_HOST_TRP, htrp);
+
+	/* Note Dirty tracking is used for v1 table only for now */
+	if (domain->dirty_tracking)
+		new->data[0] |= DTE_FLAG_HAD;
 }
 
 static void set_dte_entry(struct amd_iommu *iommu,
@@ -2088,37 +2104,28 @@ static void set_dte_entry(struct amd_iommu *iommu,
 	else
 		domid = domain->id;
 
-	amd_iommu_make_clear_dte(dev_data, &new);
-
-	if (domain->iop.mode != PAGE_MODE_NONE)
-		new.data[0] |= iommu_virt_to_phys(domain->iop.root);
-
-	new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
-		    << DEV_ENTRY_MODE_SHIFT;
-
-	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
-
 	/*
 	 * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
 	 * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
 	 * do_iommu_domain_alloc().
 	 */
 	WARN_ON(amd_iommu_snp_en && (domid == 0));
-	new.data[0] |= DTE_FLAG_TV;
 
-	if (dev_data->ppr)
-		new.data[0] |= 1ULL << DEV_ENTRY_PPR;
+	amd_iommu_make_clear_dte(dev_data, &new);
 
-	if (domain->dirty_tracking)
-		new.data[0] |= DTE_FLAG_HAD;
+	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
 
-	if (dev_data->ats_enabled)
-		new.data[1] |= DTE_FLAG_IOTLB;
+	if (gcr3_info && gcr3_info->gcr3_tbl)
+		set_dte_gcr3_table(dev_data, &new);
+	else if (domain->iop.mode != PAGE_MODE_NONE)
+		amd_iommu_set_dte_v1(dev_data, domain, &new);
 
-	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
-	new.data[1] |= domid;
+	/* Note: The IR, IW, TV, DOMID are needed for both v1 and gcr3 table */
+	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
+	new.data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid);
 
-	set_dte_gcr3_table(iommu, dev_data, &new);
+	if (dev_data->ats_enabled)
+		new.data[1] |= DTE_FLAG_IOTLB;
 
 	amd_iommu_update_dte256(iommu, dev_data, &new);
 
-- 
2.34.1
Re: [PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Tue, Oct 21, 2025 at 01:43:23AM +0000, Suravee Suthikulpanit wrote:
> @@ -2088,37 +2104,28 @@ static void set_dte_entry(struct amd_iommu *iommu,
>  	else
>  		domid = domain->id;
>  
>  	/*
>  	 * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
>  	 * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
>  	 * do_iommu_domain_alloc().
>  	 */
>  	WARN_ON(amd_iommu_snp_en && (domid == 0));
>  
> +	amd_iommu_make_clear_dte(dev_data, &new);
>  
> +	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;

This old_domid stuff doesn't make any sense. I think the commit that
added it is bonkers: 36b7200f67df ("iommu/amd: Flush old domains in kdump kernel")

The problem is that the dom ids that are present in the re-used DTE
table have to be reserved from the domain id alloctor at boot.

If the kdump kernel tries to create new DTEs it MUST NOT re-use any
IDs that are actively being using in DTEs already or you get data
corruption. Randomly flushing IDs is just getting lucky..

Not for this series, but something to think about.

> +	if (gcr3_info && gcr3_info->gcr3_tbl)
> +		set_dte_gcr3_table(dev_data, &new);
> +	else if (domain->iop.mode != PAGE_MODE_NONE)
> +		amd_iommu_set_dte_v1(dev_data, domain, &new);
>  
> +	/* Note: The IR, IW, TV, DOMID are needed for both v1 and gcr3 table */
> +	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
> +	new.data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid);
>  
> +	if (dev_data->ats_enabled)
> +		new.data[1] |= DTE_FLAG_IOTLB;

These three should be merged into the two functions so they stand
alone. These sets have to be made in the next patch, doesn't make
sense to open code them in callers.

Like this, it is simple and readable. It directly answers the question
'what bits are set when the driver creates this kind of DTE'

static void set_dte_gcr3_table(struct amd_iommu *iommu,
			       struct iommu_dev_data *dev_data,
			       struct dev_table_entry *target)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
	u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);

	target->data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV |
			   DTE_FLAG_GV | FIELD_PREP(DTE_GLX, gcr3_info->glx) |
			   FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) |
			   (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) |
			   (pdom_is_v2_pgtbl_mode(dev_data->domain) ?
				    target->data[0] |= DTE_FLAG_GIOV :
				    0);

	target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
			   FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31) |
			   FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid);

	/* Guest page table can only support 4 and 5 levels  */
	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
	else
		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
}

void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
			  struct protection_domain *domain,
			  struct dev_table_entry *new)
{
	u64 htrp = iommu_virt_to_phys(domain->iop.root);

	new->data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV |
			FIELD_PREP(DTE_MODE_MASK, domain->iop.mode) |
			FIELD_PREP(DTE_HOST_TRP, htrp >> 12) |
			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) |
			(domain->dirty_tracking ? DTE_FLAG_HAD : 0);
	new.data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id);
}

(It is nice to sort the fields by the spec order, I didn't do that)

Looks like an identity one is needed too:

void set_dte_identity(struct dev_table_entry *new)
{
	new->data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV |
			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0);
}

Then the whole function is pretty much just:

	amd_iommu_make_clear_dte(dev_data, &new);
	if (gcr3_info && gcr3_info->gcr3_tbl)
		set_dte_gcr3_table(dev_data, &new);
	else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
		set_dte_identity(&new)
	else if (domain->domain.type == IOMMU_DOMAIN_PAGING && io.mode == V1)
		amd_iommu_set_dte_v1(dev_data, domain, &new);
	else
		WARN_ON(true);

?

(though how does IDENTITY on a device with a PASID installed work?)

Jason
Re: [PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE
Posted by Vasant Hegde 3 months ago
Jason,


On 10/23/2025 6:38 PM, Jason Gunthorpe wrote:
> On Tue, Oct 21, 2025 at 01:43:23AM +0000, Suravee Suthikulpanit wrote:
>> @@ -2088,37 +2104,28 @@ static void set_dte_entry(struct amd_iommu *iommu,
>>  	else
>>  		domid = domain->id;
>>  
>>  	/*
>>  	 * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
>>  	 * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
>>  	 * do_iommu_domain_alloc().
>>  	 */
>>  	WARN_ON(amd_iommu_snp_en && (domid == 0));
>>  
>> +	amd_iommu_make_clear_dte(dev_data, &new);
>>  
>> +	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
> 
> This old_domid stuff doesn't make any sense. I think the commit that
> added it is bonkers: 36b7200f67df ("iommu/amd: Flush old domains in kdump kernel")
> 
> The problem is that the dom ids that are present in the re-used DTE
> table have to be reserved from the domain id alloctor at boot.
> > If the kdump kernel tries to create new DTEs it MUST NOT re-use any
> IDs that are actively being using in DTEs already or you get data
> corruption. Randomly flushing IDs is just getting lucky..

Good catch. Thanks!

Looks like commit 38e5f33ee359 ("iommu/amd: Reuse device table for kdump") broke
domain ID reservation in kdump path :-( We will fix it.


> 
> Not for this series, but something to think about.
> 
>> +	if (gcr3_info && gcr3_info->gcr3_tbl)
>> +		set_dte_gcr3_table(dev_data, &new);
>> +	else if (domain->iop.mode != PAGE_MODE_NONE)
>> +		amd_iommu_set_dte_v1(dev_data, domain, &new);
>>  
>> +	/* Note: The IR, IW, TV, DOMID are needed for both v1 and gcr3 table */
>> +	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
>> +	new.data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid);
>>  
>> +	if (dev_data->ats_enabled)
>> +		new.data[1] |= DTE_FLAG_IOTLB;
> 
> These three should be merged into the two functions so they stand
> alone. These sets have to be made in the next patch, doesn't make
> sense to open code them in callers.
> 
> Like this, it is simple and readable. It directly answers the question
> 'what bits are set when the driver creates this kind of DTE'
> 
> static void set_dte_gcr3_table(struct amd_iommu *iommu,
> 			       struct iommu_dev_data *dev_data,
> 			       struct dev_table_entry *target)
> {
> 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
> 	u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
> 
> 	target->data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV |
> 			   DTE_FLAG_GV | FIELD_PREP(DTE_GLX, gcr3_info->glx) |
> 			   FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) |
> 			   (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) |
> 			   (pdom_is_v2_pgtbl_mode(dev_data->domain) ?
> 				    target->data[0] |= DTE_FLAG_GIOV :
> 				    0);
> 
> 	target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
> 			   FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31) |
> 			   FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid);
> 
> 	/* Guest page table can only support 4 and 5 levels  */
> 	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
> 		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
> 	else
> 		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
> }
> 
> void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
> 			  struct protection_domain *domain,
> 			  struct dev_table_entry *new)
> {
> 	u64 htrp = iommu_virt_to_phys(domain->iop.root);
> 
> 	new->data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV |
> 			FIELD_PREP(DTE_MODE_MASK, domain->iop.mode) |
> 			FIELD_PREP(DTE_HOST_TRP, htrp >> 12) |
> 			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) |
> 			(domain->dirty_tracking ? DTE_FLAG_HAD : 0);
> 	new.data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id);
> }
> 
> (It is nice to sort the fields by the spec order, I didn't do that)
> 
> Looks like an identity one is needed too:
> 
> void set_dte_identity(struct dev_table_entry *new)
> {
> 	new->data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV |
> 			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0);
> }
> 
> Then the whole function is pretty much just:
> 
> 	amd_iommu_make_clear_dte(dev_data, &new);
> 	if (gcr3_info && gcr3_info->gcr3_tbl)
> 		set_dte_gcr3_table(dev_data, &new);
> 	else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
> 		set_dte_identity(&new)
> 	else if (domain->domain.type == IOMMU_DOMAIN_PAGING && io.mode == V1)
> 		amd_iommu_set_dte_v1(dev_data, domain, &new);
> 	else
> 		WARN_ON(true);
> 
> ?
> 
> (though how does IDENTITY on a device with a PASID installed work?)

Probably set_dte_identity() should call  set_dte_gcr3_table () and update
relevant fields.

-Vasant
Re: [PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE
Posted by Jason Gunthorpe 3 months ago
On Sat, Nov 08, 2025 at 10:56:38PM +0530, Vasant Hegde wrote:

> > On Tue, Oct 21, 2025 at 01:43:23AM +0000, Suravee Suthikulpanit wrote:
> >> @@ -2088,37 +2104,28 @@ static void set_dte_entry(struct amd_iommu *iommu,
> >>  	else
> >>  		domid = domain->id;
> >>  
> >>  	/*
> >>  	 * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
> >>  	 * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
> >>  	 * do_iommu_domain_alloc().
> >>  	 */
> >>  	WARN_ON(amd_iommu_snp_en && (domid == 0));
> >>  
> >> +	amd_iommu_make_clear_dte(dev_data, &new);
> >>  
> >> +	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
> > 
> > This old_domid stuff doesn't make any sense. I think the commit that
> > added it is bonkers: 36b7200f67df ("iommu/amd: Flush old domains in kdump kernel")
> > 
> > The problem is that the dom ids that are present in the re-used DTE
> > table have to be reserved from the domain id alloctor at boot.
> > > If the kdump kernel tries to create new DTEs it MUST NOT re-use any
> > IDs that are actively being using in DTEs already or you get data
> > corruption. Randomly flushing IDs is just getting lucky..
> 
> Good catch. Thanks!
> 
> Looks like commit 38e5f33ee359 ("iommu/amd: Reuse device table for kdump") broke
> domain ID reservation in kdump path :-( We will fix it.

I suggest just reading the whole DTE and permanently reserving any
DomIDs from the ida. Just leak them forever, that is good enough for a
kdump kernel. Then get rid of this stuff above.

> > (though how does IDENTITY on a device with a PASID installed work?)
> 
> Probably set_dte_identity() should call  set_dte_gcr3_table () and update
> relevant fields.

Yeah, maybe so, though it might be best if the caller can detect it
and have it call both functions instead of trying to bury it.

There is a fair number of variations here as you have
blocked/identity, RID/PASID attach and PASID detach to all consider.

It would be nice if it worked properly, identity on RID while PASID is
in use has a strong real use case.

Jason
Re: [PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE
Posted by Suthikulpanit, Suravee 2 months, 4 weeks ago

On 11/9/2025 6:03 AM, Jason Gunthorpe wrote:
> On Sat, Nov 08, 2025 at 10:56:38PM +0530, Vasant Hegde wrote:
>> On 10/23/2025 6:38 PM, Jason Gunthorpe wrote: 
>>> On Tue, Oct 21, 2025 at 01:43:23AM +0000, Suravee Suthikulpanit wrote:
>>>
>>> (though how does IDENTITY on a device with a PASID installed work?)
>>
>> Probably set_dte_identity() should call  set_dte_gcr3_table () and update
>> relevant fields.

Actually, PASID would not work with IDENTITY since it has no page table 
(i.e. iommu=pt means DTE[Mode]=0 and does not have host table pointer).
PASID only work with GCR3 table.

Therefore, it does not make sense for set_dte_identity() to call 
set_dte_gcr3_table(). Each one should be stand alone.

Thanks,
Suravee

> Yeah, maybe so, though it might be best if the caller can detect it
> and have it call both functions instead of trying to bury it.
> 
> There is a fair number of variations here as you have
> blocked/identity, RID/PASID attach and PASID detach to all consider.
> 
> It would be nice if it worked properly, identity on RID while PASID is
> in use has a strong real use case.
> 
> Jason
Re: [PATCH v4 15/16] iommu/amd: Refactor logic to program the host page table in DTE
Posted by Jason Gunthorpe 2 months, 3 weeks ago
On Thu, Nov 13, 2025 at 01:40:47AM +0700, Suthikulpanit, Suravee wrote:
> 
> 
> On 11/9/2025 6:03 AM, Jason Gunthorpe wrote:
> > On Sat, Nov 08, 2025 at 10:56:38PM +0530, Vasant Hegde wrote:
> > > On 10/23/2025 6:38 PM, Jason Gunthorpe wrote:
> > > > On Tue, Oct 21, 2025 at 01:43:23AM +0000, Suravee Suthikulpanit wrote:
> > > > 
> > > > (though how does IDENTITY on a device with a PASID installed work?)
> > > 
> > > Probably set_dte_identity() should call  set_dte_gcr3_table () and update
> > > relevant fields.
> 
> Actually, PASID would not work with IDENTITY since it has no page table
> (i.e. iommu=pt means DTE[Mode]=0 and does not have host table pointer).
> PASID only work with GCR3 table.
> 
> Therefore, it does not make sense for set_dte_identity() to call
> set_dte_gcr3_table(). Each one should be stand alone.

OK, so the HW cannot do this?

On SMMUv3 there are bits in the "DTE" (S1DSS) that control how
no-PASID, ie PASID 0 transactions are handled which allow it to be set
to indentity. Intel can do the same.

IMHO this is a HW gap that AMD probably should fix.

In the mean time the driver should be blocking this unsupported
combination, I don't remember seeing that code? It will make it more
difficult to use SVA on AMD platforms, but not much that can be done
about that :\

Jason