iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

[PATCH] iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

Posted by Suravee Suthikulpanit 1 year, 5 months ago

The current implementation does not follow the 128-bit write
requirement to update DTE as specified in the AMD I/O Virtualization
Techonology (IOMMU) Specification.

In addition, the function is used to program several DTE fields
(e.g. stage1 table, stage2 table, domain id, and etc.), which is
difficult to keep track with current implementation.

Therefore, introduce new a new dte256_t data type and a helper function
update_dte_256(), which uses two try_cmpxchg128 operations to update
256-bit DTE.

Also, separate logic for setting up the GCR3 Table Root Pointer, GIOV, GV,
GLX, and GuestPagingMode into another helper function set_dte_gcr3_table().

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/amd_iommu_types.h |  17 ++++
 drivers/iommu/amd/iommu.c           | 143 +++++++++++++++++-----------
 2 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index c9f9a598eb82..295138447476 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -886,6 +886,23 @@ struct dev_table_entry {
 	u64 data[4];
 };
 
+struct dte256 {
+	union {
+		struct {
+			u64 lo;
+			u64 hi;
+		};
+		u128 data;
+	} qw_lo;
+	union {
+		struct {
+			u64 lo;
+			u64 hi;
+		};
+		u128 data;
+	} qw_hi;
+};
+
 /*
  * One entry for unity mappings parsed out of the ACPI table.
  */
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 87c5385ce3f2..189f65af45fe 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1851,90 +1851,127 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
 	return ret;
 }
 
+static void set_dte_gcr3_table(struct amd_iommu *iommu,
+			       struct iommu_dev_data *dev_data,
+			       struct dte256 *target)
+{
+	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+	u64 tmp, gcr3;
+
+	if (!gcr3_info->gcr3_tbl)
+		return;
+
+	pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
+		 __func__, dev_data->devid, gcr3_info->glx,
+		 (unsigned long long)gcr3_info->gcr3_tbl);
+
+	tmp = gcr3_info->glx;
+	target->qw_lo.lo |= (tmp & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+	if (pdom_is_v2_pgtbl_mode(dev_data->domain))
+		target->qw_lo.lo |= DTE_FLAG_GIOV;
+	target->qw_lo.lo |= DTE_FLAG_GV;
+
+	/* First mask out possible old values for GCR3 table */
+	tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
+	target->qw_lo.lo &= ~tmp;
+	tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+	tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+	target->qw_lo.hi &= ~tmp;
+
+	gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+
+	/* Encode GCR3 table into DTE */
+	tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+	target->qw_lo.lo |= tmp;
+	tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+	tmp |= DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+	target->qw_lo.hi |= tmp;
+
+	/* Mask out old values for GuestPagingMode */
+	target->qw_hi.lo &= ~(0x3ULL << DTE_GPT_LEVEL_SHIFT);
+	/* Guest page table can only support 4 and 5 levels  */
+	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+		target->qw_hi.lo |= ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
+}
+
+static void update_dte256(struct amd_iommu *iommu, u16 devid, struct dte256 *new)
+{
+	struct dev_table_entry *dev_table = get_dev_table(iommu);
+	struct dte256 *ptr = (struct dte256 *)&dev_table[devid];
+	struct dte256 old = {
+		.qw_lo.data = ptr->qw_lo.data,
+		.qw_hi.data = ptr->qw_hi.data,
+	};
+
+	/* Update qw_lo */
+	if (!try_cmpxchg128(&ptr->qw_lo.data, &old.qw_lo.data, new->qw_lo.data))
+		goto err_out;
+
+	/* Update qw_hi */
+	if (!try_cmpxchg128(&ptr->qw_hi.data, &old.qw_hi.data, new->qw_hi.data)) {
+		/* Restore qw_lo */
+		try_cmpxchg128(&ptr->qw_lo.data, &new->qw_lo.data, old.qw_lo.data);
+		goto err_out;
+	}
+	return;
+err_out:
+	pr_err("%s: Failed to update DTE for devid %#x\n", __func__, devid);
+}
+
 static void set_dte_entry(struct amd_iommu *iommu,
 			  struct iommu_dev_data *dev_data)
 {
-	u64 pte_root = 0;
-	u64 flags = 0;
-	u32 old_domid;
-	u16 devid = dev_data->devid;
 	u16 domid;
+	struct dte256 new = { .qw_lo.data = 0, .qw_hi.data = 0 };
+	u16 devid = dev_data->devid;
 	struct protection_domain *domain = dev_data->domain;
 	struct dev_table_entry *dev_table = get_dev_table(iommu);
 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+	u32 old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
 
-	if (gcr3_info && gcr3_info->gcr3_tbl)
+	if (gcr3_info->gcr3_tbl)
 		domid = dev_data->gcr3_info.domid;
 	else
 		domid = domain->id;
 
+	/*
+	 * Need to get the current value in dte[1,2] because they contain
+	 * interrupt-remapping settings, which has been programmed earlier.
+	 */
+	new.qw_lo.hi = dev_table[devid].data[1];
+	new.qw_hi.lo = dev_table[devid].data[2];
+	new.qw_hi.hi = dev_table[devid].data[3];
+
 	if (domain->iop.mode != PAGE_MODE_NONE)
-		pte_root = iommu_virt_to_phys(domain->iop.root);
+		new.qw_lo.lo = iommu_virt_to_phys(domain->iop.root);
 
-	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
+	new.qw_lo.lo |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
 		    << DEV_ENTRY_MODE_SHIFT;
 
-	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
+	new.qw_lo.lo |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
 
 	/*
 	 * When SNP is enabled, Only set TV bit when IOMMU
 	 * page translation is in use.
 	 */
 	if (!amd_iommu_snp_en || (domid != 0))
-		pte_root |= DTE_FLAG_TV;
-
-	flags = dev_table[devid].data[1];
+		new.qw_lo.lo |= DTE_FLAG_TV;
 
 	if (dev_data->ats_enabled)
-		flags |= DTE_FLAG_IOTLB;
+		new.qw_lo.hi |= DTE_FLAG_IOTLB;
 
 	if (dev_data->ppr)
-		pte_root |= 1ULL << DEV_ENTRY_PPR;
+		new.qw_lo.lo |= 1ULL << DEV_ENTRY_PPR;
 
 	if (domain->dirty_tracking)
-		pte_root |= DTE_FLAG_HAD;
-
-	if (gcr3_info && gcr3_info->gcr3_tbl) {
-		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
-		u64 glx  = gcr3_info->glx;
-		u64 tmp;
+		new.qw_lo.lo |= DTE_FLAG_HAD;
 
-		pte_root |= DTE_FLAG_GV;
-		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
-
-		/* First mask out possible old values for GCR3 table */
-		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
-		flags    &= ~tmp;
-
-		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
-		flags    &= ~tmp;
-
-		/* Encode GCR3 table into DTE */
-		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
-		pte_root |= tmp;
-
-		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
-		flags    |= tmp;
-
-		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
-		flags    |= tmp;
-
-		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
-			dev_table[devid].data[2] |=
-				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
-		}
-
-		/* GIOV is supported with V2 page table mode only */
-		if (pdom_is_v2_pgtbl_mode(domain))
-			pte_root |= DTE_FLAG_GIOV;
-	}
+	new.qw_lo.hi &= ~DEV_DOMID_MASK;
+	new.qw_lo.hi |= domid;
 
-	flags &= ~DEV_DOMID_MASK;
-	flags |= domid;
+	set_dte_gcr3_table(iommu, dev_data, &new);
 
-	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
-	dev_table[devid].data[1]  = flags;
-	dev_table[devid].data[0]  = pte_root;
+	update_dte256(iommu, devid, &new);
 
 	/*
 	 * A kdump kernel might be replacing a domain ID that was copied from
-- 
2.34.1

Re: [PATCH] iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

Posted by Uros Bizjak 1 year, 5 months ago


On 19. 08. 24 18:18, Suravee Suthikulpanit wrote:
> The current implementation does not follow the 128-bit write
> requirement to update DTE as specified in the AMD I/O Virtualization
> Techonology (IOMMU) Specification.
> 
> In addition, the function is used to program several DTE fields
> (e.g. stage1 table, stage2 table, domain id, and etc.), which is
> difficult to keep track with current implementation.
> 
> Therefore, introduce new a new dte256_t data type and a helper function
> update_dte_256(), which uses two try_cmpxchg128 operations to update
> 256-bit DTE.
> 
> Also, separate logic for setting up the GCR3 Table Root Pointer, GIOV, GV,
> GLX, and GuestPagingMode into another helper function set_dte_gcr3_table().
> 
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

[...]

> +static void update_dte256(struct amd_iommu *iommu, u16 devid, struct dte256 *new)
> +{
> +	struct dev_table_entry *dev_table = get_dev_table(iommu);
> +	struct dte256 *ptr = (struct dte256 *)&dev_table[devid];
> +	struct dte256 old = {
> +		.qw_lo.data = ptr->qw_lo.data,
> +		.qw_hi.data = ptr->qw_hi.data,
> +	};
> +
> +	/* Update qw_lo */
> +	if (!try_cmpxchg128(&ptr->qw_lo.data, &old.qw_lo.data, new->qw_lo.data))
> +		goto err_out;
> +
> +	/* Update qw_hi */
> +	if (!try_cmpxchg128(&ptr->qw_hi.data, &old.qw_hi.data, new->qw_hi.data)) {
> +		/* Restore qw_lo */
> +		try_cmpxchg128(&ptr->qw_lo.data, &new->qw_lo.data, old.qw_lo.data);

You should use plain cmpxchg128() when the result is unused.

Uros.

Re: [PATCH] iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

Posted by Suthikulpanit, Suravee 1 year, 5 months ago


On 8/20/2024 1:15 AM, Uros Bizjak wrote:
> 
> 
> On 19. 08. 24 18:18, Suravee Suthikulpanit wrote:
>> The current implementation does not follow the 128-bit write
>> requirement to update DTE as specified in the AMD I/O Virtualization
>> Techonology (IOMMU) Specification.
>>
>> In addition, the function is used to program several DTE fields
>> (e.g. stage1 table, stage2 table, domain id, and etc.), which is
>> difficult to keep track with current implementation.
>>
>> Therefore, introduce new a new dte256_t data type and a helper function
>> update_dte_256(), which uses two try_cmpxchg128 operations to update
>> 256-bit DTE.
>>
>> Also, separate logic for setting up the GCR3 Table Root Pointer, GIOV, 
>> GV,
>> GLX, and GuestPagingMode into another helper function 
>> set_dte_gcr3_table().
>>
>> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> 
> [...]
> 
>> +static void update_dte256(struct amd_iommu *iommu, u16 devid, struct 
>> dte256 *new)
>> +{
>> +    struct dev_table_entry *dev_table = get_dev_table(iommu);
>> +    struct dte256 *ptr = (struct dte256 *)&dev_table[devid];
>> +    struct dte256 old = {
>> +        .qw_lo.data = ptr->qw_lo.data,
>> +        .qw_hi.data = ptr->qw_hi.data,
>> +    };
>> +
>> +    /* Update qw_lo */
>> +    if (!try_cmpxchg128(&ptr->qw_lo.data, &old.qw_lo.data, 
>> new->qw_lo.data))
>> +        goto err_out;
>> +
>> +    /* Update qw_hi */
>> +    if (!try_cmpxchg128(&ptr->qw_hi.data, &old.qw_hi.data, 
>> new->qw_hi.data)) {
>> +        /* Restore qw_lo */
>> +        try_cmpxchg128(&ptr->qw_lo.data, &new->qw_lo.data, 
>> old.qw_lo.data);
> 
> You should use plain cmpxchg128() when the result is unused.
> 
> Uros.

I will update in v2.

Thanks,
Suravee

Re: [PATCH] iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

Posted by Jason Gunthorpe 1 year, 5 months ago

On Mon, Aug 19, 2024 at 04:18:39PM +0000, Suravee Suthikulpanit wrote:

> +struct dte256 {

[..]

This would be alot better as just

struct dte {
 union {
     u64 data[4];
     u128 data128[2];
 };
};

And don't make a new type. IIRC the cmpxchg likes to have alignment
and the u128 provides that..

> +static void update_dte256(struct amd_iommu *iommu, u16 devid, struct dte256 *new)
> +{
> +	struct dev_table_entry *dev_table = get_dev_table(iommu);
> +	struct dte256 *ptr = (struct dte256 *)&dev_table[devid];
> +	struct dte256 old = {
> +		.qw_lo.data = ptr->qw_lo.data,
> +		.qw_hi.data = ptr->qw_hi.data,
> +	};
> +
> +	/* Update qw_lo */
> +	if (!try_cmpxchg128(&ptr->qw_lo.data, &old.qw_lo.data, new->qw_lo.data))
> +		goto err_out;
> +
> +	/* Update qw_hi */
> +	if (!try_cmpxchg128(&ptr->qw_hi.data, &old.qw_hi.data, new->qw_hi.data)) {
> +		/* Restore qw_lo */
> +		try_cmpxchg128(&ptr->qw_lo.data, &new->qw_lo.data, old.qw_lo.data);
> +		goto err_out;
> +	}

I don't think this is going to work like this, the interrupt remapping
code can asynchronously change the values (it is an existing race bug)
and if it races that will cause these to fail too, and this doesn't
handle that case at all.

IMHO the simple solution would hold the spinlock and transfer the
interrupt remapping fields then directly write them without a failable
cmpxchg. Maybe you could also use a cmpxchg loop to transfer the interrupt
remapping bits without a lock..

Can you assume cmpxchg 128 is available on all CPUs that have the
iommu? I see google says some early AMD 64 bit CPUs don't have it? Do
they have iommu's? I had wondered if the doc intended that some 128
bit SSE/MMX store store would be used here??

If cmpxchg128 is safe checking that cap and refusing to probe the
iommu driver would be appropriate. Otherwise a 64 bit flow still has
to work?

Finally, the order of programming the two 128s depends on what you are
programming. The first 128 contains the valid bit so you shouldn't be
writing the second 128 after, and vice versa. This will make
undesirable races around Guest Paging Mode and eventually viommuen at
least.

If you have to solve the 64 bit case too, then you really should use
the programmer from SMMUv3. Everyone seems to need this logic and it
should be generalized. I can help you do it.. I have a patch to make
that logic use 128 bit stores too someplace.

Otherwise you can possibly open code the 128 bit path with some more
care to check the valid bits and set the order, plus the interrupt
remapping locking.

Jason

Re: [PATCH] iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

Posted by Suthikulpanit, Suravee 1 year, 5 months ago

Hi Jason,

On 8/20/2024 12:09 AM, Jason Gunthorpe wrote:
> On Mon, Aug 19, 2024 at 04:18:39PM +0000, Suravee Suthikulpanit wrote:
> 
>> +struct dte256 {
> 
> [..]
> 
> This would be alot better as just
> 
> struct dte {
>   union {
>       u64 data[4];
>       u128 data128[2];
>   };
> };
> 
> And don't make a new type. IIRC the cmpxchg likes to have alignment
> and the u128 provides that..

OK.

>> +static void update_dte256(struct amd_iommu *iommu, u16 devid, struct dte256 *new)
>> +{
>> +	struct dev_table_entry *dev_table = get_dev_table(iommu);
>> +	struct dte256 *ptr = (struct dte256 *)&dev_table[devid];
>> +	struct dte256 old = {
>> +		.qw_lo.data = ptr->qw_lo.data,
>> +		.qw_hi.data = ptr->qw_hi.data,
>> +	};
>> +
>> +	/* Update qw_lo */
>> +	if (!try_cmpxchg128(&ptr->qw_lo.data, &old.qw_lo.data, new->qw_lo.data))
>> +		goto err_out;
>> +
>> +	/* Update qw_hi */
>> +	if (!try_cmpxchg128(&ptr->qw_hi.data, &old.qw_hi.data, new->qw_hi.data)) {
>> +		/* Restore qw_lo */
>> +		try_cmpxchg128(&ptr->qw_lo.data, &new->qw_lo.data, old.qw_lo.data);
>> +		goto err_out;
>> +	}
> 
> I don't think this is going to work like this, the interrupt remapping
> code can asynchronously change the values (it is an existing race bug)
> and if it races that will cause these to fail too, and this doesn't
> handle that case at all.
> 
> IMHO the simple solution would hold the spinlock and transfer the
> interrupt remapping fields then directly write them without a failable
> cmpxchg. Maybe you could also use a cmpxchg loop to transfer the interrupt
> remapping bits without a lock..

OK, I'll as synchronization when updating DTE.

> Can you assume cmpxchg 128 is available on all CPUs that have the
> iommu? I see google says some early AMD 64 bit CPUs don't have it? Do
> they have iommu's? I had wondered if the doc intended that some 128
> bit SSE/MMX store store would be used here??
> 
> If cmpxchg128 is safe checking that cap and refusing to probe the
> iommu driver would be appropriate. Otherwise a 64 bit flow still has
> to work?

According to the AMD BIOS and Kernel Developer's Guide (BDKG) dated back 
since family 10h Processor 
(https://www.amd.com/content/dam/amd/en/documents/archived-tech-docs/programmer-references/31116.pdf), 
which is the first introduction of AMD IOMMU, AMD processor always 
support for CPUID Fn0000_0001_ECX[CMPXCHG16B]=1.

So, it should be safe to assume that cmpxchg128 is available with all 
AMD processor w/ IOMMU.

> Finally, the order of programming the two 128s depends on what you are
> programming. The first 128 contains the valid bit so you shouldn't be
> writing the second 128 after, and vice versa. This will make
> undesirable races around Guest Paging Mode and eventually viommuen at
> least.

Ok, I'll program the higher 128-bit first.

> If you have to solve the 64 bit case too, then you really should use
> the programmer from SMMUv3. Everyone seems to need this logic and it
> should be generalized. I can help you do it.. I have a patch to make
> that logic use 128 bit stores too someplace.
> 
> Otherwise you can possibly open code the 128 bit path with some more
> care to check the valid bits and set the order, plus the interrupt
> remapping locking.

I think the cmpxchg128 should still work and would be a simpler 
solution. I am sending out v2 shortly.

Thanks,
Suravee