[PATCH v7 06/10] iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers

Posted by Suravee Suthikulpanit 3 weeks, 3 days ago
The set_dte_entry() function is used to program several DTE fields (e.g.
stage1 table, stage2 table, domain ID, etc.), which are difficult to
keep track of in the current implementation.

Therefore, separate the logic for clearing the DTE into its own helper
(i.e. make_clear_dte()), and move the setup of the GCR3 Table Root Pointer,
GIOV, GV, GLX, and GuestPagingMode fields into another helper,
set_dte_gcr3_table().
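
For reference, the new GENMASK-based fields encode the GCR3 table root
pointer like this (illustrative snippet only, mirroring what
set_dte_gcr3_table() does with FIELD_PREP()):

	dte->data[0] |= DTE_FLAG_GV |
			FIELD_PREP(DTE_GLX, gcr3_info->glx) |
			FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12);
	dte->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
			FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);

which replaces the old DTE_GCR3_VAL_x()/DTE_GCR3_SHIFT_x pairs.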

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/amd_iommu.h       |   2 +
 drivers/iommu/amd/amd_iommu_types.h |  13 +--
 drivers/iommu/amd/init.c            |  28 +++++-
 drivers/iommu/amd/iommu.c           | 129 ++++++++++++++++------------
 4 files changed, 104 insertions(+), 68 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 6386fa4556d9..35d1e40930a5 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -177,3 +177,5 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
 
 #endif
+
+struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid);
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index e11a77c0f592..561972356ff6 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -409,8 +409,7 @@
 #define DTE_FLAG_HAD	(3ULL << 7)
 #define DTE_FLAG_GIOV	BIT_ULL(54)
 #define DTE_FLAG_GV	BIT_ULL(55)
-#define DTE_GLX_SHIFT	(56)
-#define DTE_GLX_MASK	(3)
+#define DTE_GLX		GENMASK_ULL(57, 56)
 #define DTE_FLAG_IR	BIT_ULL(61)
 #define DTE_FLAG_IW	BIT_ULL(62)
 
@@ -418,13 +417,9 @@
 #define DTE_FLAG_MASK	(0x3ffULL << 32)
 #define DEV_DOMID_MASK	0xffffULL
 
-#define DTE_GCR3_VAL_A(x)	(((x) >> 12) & 0x00007ULL)
-#define DTE_GCR3_VAL_B(x)	(((x) >> 15) & 0x0ffffULL)
-#define DTE_GCR3_VAL_C(x)	(((x) >> 31) & 0x1fffffULL)
-
-#define DTE_GCR3_SHIFT_A	58
-#define DTE_GCR3_SHIFT_B	16
-#define DTE_GCR3_SHIFT_C	43
+#define DTE_GCR3_14_12	GENMASK_ULL(60, 58)
+#define DTE_GCR3_30_15	GENMASK_ULL(31, 16)
+#define DTE_GCR3_51_31	GENMASK_ULL(63, 43)
 
 #define DTE_GPT_LEVEL_SHIFT	54
 #define DTE_GPT_LEVEL_MASK	GENMASK_ULL(55, 54)
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 98b4b116d557..9f5bda23e45e 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1093,11 +1093,9 @@ static bool __copy_device_table(struct amd_iommu *iommu)
 			__set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
 			/* If gcr3 table existed, mask it out */
 			if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
-				tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
-				tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+				tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
 				pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
-				tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
-				tmp |= DTE_FLAG_GV;
+				tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
 				pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
 			}
 		}
@@ -1148,6 +1146,28 @@ static bool copy_device_table(void)
 	return true;
 }
 
+struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid)
+{
+	u16 f = 0, l = 0xFFFF;
+	struct ivhd_dte_flags *e;
+	struct dev_table_entry *dte = NULL;
+
+	for_each_ivhd_dte_flags(e) {
+		/*
+		 * Need to go through the whole list to find the smallest range,
+		 * which contains the devid. Then store it in f and l variables.
+		 */
+		if ((e->devid_first >= devid) && (e->devid_last <= devid)) {
+			if (f < e->devid_first)
+				f = e->devid_first;
+			if (e->devid_last < l)
+				l = e->devid_last;
+			dte = &(e->dte);
+		}
+	}
+	return dte;
+}
+
 static bool search_ivhd_dte_flags(u16 first, u16 last)
 {
 	struct ivhd_dte_flags *e;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index eb22ed1a219c..fd239b38809b 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1958,90 +1958,109 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
 	return ret;
 }
 
+static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr,
+			   struct dev_table_entry *new)
+{
+	/* All existing DTE must have V bit set */
+	new->data128[0] = DTE_FLAG_V;
+	new->data128[1] = 0;
+}
+
+/*
+ * Note:
+ * The old value for GCR3 table and GPT have been cleared from caller.
+ */
+static void set_dte_gcr3_table(struct amd_iommu *iommu,
+			       struct iommu_dev_data *dev_data,
+			       struct dev_table_entry *target)
+{
+	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+	u64 gcr3;
+
+	if (!gcr3_info->gcr3_tbl)
+		return;
+
+	pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
+		 __func__, dev_data->devid, gcr3_info->glx,
+		 (unsigned long long)gcr3_info->gcr3_tbl);
+
+	gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+
+	target->data[0] |= DTE_FLAG_GV |
+			   FIELD_PREP(DTE_GLX, gcr3_info->glx) |
+			   FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12);
+	if (pdom_is_v2_pgtbl_mode(dev_data->domain))
+		target->data[0] |= DTE_FLAG_GIOV;
+
+	target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
+			   FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);
+
+	/* Guest page table can only support 4 and 5 levels  */
+	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
+	else
+		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
+}
+
 static void set_dte_entry(struct amd_iommu *iommu,
 			  struct iommu_dev_data *dev_data)
 {
-	u64 pte_root = 0;
-	u64 flags = 0;
-	u32 old_domid;
-	u16 devid = dev_data->devid;
 	u16 domid;
+	u32 old_domid;
+	struct dev_table_entry *initial_dte;
+	struct dev_table_entry new = {};
 	struct protection_domain *domain = dev_data->domain;
-	struct dev_table_entry *dev_table = get_dev_table(iommu);
 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+	struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
 
 	if (gcr3_info && gcr3_info->gcr3_tbl)
 		domid = dev_data->gcr3_info.domid;
 	else
 		domid = domain->id;
 
+	make_clear_dte(dev_data, dte, &new);
+
 	if (domain->iop.mode != PAGE_MODE_NONE)
-		pte_root = iommu_virt_to_phys(domain->iop.root);
+		new.data[0] = iommu_virt_to_phys(domain->iop.root);
 
-	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
+	new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
 		    << DEV_ENTRY_MODE_SHIFT;
 
-	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
+	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
 
 	/*
-	 * When SNP is enabled, Only set TV bit when IOMMU
-	 * page translation is in use.
+	 * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
+	 * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
+	 * do_iommu_domain_alloc().
 	 */
-	if (!amd_iommu_snp_en || (domid != 0))
-		pte_root |= DTE_FLAG_TV;
-
-	flags = dev_table[devid].data[1];
-
-	if (dev_data->ats_enabled)
-		flags |= DTE_FLAG_IOTLB;
+	WARN_ON(amd_iommu_snp_en && (domid == 0));
+	new.data[0] |= DTE_FLAG_TV;
 
 	if (dev_data->ppr)
-		pte_root |= 1ULL << DEV_ENTRY_PPR;
+		new.data[0] |= 1ULL << DEV_ENTRY_PPR;
 
 	if (domain->dirty_tracking)
-		pte_root |= DTE_FLAG_HAD;
-
-	if (gcr3_info && gcr3_info->gcr3_tbl) {
-		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
-		u64 glx  = gcr3_info->glx;
-		u64 tmp;
-
-		pte_root |= DTE_FLAG_GV;
-		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+		new.data[0] |= DTE_FLAG_HAD;
 
-		/* First mask out possible old values for GCR3 table */
-		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
-		flags    &= ~tmp;
-
-		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
-		flags    &= ~tmp;
-
-		/* Encode GCR3 table into DTE */
-		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
-		pte_root |= tmp;
-
-		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
-		flags    |= tmp;
-
-		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
-		flags    |= tmp;
+	if (dev_data->ats_enabled)
+		new.data[1] |= DTE_FLAG_IOTLB;
 
-		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
-			dev_table[devid].data[2] |=
-				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
-		}
+	old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK;
+	new.data[1] |= domid;
 
-		/* GIOV is supported with V2 page table mode only */
-		if (pdom_is_v2_pgtbl_mode(domain))
-			pte_root |= DTE_FLAG_GIOV;
+	/*
+	 * Restore cached persistent DTE bits, which can be set by information
+	 * in IVRS table. See set_dev_entry_from_acpi().
+	 */
+	initial_dte = amd_iommu_get_ivhd_dte_flags(dev_data->devid);
+	if (initial_dte) {
+		new.data128[0] |= initial_dte->data128[0];
+		new.data128[1] |= initial_dte->data128[1];
 	}
 
-	flags &= ~DEV_DOMID_MASK;
-	flags |= domid;
+	set_dte_gcr3_table(iommu, dev_data, &new);
 
-	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
-	dev_table[devid].data[1]  = flags;
-	dev_table[devid].data[0]  = pte_root;
+	update_dte256(iommu, dev_data, &new);
 
 	/*
 	 * A kdump kernel might be replacing a domain ID that was copied from
-- 
2.34.1
Re: [PATCH v7 06/10] iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers
Posted by Jason Gunthorpe 3 weeks, 3 days ago
On Thu, Oct 31, 2024 at 09:16:20AM +0000, Suravee Suthikulpanit wrote:
> @@ -1148,6 +1146,28 @@ static bool copy_device_table(void)
>  	return true;
>  }
>  
> +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid)
> +{
> +	u16 f = 0, l = 0xFFFF;
> +	struct ivhd_dte_flags *e;
> +	struct dev_table_entry *dte = NULL;
> +
> +	for_each_ivhd_dte_flags(e) {

Maybe the list head should be on the iommu? I don't know how long it
would be if it is worthwhile.

> +		/*
> +		 * Need to go through the whole list to find the smallest range,
> +		 * which contains the devid. Then store it in f and l variables.
> +		 */
> +		if ((e->devid_first >= devid) && (e->devid_last <= devid)) {
> +			if (f < e->devid_first)
> +				f = e->devid_first;
> +			if (e->devid_last < l)
> +				l = e->devid_last;
> +			dte = &(e->dte);
> +		}

f and l are never used, why calculate them?

Isn't (e->devid_first >= devid) not the right way to check if devid
falls within a range?

Based on the comment it seems like you want something like this??

struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid)
{
	struct dev_table_entry *dte = NULL;
	unsigned int best_len = UINT_MAX;
	struct ivhd_dte_flags *e;

	for_each_ivhd_dte_flags(e) {
		/*
		 * Need to go through the whole list to find the smallest range,
		 * which contains the devid. Then store it in f and l variables.
		 */
		if ((e->devid_first <= devid) && (e->devid_last >= devid)) {
			unsigned int len = e->devid_last - e->devid_first;

			if (len < best_len) {
				dte = &(e->dte);
				best_len = len;
			}
		}
	}
	return dte;
}

(and it would be smart to sort the linked list by size, but again I
don't know how big it is if it is worthwhile complexity)
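
If it ever matters, insertion could be kept ordered by range size, roughly
like this (just a sketch; the list head name and the 'list' member are
assumptions based on for_each_ivhd_dte_flags()):

static void add_ivhd_dte_flags_sorted(struct ivhd_dte_flags *new)
{
	unsigned int new_len = new->devid_last - new->devid_first;
	struct ivhd_dte_flags *e;

	/*
	 * Keep the list ordered by range size (smallest first), so the
	 * first match found by amd_iommu_get_ivhd_dte_flags() is already
	 * the most specific one.
	 */
	for_each_ivhd_dte_flags(e) {
		if (new_len <= (unsigned int)(e->devid_last - e->devid_first)) {
			/* Insert before the first entry with a larger range */
			list_add_tail(&new->list, &e->list);
			return;
		}
	}
	list_add_tail(&new->list, &amd_ivhd_dev_flags_list);
}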

The DTE code looks OK

Jason
Re: [PATCH v7 06/10] iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers
Posted by Suthikulpanit, Suravee 3 weeks, 3 days ago
On 10/31/2024 7:53 PM, Jason Gunthorpe wrote:
> On Thu, Oct 31, 2024 at 09:16:20AM +0000, Suravee Suthikulpanit wrote:
>> @@ -1148,6 +1146,28 @@ static bool copy_device_table(void)
>>   	return true;
>>   }
>>   
>> +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid)
>> +{
>> +	u16 f = 0, l = 0xFFFF;
>> +	struct ivhd_dte_flags *e;
>> +	struct dev_table_entry *dte = NULL;
>> +
>> +	for_each_ivhd_dte_flags(e) {
> 
> Maybe the list head should be on the iommu? I don't know how long it
> would be if it is worthwhile.
> 
>> +		/*
>> +		 * Need to go through the whole list to find the smallest range,
>> +		 * which contains the devid. Then store it in f and l variables.
>> +		 */
>> +		if ((e->devid_first >= devid) && (e->devid_last <= devid)) {
>> +			if (f < e->devid_first)
>> +				f = e->devid_first;
>> +			if (e->devid_last < l)
>> +				l = e->devid_last;
>> +			dte = &(e->dte);
>> +		}
> 
> f and l are never used, why calculate them?
> 
> Isn't (e->devid_first >= devid) not the right way to check if devid
> falls within a range?

Actually, I missed one line. I intended to do:

+         if ((e->devid_first >= devid) && (e->devid_last <= devid) &&
+             (f <= e->devid_first) && (e->devid_last <= l)) {

The IVHD device entry can be defined for a range of devices (range 
entry) or for a selected device (dev entry). So, a particular devid 
could fall into both a range entry and a dev entry. For example:

AMD-Vi: device: 0000:00:00.2 cap: 0040 flags: 30 info 0000
AMD-Vi:   DEV_SELECT_RANGE_START       devid: 0000:00:00.3 flags: 0x0
AMD-Vi:   DEV_RANGE_END                devid: 0000:1f:1f.6
AMD-Vi:   DEV_SPECIAL(HPET[0])         devid: 0000:00:14.0, flags: 0x0
AMD-Vi:   DEV_SPECIAL(IOAPIC[128])     devid: 0000:00:14.0, flags: 0xd7
AMD-Vi:   DEV_SPECIAL(IOAPIC[242])     devid: 0000:00:00.1, flags: 0x0
AMD-Vi:   DEV_ACPI_HID(AMDI0020[ID00]) devid: 0000:00:14.5, flags: 0x40
AMD-Vi:   DEV_ACPI_HID(AMDI0020[ID01]) devid: 0000:00:14.5, flags: 0x40
AMD-Vi:   DEV_ACPI_HID(AMDI0020[ID02]) devid: 0000:00:14.5, flags: 0x40
AMD-Vi:   DEV_ACPI_HID(AMDI0020[ID03]) devid: 0000:00:14.5, flags: 0x40
AMD-Vi:   DEV_ACPI_HID(AMDI0095[ID00]) devid: 0000:00:00.4, flags: 0x30

Note that the logic only stores entries with flags != 0.

For devid 0000:00:14.0, we want to get the dev entry

     DEV_SPECIAL(IOAPIC[128])     devid: 0000:00:14.0, flags: 0xd7

> Based on the comment it seems like you want something like this??
> 
> struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid)
> {
> 	struct dev_table_entry *dte = NULL;
> 	unsigned int best_len = UINT_MAX;
> 	struct ivhd_dte_flags *e;
> 
> 	for_each_ivhd_dte_flags(e) {
> 		/*
> 		 * Need to go through the whole list to find the smallest range,
> 		 * which contains the devid. Then store it in f and l variables.
> 		 */
> 		if ((e->devid_first <= devid) && (e->devid_last >= devid)) {
> 			unsigned int len = e->devid_last - e->devid_first;
> 
> 			if (len < best_len) {
> 				dte = &(e->dte);
> 				best_len = len;
> 			}
> 		}
> 	}
> 	return dte;
> }

This logic would work also. Thanks.

> (and it would be smart to sort the linked list by size, but again I
> don't know how big it is if it is worthwile complexity)

The list should not be too long (only a few entries). So, I think it 
would be okay to just keep one list for the whole system.

However, we would need to also add logic to check PCI segment ID, since 
the device table is per PCI segment.
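
For example, starting from your version above, roughly something like this
(assuming struct ivhd_dte_flags grows a segid field; the exact name may
change in v8):

struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid)
{
	struct dev_table_entry *dte = NULL;
	unsigned int best_len = UINT_MAX;
	struct ivhd_dte_flags *e;

	for_each_ivhd_dte_flags(e) {
		/* The device table is per PCI segment, so the segment must match */
		if (e->segid != segid)
			continue;

		if ((e->devid_first <= devid) && (e->devid_last >= devid)) {
			unsigned int len = e->devid_last - e->devid_first;

			/* Prefer the most specific (smallest) matching range */
			if (len < best_len) {
				dte = &(e->dte);
				best_len = len;
			}
		}
	}
	return dte;
}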

I will update and send out V8.

Thanks,
Suravee