Also, the set_dte_entry() is used to program several DTE fields (e.g.
stage1 table, stage2 table, domain id, and etc.), which is difficult
to keep track with current implementation.
Therefore, separate logic for clearing DTE (i.e. make_clear_dte) and
another function for setting up the GCR3 Table Root Pointer, GIOV, GV,
GLX, and GuestPagingMode into another function set_dte_gcr3_table().
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/amd_iommu.h | 2 +
drivers/iommu/amd/amd_iommu_types.h | 13 +--
drivers/iommu/amd/init.c | 28 +++++-
drivers/iommu/amd/iommu.c | 129 ++++++++++++++++------------
4 files changed, 104 insertions(+), 68 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 6386fa4556d9..35d1e40930a5 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -177,3 +177,5 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
#endif
+
+struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid);
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index e11a77c0f592..561972356ff6 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -409,8 +409,7 @@
#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
-#define DTE_GLX_SHIFT (56)
-#define DTE_GLX_MASK (3)
+#define DTE_GLX GENMASK_ULL(57, 56)
#define DTE_FLAG_IR BIT_ULL(61)
#define DTE_FLAG_IW BIT_ULL(62)
@@ -418,13 +417,9 @@
#define DTE_FLAG_MASK (0x3ffULL << 32)
#define DEV_DOMID_MASK 0xffffULL
-#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL)
-#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL)
-#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL)
-
-#define DTE_GCR3_SHIFT_A 58
-#define DTE_GCR3_SHIFT_B 16
-#define DTE_GCR3_SHIFT_C 43
+#define DTE_GCR3_14_12 GENMASK_ULL(60, 58)
+#define DTE_GCR3_30_15 GENMASK_ULL(31, 16)
+#define DTE_GCR3_51_31 GENMASK_ULL(63, 43)
#define DTE_GPT_LEVEL_SHIFT 54
#define DTE_GPT_LEVEL_MASK GENMASK_ULL(55, 54)
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 98b4b116d557..9f5bda23e45e 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1093,11 +1093,9 @@ static bool __copy_device_table(struct amd_iommu *iommu)
__set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
/* If gcr3 table existed, mask it out */
if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
- tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
- tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+ tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
- tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
- tmp |= DTE_FLAG_GV;
+ tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
}
}
@@ -1148,6 +1146,28 @@ static bool copy_device_table(void)
return true;
}
+struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid)
+{
+ u16 f = 0, l = 0xFFFF;
+ struct ivhd_dte_flags *e;
+ struct dev_table_entry *dte = NULL;
+
+ for_each_ivhd_dte_flags(e) {
+ /*
+ * Need to go through the whole list to find the smallest range,
+ * which contains the devid. Then store it in f and l variables.
+ */
+ if ((e->devid_first >= devid) && (e->devid_last <= devid)) {
+ if (f < e->devid_first)
+ f = e->devid_first;
+ if (e->devid_last < l)
+ l = e->devid_last;
+ dte = &(e->dte);
+ }
+ }
+ return dte;
+}
+
static bool search_ivhd_dte_flags(u16 first, u16 last)
{
struct ivhd_dte_flags *e;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index eb22ed1a219c..fd239b38809b 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1958,90 +1958,109 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
return ret;
}
+static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr,
+ struct dev_table_entry *new)
+{
+ /* All existing DTE must have V bit set */
+ new->data128[0] = DTE_FLAG_V;
+ new->data128[1] = 0;
+}
+
+/*
+ * Note:
+ * The old value for GCR3 table and GPT have been cleared from caller.
+ */
+static void set_dte_gcr3_table(struct amd_iommu *iommu,
+ struct iommu_dev_data *dev_data,
+ struct dev_table_entry *target)
+{
+ struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ u64 gcr3;
+
+ if (!gcr3_info->gcr3_tbl)
+ return;
+
+ pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
+ __func__, dev_data->devid, gcr3_info->glx,
+ (unsigned long long)gcr3_info->gcr3_tbl);
+
+ gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+
+ target->data[0] |= DTE_FLAG_GV |
+ FIELD_PREP(DTE_GLX, gcr3_info->glx) |
+ FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12);
+ if (pdom_is_v2_pgtbl_mode(dev_data->domain))
+ target->data[0] |= DTE_FLAG_GIOV;
+
+ target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
+ FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);
+
+ /* Guest page table can only support 4 and 5 levels */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+ target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
+ else
+ target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
+}
+
static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data)
{
- u64 pte_root = 0;
- u64 flags = 0;
- u32 old_domid;
- u16 devid = dev_data->devid;
u16 domid;
+ u32 old_domid;
+ struct dev_table_entry *initial_dte;
+ struct dev_table_entry new = {};
struct protection_domain *domain = dev_data->domain;
- struct dev_table_entry *dev_table = get_dev_table(iommu);
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
else
domid = domain->id;
+ make_clear_dte(dev_data, dte, &new);
+
if (domain->iop.mode != PAGE_MODE_NONE)
- pte_root = iommu_virt_to_phys(domain->iop.root);
+ new.data[0] = iommu_virt_to_phys(domain->iop.root);
- pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
+ new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
- pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
+ new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
/*
- * When SNP is enabled, Only set TV bit when IOMMU
- * page translation is in use.
+ * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
+ * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
+ * do_iommu_domain_alloc().
*/
- if (!amd_iommu_snp_en || (domid != 0))
- pte_root |= DTE_FLAG_TV;
-
- flags = dev_table[devid].data[1];
-
- if (dev_data->ats_enabled)
- flags |= DTE_FLAG_IOTLB;
+ WARN_ON(amd_iommu_snp_en && (domid == 0));
+ new.data[0] |= DTE_FLAG_TV;
if (dev_data->ppr)
- pte_root |= 1ULL << DEV_ENTRY_PPR;
+ new.data[0] |= 1ULL << DEV_ENTRY_PPR;
if (domain->dirty_tracking)
- pte_root |= DTE_FLAG_HAD;
-
- if (gcr3_info && gcr3_info->gcr3_tbl) {
- u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
- u64 glx = gcr3_info->glx;
- u64 tmp;
-
- pte_root |= DTE_FLAG_GV;
- pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+ new.data[0] |= DTE_FLAG_HAD;
- /* First mask out possible old values for GCR3 table */
- tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
- flags &= ~tmp;
-
- tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
- flags &= ~tmp;
-
- /* Encode GCR3 table into DTE */
- tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
- pte_root |= tmp;
-
- tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
- flags |= tmp;
-
- tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
- flags |= tmp;
+ if (dev_data->ats_enabled)
+ new.data[1] |= DTE_FLAG_IOTLB;
- if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
- dev_table[devid].data[2] |=
- ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
- }
+ old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK;
+ new.data[1] |= domid;
- /* GIOV is supported with V2 page table mode only */
- if (pdom_is_v2_pgtbl_mode(domain))
- pte_root |= DTE_FLAG_GIOV;
+ /*
+ * Restore cached persistent DTE bits, which can be set by information
+ * in IVRS table. See set_dev_entry_from_acpi().
+ */
+ initial_dte = amd_iommu_get_ivhd_dte_flags(dev_data->devid);
+ if (initial_dte) {
+ new.data128[0] |= initial_dte->data128[0];
+ new.data128[1] |= initial_dte->data128[1];
}
- flags &= ~DEV_DOMID_MASK;
- flags |= domid;
+ set_dte_gcr3_table(iommu, dev_data, &new);
- old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
- dev_table[devid].data[1] = flags;
- dev_table[devid].data[0] = pte_root;
+ update_dte256(iommu, dev_data, &new);
/*
* A kdump kernel might be replacing a domain ID that was copied from
--
2.34.1
On Thu, Oct 31, 2024 at 09:16:20AM +0000, Suravee Suthikulpanit wrote: > @@ -1148,6 +1146,28 @@ static bool copy_device_table(void) > return true; > } > > +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid) > +{ > + u16 f = 0, l = 0xFFFF; > + struct ivhd_dte_flags *e; > + struct dev_table_entry *dte = NULL; > + > + for_each_ivhd_dte_flags(e) { Maybe the list head should be on the iommu? I don't know how long it would be if it is worthwhile. > + /* > + * Need to go through the whole list to find the smallest range, > + * which contains the devid. Then store it in f and l variables. > + */ > + if ((e->devid_first >= devid) && (e->devid_last <= devid)) { > + if (f < e->devid_first) > + f = e->devid_first; > + if (e->devid_last < l) > + l = e->devid_last; > + dte = &(e->dte); > + } f and l are never used, why calculate them? Isn't (e->devid_first >= devid) not the right way to check if devid falls within a range? Based on the comment it seems like you want something like this?? struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid) { struct dev_table_entry *dte = NULL; unsigned int best_len = UINT_MAX; struct ivhd_dte_flags *e; for_each_ivhd_dte_flags(e) { /* * Need to go through the whole list to find the smallest range, * which contains the devid. Then store it in f and l variables. */ if ((e->devid_first <= devid) && (e->devid_last >= devid)) { unsigned int len = e->devid_last - e->devid_first; if (len < best_len) { dte = &(e->dte); best_len = len; } } } return dte; } (and it would be smart to sort the linked list by size, but again I don't know how big it is if it is worthwile complexity) The DTE code looks OK Jason
On 10/31/2024 7:53 PM, Jason Gunthorpe wrote: > On Thu, Oct 31, 2024 at 09:16:20AM +0000, Suravee Suthikulpanit wrote: >> @@ -1148,6 +1146,28 @@ static bool copy_device_table(void) >> return true; >> } >> >> +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid) >> +{ >> + u16 f = 0, l = 0xFFFF; >> + struct ivhd_dte_flags *e; >> + struct dev_table_entry *dte = NULL; >> + >> + for_each_ivhd_dte_flags(e) { > > Maybe the list head should be on the iommu? I don't know how long it > would be if it is worthwhile. > >> + /* >> + * Need to go through the whole list to find the smallest range, >> + * which contains the devid. Then store it in f and l variables. >> + */ >> + if ((e->devid_first >= devid) && (e->devid_last <= devid)) { >> + if (f < e->devid_first) >> + f = e->devid_first; >> + if (e->devid_last < l) >> + l = e->devid_last; >> + dte = &(e->dte); >> + } > > f and l are never used, why calculate them? > > Isn't (e->devid_first >= devid) not the right way to check if devid > falls within a range? Actually, I missed one line. I intended to do: + if ((e->devid_first >= devid) && (e->devid_last <= devid) && + (f <= e->devid_first) && (e->devid_last <= l)) { The IVHD device entry can be defined for a range of devices (range entry) or for a selected device (dev entry). So, a particular devid could fall into both a range entry and a dev entry. For example: AMD-Vi: device: 0000:00:00.2 cap: 0040 flags: 30 info 0000 AMD-Vi: DEV_SELECT_RANGE_START devid: 0000:00:00.3 flags: 0x0 AMD-Vi: DEV_RANGE_END devid: 0000:1f:1f.6 AMD-Vi: DEV_SPECIAL(HPET[0]) devid: 0000:00:14.0, flags: 0x0 AMD-Vi: DEV_SPECIAL(IOAPIC[128]) devid: 0000:00:14.0, flags: 0xd7 AMD-Vi: DEV_SPECIAL(IOAPIC[242]) devid: 0000:00:00.1, flags: 0x0 AMD-Vi: DEV_ACPI_HID(AMDI0020[ID00]) devid: 0000:00:14.5, flags: 0x40 AMD-Vi: DEV_ACPI_HID(AMDI0020[ID01]) devid: 0000:00:14.5, flags: 0x40 AMD-Vi: DEV_ACPI_HID(AMDI0020[ID02]) devid: 0000:00:14.5, flags: 0x40 AMD-Vi: DEV_ACPI_HID(AMDI0020[ID03]) devid: 0000:00:14.5, flags: 0x40 AMD-Vi: DEV_ACPI_HID(AMDI0095[ID00]) devid: 0000:00:00.4, flags: 0x30 Note that the logic only store the entry w/ flags != 0. For devid 0000:00:14.0, we want to get the dev entry DEV_SPECIAL(IOAPIC[128]) devid: 0000:00:14.0, flags: 0xd7 > Based on the comment it seems like you want something like this?? > > struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 devid) > { > struct dev_table_entry *dte = NULL; > unsigned int best_len = UINT_MAX; > struct ivhd_dte_flags *e; > > for_each_ivhd_dte_flags(e) { > /* > * Need to go through the whole list to find the smallest range, > * which contains the devid. Then store it in f and l variables. > */ > if ((e->devid_first <= devid) && (e->devid_last >= devid)) { > unsigned int len = e->devid_last - e->devid_first; > > if (len < best_len) { > dte = &(e->dte); > best_len = len; > } > } > } > return dte; > } This logic would work also. Thanks. > (and it would be smart to sort the linked list by size, but again I > don't know how big it is if it is worthwile complexity) The list should not be too long (only a few entries). So, I think it would be okay to just keep one list for the whole system. However, we would need to also add logic to check PCI segment ID, since the device table is per PCI segment. I will update and send out V8. Thanks, Suravee
© 2016 - 2024 Red Hat, Inc.