Also, the set_dte_entry() is used to program several DTE fields (e.g.
stage1 table, stage2 table, domain id, and etc.), which is difficult
to keep track with current implementation.
Therefore, separate logic for clearing DTE (i.e. make_clear_dte) and
another function for setting up the GCR3 Table Root Pointer, GIOV, GV,
GLX, and GuestPagingMode into another function set_dte_gcr3_table().
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/amd_iommu_types.h | 2 +
drivers/iommu/amd/iommu.c | 132 ++++++++++++++++------------
2 files changed, 80 insertions(+), 54 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 91f802be7898..74aca19725cc 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -427,6 +427,8 @@
#define DTE_GPT_LEVEL_SHIFT 54
#define DTE_GPT_LEVEL_MASK GENMASK_ULL(55, 54)
+#define DTE_SYSMGT_MASK GENMASK_ULL(41, 40)
+
#define GCR3_VALID 0x01ULL
/* DTE[128:179] | DTE[184:191] */
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index deb19af48a3e..6fa2f5bb5156 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1956,90 +1956,114 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
return ret;
}
+static void make_clear_dte(struct amd_iommu *iommu, struct dev_table_entry *dte,
+ struct dev_table_entry *new)
+{
+ new->data[0] = DTE_FLAG_V;
+
+ /* Apply erratum 63 */
+ if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01)
+ new->data[0] |= BIT_ULL(DEV_ENTRY_IW);
+
+ if (!amd_iommu_snp_en)
+ new->data[0] |= DTE_FLAG_TV;
+
+ /* Need to preserve interrupt remapping information in DTE[128:255] */
+ new->data128[1] = dte->data128[1];
+
+ /* Mask out old values for GuestPagingMode */
+ new->data[2] &= ~(0x3ULL << DTE_GPT_LEVEL_SHIFT);
+}
+
+/*
+ * Note:
+ * The old value for GCR3 table and GPT have been cleared from caller.
+ */
+static void set_dte_gcr3_table(struct amd_iommu *iommu,
+ struct iommu_dev_data *dev_data,
+ struct dev_table_entry *target)
+{
+ struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ u64 tmp, gcr3;
+
+ if (!gcr3_info->gcr3_tbl)
+ return;
+
+ pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
+ __func__, dev_data->devid, gcr3_info->glx,
+ (unsigned long long)gcr3_info->gcr3_tbl);
+
+ tmp = gcr3_info->glx;
+ target->data[0] |= (tmp & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+ if (pdom_is_v2_pgtbl_mode(dev_data->domain))
+ target->data[0] |= DTE_FLAG_GIOV;
+ target->data[0] |= DTE_FLAG_GV;
+
+
+ gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+
+ /* Encode GCR3 table into DTE */
+ tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+ target->data[0] |= tmp;
+ tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+ tmp |= DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+ target->data[1] |= tmp;
+
+ /* Guest page table can only support 4 and 5 levels */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+ target->data[2] |= ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
+}
+
static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data)
{
- u64 pte_root = 0;
- u64 flags = 0;
- u32 old_domid;
- u16 devid = dev_data->devid;
u16 domid;
+ u32 old_domid;
+ struct dev_table_entry new = {};
struct protection_domain *domain = dev_data->domain;
- struct dev_table_entry *dev_table = get_dev_table(iommu);
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
else
domid = domain->id;
+ make_clear_dte(iommu, dte, &new);
+
if (domain->iop.mode != PAGE_MODE_NONE)
- pte_root = iommu_virt_to_phys(domain->iop.root);
+ new.data[0] = iommu_virt_to_phys(domain->iop.root);
- pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
+ new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
- pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
+ new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
/*
* When SNP is enabled, Only set TV bit when IOMMU
* page translation is in use.
*/
if (!amd_iommu_snp_en || (domid != 0))
- pte_root |= DTE_FLAG_TV;
-
- flags = dev_table[devid].data[1];
-
- if (dev_data->ats_enabled)
- flags |= DTE_FLAG_IOTLB;
+ new.data[0] |= DTE_FLAG_TV;
if (dev_data->ppr)
- pte_root |= 1ULL << DEV_ENTRY_PPR;
+ new.data[0] |= 1ULL << DEV_ENTRY_PPR;
if (domain->dirty_tracking)
- pte_root |= DTE_FLAG_HAD;
-
- if (gcr3_info && gcr3_info->gcr3_tbl) {
- u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
- u64 glx = gcr3_info->glx;
- u64 tmp;
+ new.data[0] |= DTE_FLAG_HAD;
- pte_root |= DTE_FLAG_GV;
- pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
-
- /* First mask out possible old values for GCR3 table */
- tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
- flags &= ~tmp;
-
- tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
- flags &= ~tmp;
-
- /* Encode GCR3 table into DTE */
- tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
- pte_root |= tmp;
-
- tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
- flags |= tmp;
-
- tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
- flags |= tmp;
-
- if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
- dev_table[devid].data[2] |=
- ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
- }
+ if (dev_data->ats_enabled)
+ new.data[1] |= DTE_FLAG_IOTLB;
+ else
+ new.data[1] &= ~DTE_FLAG_IOTLB;
- /* GIOV is supported with V2 page table mode only */
- if (pdom_is_v2_pgtbl_mode(domain))
- pte_root |= DTE_FLAG_GIOV;
- }
+ old_domid = dte->data[1] & DEV_DOMID_MASK;
+ new.data[1] &= ~DEV_DOMID_MASK;
+ new.data[1] |= domid;
- flags &= ~DEV_DOMID_MASK;
- flags |= domid;
+ set_dte_gcr3_table(iommu, dev_data, &new);
- old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
- dev_table[devid].data[1] = flags;
- dev_table[devid].data[0] = pte_root;
+ update_dte256(iommu, dev_data, &new);
/*
* A kdump kernel might be replacing a domain ID that was copied from
--
2.34.1
On Mon, Oct 07, 2024 at 04:13:50AM +0000, Suravee Suthikulpanit wrote: > diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c > index deb19af48a3e..6fa2f5bb5156 100644 > --- a/drivers/iommu/amd/iommu.c > +++ b/drivers/iommu/amd/iommu.c > @@ -1956,90 +1956,114 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) > return ret; > } > > +static void make_clear_dte(struct amd_iommu *iommu, struct dev_table_entry *dte, > + struct dev_table_entry *new) > +{ > + new->data[0] = DTE_FLAG_V; > + > + /* Apply erratum 63 */ > + if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01) > + new->data[0] |= BIT_ULL(DEV_ENTRY_IW); Doesn't this need to be /* Preserve set_dev_entry_from_acpi(), including erratum 64 */ new->data[1] |= dte->data[1] & DTE_SYSMGT_MASK; if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01) new->data[0] |= BIT_ULL(DEV_ENTRY_IW); And this has a significant security issue, we can't set IW here because clear_dte must generate a blocked DTE, so TV=1,Mode=0,IW=1 is not an OK setting!! Jason
On 10/7/2024 9:17 PM, Jason Gunthorpe wrote: > On Mon, Oct 07, 2024 at 04:13:50AM +0000, Suravee Suthikulpanit wrote: >> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c >> index deb19af48a3e..6fa2f5bb5156 100644 >> --- a/drivers/iommu/amd/iommu.c >> +++ b/drivers/iommu/amd/iommu.c >> @@ -1956,90 +1956,114 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) >> return ret; >> } >> >> +static void make_clear_dte(struct amd_iommu *iommu, struct dev_table_entry *dte, >> + struct dev_table_entry *new) >> +{ >> + new->data[0] = DTE_FLAG_V; >> + >> + /* Apply erratum 63 */ >> + if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01) >> + new->data[0] |= BIT_ULL(DEV_ENTRY_IW); > > Doesn't this need to be > > /* Preserve set_dev_entry_from_acpi(), including erratum 64 */ > new->data[1] |= dte->data[1] & DTE_SYSMGT_MASK; > if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01) > new->data[0] |= BIT_ULL(DEV_ENTRY_IW); > > And this has a significant security issue, we can't set IW here > because clear_dte must generate a blocked DTE, so TV=1,Mode=0,IW=1 is > not an OK setting!! I am reworking this part. I am going to introduce a variable in struct dev_data to store persistent value (during driver initialization) so that we can simply recreate the DTE from scratch w/o having to depend on existing configuration in the table. And it should address your concern regarding the IW bit during blocked DTE. Suravee
On Mon, Oct 07, 2024 at 04:13:50AM +0000, Suravee Suthikulpanit wrote: > +static void make_clear_dte(struct amd_iommu *iommu, struct dev_table_entry *dte, > + struct dev_table_entry *new) > +{ > + new->data[0] = DTE_FLAG_V; > + > + /* Apply erratum 63 */ > + if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01) > + new->data[0] |= BIT_ULL(DEV_ENTRY_IW); > + > + if (!amd_iommu_snp_en) > + new->data[0] |= DTE_FLAG_TV; It would be nice to have a comment here.. clear_dte() must create a blocking configuration as several callers depend on that. Why is blocking with TV=1,Mode=0,IW=0,IR=0 used sometimes but sometimes TV=0 is used instead? > + /* Need to preserve interrupt remapping information in DTE[128:255] */ > + new->data128[1] = dte->data128[1]; It doesn't need to preserve.. write_dte_upper128() does the preservation automatically under the right lock. Any bits in DTE_DATA2_INTR_MASK should be 0 for the input DTE because they will be ignored by the masking: + new->data[2] &= ~DTE_DATA2_INTR_MASK; + new->data[2] |= old.data[2] & (DTE_DATA2_INTR_MASK | DTE_DATA2_RESV_MASK); Also this shouldn't preserve the top Guest related 64 bit for a 'clear dte' either. So, I think this can just be new->data128[1] = 0; ? Jason
On 10/7/2024 9:05 PM, Jason Gunthorpe wrote: > On Mon, Oct 07, 2024 at 04:13:50AM +0000, Suravee Suthikulpanit wrote: >> +static void make_clear_dte(struct amd_iommu *iommu, struct dev_table_entry *dte, >> + struct dev_table_entry *new) >> +{ >> + new->data[0] = DTE_FLAG_V; >> + >> + /* Apply erratum 63 */ >> + if (FIELD_GET(DTE_SYSMGT_MASK, dte->data[1]) == 0x01) >> + new->data[0] |= BIT_ULL(DEV_ENTRY_IW); >> + >> + if (!amd_iommu_snp_en) >> + new->data[0] |= DTE_FLAG_TV; > > It would be nice to have a comment here.. I am moving this check. See description below... > clear_dte() must create a blocking configuration as several callers > depend on that. Right, I missed that. I'll rework this function. > Why is blocking with TV=1,Mode=0,IW=0,IR=0 used sometimes but > sometimes TV=0 is used instead? Originally, when DTE[Mode]=0, the TV bit is set. Then, the commit b9f0043e1ea6 "iommu/amd: Set translation valid bit only when IO page tables are in use" clears the TV ONLY when running on SNP-enabled system. We didn't clear the bit for all cases since there was a concern whether it would cause regression on older platforms. However, I am considering clearing the TV flag for all cases to simplify the logic since it should not violate the spec. >> + /* Need to preserve interrupt remapping information in DTE[128:255] */ >> + new->data128[1] = dte->data128[1]; > > It doesn't need to preserve.. write_dte_upper128() does the > preservation automatically under the right lock. Any bits in > DTE_DATA2_INTR_MASK should be 0 for the input DTE because they will be > ignored by the masking: > > + new->data[2] &= ~DTE_DATA2_INTR_MASK; > + new->data[2] |= old.data[2] & (DTE_DATA2_INTR_MASK | DTE_DATA2_RESV_MASK); > > Also this shouldn't preserve the top Guest related 64 bit for a 'clear > dte' either. > > So, I think this can just be > > new->data128[1] = 0; > > ? Good point. I'll clean this up. Thanks, Suravee
© 2016 - 2024 Red Hat, Inc.