[PATCH v3 06/17] intel_iommu: Implement stage-1 translation

Zhenzhong Duan posted 17 patches 2 months ago
There is a newer version of this series
[PATCH v3 06/17] intel_iommu: Implement stage-1 translation
Posted by Zhenzhong Duan 2 months ago
From: Yi Liu <yi.l.liu@intel.com>

This adds stage-1 page table walking to support stage-1 only
translation in scalable modern mode.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Co-developed-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
Signed-off-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_internal.h |  23 ++++++
 hw/i386/intel_iommu.c          | 146 ++++++++++++++++++++++++++++++++-
 2 files changed, 165 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 1fa4add9e2..51e9b1fc43 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -433,6 +433,21 @@ typedef union VTDInvDesc VTDInvDesc;
         (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
         (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+/* Reserved-bit masks for first-level paging entries (fpte), per level */
+#define VTD_FS_UPPER_IGNORED 0xfff0000000000000ULL
+#define VTD_FPTE_PAGE_L1_RSVD_MASK(aw) \
+        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L2_RSVD_MASK(aw) \
+        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L3_RSVD_MASK(aw) \
+        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(aw) \
+        (0x3fffe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(aw) \
+        (0x1fe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L4_RSVD_MASK(aw) \
+        (0x80ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+
 /* Masks for PIOTLB Invalidate Descriptor */
 #define VTD_INV_DESC_PIOTLB_G             (3ULL << 4)
 #define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
@@ -525,6 +540,14 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width */
 #define VTD_SM_PASID_ENTRY_DID(val)    ((val) & VTD_DOMAIN_ID_MASK)
 
+#define VTD_SM_PASID_ENTRY_FLPM          3ULL
+#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
+
+/* First Level Paging Structure */
+/* Masks for First Level Paging Entry */
+#define VTD_FL_P                    1ULL
+#define VTD_FL_RW_MASK              (1ULL << 1)
+
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a22bd43b98..6e31a8d383 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -48,6 +48,8 @@
 
 /* pe operations */
 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
+#define VTD_PE_GET_FL_LEVEL(pe) \
+    (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
 #define VTD_PE_GET_SL_LEVEL(pe) \
     (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
 
@@ -755,6 +757,11 @@ static inline bool vtd_is_sl_level_supported(IntelIOMMUState *s, uint32_t level)
            (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
 }
 
+static inline bool vtd_is_fl_level_supported(IntelIOMMUState *s, uint32_t level)
+{
+    return level == VTD_PML4_LEVEL; /* PML4 is the only level accepted */
+}
+
 /* Return true if check passed, otherwise false */
 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
                                      VTDPASIDEntry *pe)
@@ -838,6 +845,11 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
             return -VTD_FR_PASID_TABLE_ENTRY_INV;
     }
 
+    if (pgtt == VTD_SM_PASID_ENTRY_FLT &&
+        !vtd_is_fl_level_supported(s, VTD_PE_GET_FL_LEVEL(pe))) {
+            return -VTD_FR_PASID_TABLE_ENTRY_INV;
+    }
+
     return 0;
 }
 
@@ -973,7 +985,11 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
 
     if (s->root_scalable) {
         vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
-        return VTD_PE_GET_SL_LEVEL(&pe);
+        if (s->scalable_modern) {
+            return VTD_PE_GET_FL_LEVEL(&pe);
+        } else {
+            return VTD_PE_GET_SL_LEVEL(&pe);
+        }
     }
 
     return vtd_ce_get_level(ce);
@@ -1060,7 +1076,11 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
 
     if (s->root_scalable) {
         vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
-        return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+        if (s->scalable_modern) {
+            return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+        } else {
+            return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+        }
     }
 
     return vtd_ce_get_slpt_base(ce);
@@ -1862,6 +1882,104 @@ out:
     trace_vtd_pt_enable_fast_path(source_id, success);
 }
 
+/*
+ * Rsvd field masks for fpte:
+ *     vtd_fpte_rsvd 4k pages
+ *     vtd_fpte_rsvd_large large pages
+ *
+ * We support only 4-level page tables.
+ */
+#define VTD_FPTE_RSVD_LEN 5
+static uint64_t vtd_fpte_rsvd[VTD_FPTE_RSVD_LEN];
+static uint64_t vtd_fpte_rsvd_large[VTD_FPTE_RSVD_LEN];
+
+/*
+ * Return true if @flpte has any reserved bit set for an entry at @level.
+ */
+static bool vtd_flpte_nonzero_rsvd(uint64_t flpte, uint32_t level)
+{
+    uint64_t rsvd_mask;
+
+    /*
+     * A guest-mis-programmed level is caught earlier by
+     * vtd_is_fl_level_supported(), so @level indexes the fpte tables.
+     */
+    assert(level < VTD_FPTE_RSVD_LEN);
+    /* Level 0 doesn't exist; the smallest level is VTD_PT_LEVEL=1. */
+    assert(level);
+
+    if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) &&
+        (flpte & VTD_PT_PAGE_SIZE_MASK)) {
+        /* large page */
+        rsvd_mask = vtd_fpte_rsvd_large[level];
+    } else {
+        rsvd_mask = vtd_fpte_rsvd[level];
+    }
+
+    return flpte & rsvd_mask;
+}
+
+static inline bool vtd_flpte_present(uint64_t flpte)
+{
+    return !!(flpte & VTD_FL_P); /* VTD_FL_P is bit 0 of the entry */
+}
+
+/*
+ * Walk the first-level table for @iova. Return 0 with the leaf entry in
+ * @flptep and its level in @flpte_level (page size), or a -VTD_FR_* fault.
+ */
+static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
+                             uint64_t iova, bool is_write,
+                             uint64_t *flptep, uint32_t *flpte_level,
+                             bool *reads, bool *writes, uint8_t aw_bits,
+                             uint32_t pasid)
+{
+    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
+    uint32_t level = vtd_get_iova_level(s, ce, pasid);
+    uint32_t offset;
+    uint64_t flpte;
+
+    while (true) {
+        offset = vtd_iova_level_offset(iova, level);
+        flpte = vtd_get_pte(addr, offset);
+
+        if (flpte == (uint64_t)-1) {
+            if (level == vtd_get_iova_level(s, ce, pasid)) {
+                /* Invalid programming of context-entry */
+                return -VTD_FR_CONTEXT_ENTRY_INV;
+            } else {
+                return -VTD_FR_PAGING_ENTRY_INV;
+            }
+        }
+        if (!vtd_flpte_present(flpte)) {
+            *reads = false;
+            *writes = false;
+            return -VTD_FR_PAGING_ENTRY_INV;
+        }
+        *reads = true;
+        *writes = (*writes) && (flpte & VTD_FL_RW_MASK);
+        if (is_write && !(flpte & VTD_FL_RW_MASK)) {
+            return -VTD_FR_WRITE;
+        }
+        if (vtd_flpte_nonzero_rsvd(flpte, level)) {
+            error_report_once("%s: detected flpte reserved non-zero "
+                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32
+                              ", flpte=0x%" PRIx64 ", pasid=0x%" PRIx32 ")",
+                              __func__, iova, level, flpte, pasid);
+            return -VTD_FR_PAGING_ENTRY_RSVD;
+        }
+
+        if (vtd_is_last_pte(flpte, level)) {
+            *flptep = flpte;
+            *flpte_level = level;
+            return 0;
+        }
+
+        addr = vtd_get_pte_addr(flpte, aw_bits);
+        level--;
+    }
+}
+
 static void vtd_report_fault(IntelIOMMUState *s,
                              int err, bool is_fpd_set,
                              uint16_t source_id,
@@ -2010,8 +2128,13 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
         }
     }
 
-    ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
-                               &reads, &writes, s->aw_bits, pasid);
+    if (s->scalable_modern && s->root_scalable) {
+        ret_fr = vtd_iova_to_flpte(s, &ce, addr, is_write, &pte, &level,
+                                   &reads, &writes, s->aw_bits, pasid);
+    } else {
+        ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
+                                   &reads, &writes, s->aw_bits, pasid);
+    }
     if (ret_fr) {
         vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
                          addr, is_write, pasid != PCI_NO_PASID, pasid);
@@ -4239,6 +4362,21 @@ static void vtd_init(IntelIOMMUState *s)
     vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
                                                     x86_iommu->dt_supported);
 
+    /*
+     * Rsvd field masks for fpte
+     */
+    vtd_fpte_rsvd[0] = ~0ULL;
+    vtd_fpte_rsvd[1] = VTD_FPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
+    vtd_fpte_rsvd[2] = VTD_FPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+    vtd_fpte_rsvd[3] = VTD_FPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+    vtd_fpte_rsvd[4] = VTD_FPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+    vtd_fpte_rsvd_large[0] = ~0ULL;
+    vtd_fpte_rsvd_large[1] = ~0ULL;
+    vtd_fpte_rsvd_large[2] = VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(s->aw_bits);
+    vtd_fpte_rsvd_large[3] = VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(s->aw_bits);
+    vtd_fpte_rsvd_large[4] = ~0ULL;
+
     if (s->scalable_mode || s->snoop_control) {
         vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
         vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-- 
2.34.1


Re: [PATCH v3 06/17] intel_iommu: Implement stage-1 translation
Posted by Yi Liu 1 month, 2 weeks ago
On 2024/9/11 13:22, Zhenzhong Duan wrote:
> From: Yi Liu <yi.l.liu@intel.com>
> 
> This adds stage-1 page table walking to support stage-1 only
> transltion in scalable modern mode.

a typo. s/transltion/translation/
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Co-developed-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
> Signed-off-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>   hw/i386/intel_iommu_internal.h |  23 ++++++
>   hw/i386/intel_iommu.c          | 146 ++++++++++++++++++++++++++++++++-
>   2 files changed, 165 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 1fa4add9e2..51e9b1fc43 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -433,6 +433,21 @@ typedef union VTDInvDesc VTDInvDesc;
>           (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
>           (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
>   
> +/* Rsvd field masks for fpte */
> +#define VTD_FS_UPPER_IGNORED 0xfff0000000000000ULL
> +#define VTD_FPTE_PAGE_L1_RSVD_MASK(aw) \
> +        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
> +#define VTD_FPTE_PAGE_L2_RSVD_MASK(aw) \
> +        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
> +#define VTD_FPTE_PAGE_L3_RSVD_MASK(aw) \
> +        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
> +#define VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(aw) \
> +        (0x3fffe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
> +#define VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(aw) \
> +        (0x1fe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))

May we follow the same naming for the large page? e.g. LPAGE_L2, LPAGE_L3.
Also follow the order of the SL definitions as well.

> +#define VTD_FPTE_PAGE_L4_RSVD_MASK(aw) \
> +        (0x80ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
> +
>   /* Masks for PIOTLB Invalidate Descriptor */
>   #define VTD_INV_DESC_PIOTLB_G             (3ULL << 4)
>   #define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
> @@ -525,6 +540,14 @@ typedef struct VTDRootEntry VTDRootEntry;
>   #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width */
>   #define VTD_SM_PASID_ENTRY_DID(val)    ((val) & VTD_DOMAIN_ID_MASK)
>   
> +#define VTD_SM_PASID_ENTRY_FLPM          3ULL
> +#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
> +
> +/* First Level Paging Structure */
> +/* Masks for First Level Paging Entry */
> +#define VTD_FL_P                    1ULL
> +#define VTD_FL_RW_MASK              (1ULL << 1)
> +
>   /* Second Level Page Translation Pointer*/
>   #define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
>   
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index a22bd43b98..6e31a8d383 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -48,6 +48,8 @@
>   
>   /* pe operations */
>   #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
> +#define VTD_PE_GET_FL_LEVEL(pe) \
> +    (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
>   #define VTD_PE_GET_SL_LEVEL(pe) \
>       (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
>   
> @@ -755,6 +757,11 @@ static inline bool vtd_is_sl_level_supported(IntelIOMMUState *s, uint32_t level)
>              (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
>   }
>   
> +static inline bool vtd_is_fl_level_supported(IntelIOMMUState *s, uint32_t level)
> +{
> +    return level == VTD_PML4_LEVEL;
> +}
> +
>   /* Return true if check passed, otherwise false */
>   static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
>                                        VTDPASIDEntry *pe)
> @@ -838,6 +845,11 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
>               return -VTD_FR_PASID_TABLE_ENTRY_INV;
>       }
>   
> +    if (pgtt == VTD_SM_PASID_ENTRY_FLT &&
> +        !vtd_is_fl_level_supported(s, VTD_PE_GET_FL_LEVEL(pe))) {
> +            return -VTD_FR_PASID_TABLE_ENTRY_INV;
> +    }
> +
>       return 0;
>   }
>   
> @@ -973,7 +985,11 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
>   
>       if (s->root_scalable) {
>           vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
> -        return VTD_PE_GET_SL_LEVEL(&pe);
> +        if (s->scalable_modern) {
> +            return VTD_PE_GET_FL_LEVEL(&pe);
> +        } else {
> +            return VTD_PE_GET_SL_LEVEL(&pe);
> +        }
>       }
>   
>       return vtd_ce_get_level(ce);
> @@ -1060,7 +1076,11 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
>   
>       if (s->root_scalable) {
>           vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
> -        return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
> +        if (s->scalable_modern) {
> +            return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
> +        } else {
> +            return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
> +        }
>       }
>   
>       return vtd_ce_get_slpt_base(ce);
> @@ -1862,6 +1882,104 @@ out:
>       trace_vtd_pt_enable_fast_path(source_id, success);
>   }
>   
> +/*
> + * Rsvd field masks for fpte:
> + *     vtd_fpte_rsvd 4k pages
> + *     vtd_fpte_rsvd_large large pages
> + *
> + * We support only 4-level page tables.
> + */
> +#define VTD_FPTE_RSVD_LEN 5
> +static uint64_t vtd_fpte_rsvd[VTD_FPTE_RSVD_LEN];
> +static uint64_t vtd_fpte_rsvd_large[VTD_FPTE_RSVD_LEN];
> +
> +static bool vtd_flpte_nonzero_rsvd(uint64_t flpte, uint32_t level)
> +{
> +    uint64_t rsvd_mask;
> +
> +    /*
> +     * We should have caught a guest-mis-programmed level earlier,
> +     * via vtd_is_fl_level_supported.
> +     */
> +    assert(level < VTD_SPTE_RSVD_LEN);

s/VTD_SPTE_RSVD_LEN/VTD_FPTE_RSVD_LEN/

> +    /*
> +     * Zero level doesn't exist. The smallest level is VTD_PT_LEVEL=1 and
> +     * checked by vtd_is_last_pte().
> +     */
> +    assert(level);
> +
> +    if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) &&
> +        (flpte & VTD_PT_PAGE_SIZE_MASK)) {
> +        /* large page */
> +        rsvd_mask = vtd_fpte_rsvd_large[level];
> +    } else {
> +        rsvd_mask = vtd_fpte_rsvd[level];
> +    }
> +
> +    return flpte & rsvd_mask;
> +}
> +
> +static inline bool vtd_flpte_present(uint64_t flpte)
> +{
> +    return !!(flpte & VTD_FL_P);
> +}
> +
> +/*
> + * Given the @iova, get relevant @flptep. @flpte_level will be the last level
> + * of the translation, can be used for deciding the size of large page.
> + */
> +static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
> +                             uint64_t iova, bool is_write,
> +                             uint64_t *flptep, uint32_t *flpte_level,
> +                             bool *reads, bool *writes, uint8_t aw_bits,
> +                             uint32_t pasid)
> +{
> +    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
> +    uint32_t level = vtd_get_iova_level(s, ce, pasid);
> +    uint32_t offset;
> +    uint64_t flpte;
> +

do we need to check the iova range as well like the SL path?

> +    while (true) {
> +        offset = vtd_iova_level_offset(iova, level);
> +        flpte = vtd_get_pte(addr, offset);
> +
> +        if (flpte == (uint64_t)-1) {
> +            if (level == vtd_get_iova_level(s, ce, pasid)) {
> +                /* Invalid programming of context-entry */
> +                return -VTD_FR_CONTEXT_ENTRY_INV;
> +            } else {
> +                return -VTD_FR_PAGING_ENTRY_INV;
> +            }
> +        }
> +        if (!vtd_flpte_present(flpte)) {
> +            *reads = false;
> +            *writes = false;
> +            return -VTD_FR_PAGING_ENTRY_INV;
> +        }
> +        *reads = true;
> +        *writes = (*writes) && (flpte & VTD_FL_RW_MASK);
> +        if (is_write && !(flpte & VTD_FL_RW_MASK)) {
> +            return -VTD_FR_WRITE;
> +        }
> +        if (vtd_flpte_nonzero_rsvd(flpte, level)) {
> +            error_report_once("%s: detected flpte reserved non-zero "
> +                              "iova=0x%" PRIx64 ", level=0x%" PRIx32
> +                              "flpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")",
> +                              __func__, iova, level, flpte, pasid);
> +            return -VTD_FR_PAGING_ENTRY_RSVD;
> +        }
> +
> +        if (vtd_is_last_pte(flpte, level)) {
> +            *flptep = flpte;
> +            *flpte_level = level;
> +            return 0;
> +        }
> +
> +        addr = vtd_get_pte_addr(flpte, aw_bits);
> +        level--;
> +    }

we also need to do the below check like the SL path.

     /*
      * From VT-d spec 3.14: Untranslated requests and translation
      * requests that result in an address in the interrupt range will be
      * blocked with condition code LGN.4 or SGN.8.
      */

> +}
> +
>   static void vtd_report_fault(IntelIOMMUState *s,
>                                int err, bool is_fpd_set,
>                                uint16_t source_id,
> @@ -2010,8 +2128,13 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>           }
>       }
>   
> -    ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
> -                               &reads, &writes, s->aw_bits, pasid);
> +    if (s->scalable_modern && s->root_scalable) {
> +        ret_fr = vtd_iova_to_flpte(s, &ce, addr, is_write, &pte, &level,
> +                                   &reads, &writes, s->aw_bits, pasid);
> +    } else {
> +        ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
> +                                   &reads, &writes, s->aw_bits, pasid);
> +    }
>       if (ret_fr) {
>           vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
>                            addr, is_write, pasid != PCI_NO_PASID, pasid);
> @@ -4239,6 +4362,21 @@ static void vtd_init(IntelIOMMUState *s)
>       vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
>                                                       x86_iommu->dt_supported);

VT-d spec has dropped TM since 3.2 (2020 Oct). May have a patch to drop it
in vIOMMU as well. :)

Change log in VT-d spec.

"
  Remove Transient Mapping (TM) field from second-level page-tables and 
treat the field
as Reserved(0).
"

>   
> +    /*
> +     * Rsvd field masks for fpte
> +     */
> +    vtd_fpte_rsvd[0] = ~0ULL;
> +    vtd_fpte_rsvd[1] = VTD_FPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
> +    vtd_fpte_rsvd[2] = VTD_FPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
> +    vtd_fpte_rsvd[3] = VTD_FPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
> +    vtd_fpte_rsvd[4] = VTD_FPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
> +
> +    vtd_fpte_rsvd_large[0] = ~0ULL;
> +    vtd_fpte_rsvd_large[1] = ~0ULL;
> +    vtd_fpte_rsvd_large[2] = VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(s->aw_bits);
> +    vtd_fpte_rsvd_large[3] = VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(s->aw_bits);
> +    vtd_fpte_rsvd_large[4] = ~0ULL;
> +

this looks to be different from the SL large definitions. Is it necessary
to set the [0]/[1] and [4] as the large index should only be 2 or 3?
BTW. Before patch 16 of this series, it's unclear whether FS1GP is
supported or not, wondering if you want to add the 1G related definitions
together with the FS1GP support in patch 16?

>       if (s->scalable_mode || s->snoop_control) {
>           vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
>           vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;

-- 
Regards,
Yi Liu

RE: [PATCH v3 06/17] intel_iommu: Implement stage-1 translation
Posted by Duan, Zhenzhong 1 month, 1 week ago

>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH v3 06/17] intel_iommu: Implement stage-1 translation
>
>On 2024/9/11 13:22, Zhenzhong Duan wrote:
>> From: Yi Liu <yi.l.liu@intel.com>
>>
>> This adds stage-1 page table walking to support stage-1 only
>> transltion in scalable modern mode.
>
>a typo. s/transltion/translation/

Will fix.

>>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Co-developed-by: Clément Mathieu--Drif <clement.mathieu--
>drif@eviden.com>
>> Signed-off-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>   hw/i386/intel_iommu_internal.h |  23 ++++++
>>   hw/i386/intel_iommu.c          | 146
>++++++++++++++++++++++++++++++++-
>>   2 files changed, 165 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu_internal.h
>b/hw/i386/intel_iommu_internal.h
>> index 1fa4add9e2..51e9b1fc43 100644
>> --- a/hw/i386/intel_iommu_internal.h
>> +++ b/hw/i386/intel_iommu_internal.h
>> @@ -433,6 +433,21 @@ typedef union VTDInvDesc VTDInvDesc;
>>           (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM |
>VTD_SL_TM)) : \
>>           (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
>>
>> +/* Rsvd field masks for fpte */
>> +#define VTD_FS_UPPER_IGNORED 0xfff0000000000000ULL
>> +#define VTD_FPTE_PAGE_L1_RSVD_MASK(aw) \
>> +        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
>> +#define VTD_FPTE_PAGE_L2_RSVD_MASK(aw) \
>> +        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
>> +#define VTD_FPTE_PAGE_L3_RSVD_MASK(aw) \
>> +        (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
>> +#define VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(aw) \
>> +        (0x3fffe000ULL | ~(VTD_HAW_MASK(aw) |
>VTD_FS_UPPER_IGNORED))
>> +#define VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(aw) \
>> +        (0x1fe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
>
>May we follow the same naming for the large page? e.g. LPAGE_L2,
>LPAGE_L3.
>Also follow the order of the SL definitions as well.

Sure, will do.

>
>> +#define VTD_FPTE_PAGE_L4_RSVD_MASK(aw) \
>> +        (0x80ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
>> +
>>   /* Masks for PIOTLB Invalidate Descriptor */
>>   #define VTD_INV_DESC_PIOTLB_G             (3ULL << 4)
>>   #define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
>> @@ -525,6 +540,14 @@ typedef struct VTDRootEntry VTDRootEntry;
>>   #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-
>address-width */
>>   #define VTD_SM_PASID_ENTRY_DID(val)    ((val) &
>VTD_DOMAIN_ID_MASK)
>>
>> +#define VTD_SM_PASID_ENTRY_FLPM          3ULL
>> +#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
>> +
>> +/* First Level Paging Structure */
>> +/* Masks for First Level Paging Entry */
>> +#define VTD_FL_P                    1ULL
>> +#define VTD_FL_RW_MASK              (1ULL << 1)
>> +
>>   /* Second Level Page Translation Pointer*/
>>   #define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
>>
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index a22bd43b98..6e31a8d383 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -48,6 +48,8 @@
>>
>>   /* pe operations */
>>   #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] &
>VTD_SM_PASID_ENTRY_PGTT)
>> +#define VTD_PE_GET_FL_LEVEL(pe) \
>> +    (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
>>   #define VTD_PE_GET_SL_LEVEL(pe) \
>>       (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
>>
>> @@ -755,6 +757,11 @@ static inline bool
>vtd_is_sl_level_supported(IntelIOMMUState *s, uint32_t level)
>>              (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
>>   }
>>
>> +static inline bool vtd_is_fl_level_supported(IntelIOMMUState *s,
>uint32_t level)
>> +{
>> +    return level == VTD_PML4_LEVEL;
>> +}
>> +
>>   /* Return true if check passed, otherwise false */
>>   static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
>>                                        VTDPASIDEntry *pe)
>> @@ -838,6 +845,11 @@ static int
>vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
>>               return -VTD_FR_PASID_TABLE_ENTRY_INV;
>>       }
>>
>> +    if (pgtt == VTD_SM_PASID_ENTRY_FLT &&
>> +        !vtd_is_fl_level_supported(s, VTD_PE_GET_FL_LEVEL(pe))) {
>> +            return -VTD_FR_PASID_TABLE_ENTRY_INV;
>> +    }
>> +
>>       return 0;
>>   }
>>
>> @@ -973,7 +985,11 @@ static uint32_t
>vtd_get_iova_level(IntelIOMMUState *s,
>>
>>       if (s->root_scalable) {
>>           vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
>> -        return VTD_PE_GET_SL_LEVEL(&pe);
>> +        if (s->scalable_modern) {
>> +            return VTD_PE_GET_FL_LEVEL(&pe);
>> +        } else {
>> +            return VTD_PE_GET_SL_LEVEL(&pe);
>> +        }
>>       }
>>
>>       return vtd_ce_get_level(ce);
>> @@ -1060,7 +1076,11 @@ static dma_addr_t
>vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
>>
>>       if (s->root_scalable) {
>>           vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
>> -        return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
>> +        if (s->scalable_modern) {
>> +            return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
>> +        } else {
>> +            return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
>> +        }
>>       }
>>
>>       return vtd_ce_get_slpt_base(ce);
>> @@ -1862,6 +1882,104 @@ out:
>>       trace_vtd_pt_enable_fast_path(source_id, success);
>>   }
>>
>> +/*
>> + * Rsvd field masks for fpte:
>> + *     vtd_fpte_rsvd 4k pages
>> + *     vtd_fpte_rsvd_large large pages
>> + *
>> + * We support only 4-level page tables.
>> + */
>> +#define VTD_FPTE_RSVD_LEN 5
>> +static uint64_t vtd_fpte_rsvd[VTD_FPTE_RSVD_LEN];
>> +static uint64_t vtd_fpte_rsvd_large[VTD_FPTE_RSVD_LEN];
>> +
>> +static bool vtd_flpte_nonzero_rsvd(uint64_t flpte, uint32_t level)
>> +{
>> +    uint64_t rsvd_mask;
>> +
>> +    /*
>> +     * We should have caught a guest-mis-programmed level earlier,
>> +     * via vtd_is_fl_level_supported.
>> +     */
>> +    assert(level < VTD_SPTE_RSVD_LEN);
>
>s/VTD_SPTE_RSVD_LEN/VTD_FPTE_RSVD_LEN/

Good catch, will fix.

>
>> +    /*
>> +     * Zero level doesn't exist. The smallest level is VTD_PT_LEVEL=1 and
>> +     * checked by vtd_is_last_pte().
>> +     */
>> +    assert(level);
>> +
>> +    if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) &&
>> +        (flpte & VTD_PT_PAGE_SIZE_MASK)) {
>> +        /* large page */
>> +        rsvd_mask = vtd_fpte_rsvd_large[level];
>> +    } else {
>> +        rsvd_mask = vtd_fpte_rsvd[level];
>> +    }
>> +
>> +    return flpte & rsvd_mask;
>> +}
>> +
>> +static inline bool vtd_flpte_present(uint64_t flpte)
>> +{
>> +    return !!(flpte & VTD_FL_P);
>> +}
>> +
>> +/*
>> + * Given the @iova, get relevant @flptep. @flpte_level will be the last
>level
>> + * of the translation, can be used for deciding the size of large page.
>> + */
>> +static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
>> +                             uint64_t iova, bool is_write,
>> +                             uint64_t *flptep, uint32_t *flpte_level,
>> +                             bool *reads, bool *writes, uint8_t aw_bits,
>> +                             uint32_t pasid)
>> +{
>> +    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
>> +    uint32_t level = vtd_get_iova_level(s, ce, pasid);
>> +    uint32_t offset;
>> +    uint64_t flpte;
>> +
>
>do we need to check the iova range as well like the SL path?

In patch7, vtd_iova_fl_check_canonical() already guarantees that.

>
>> +    while (true) {
>> +        offset = vtd_iova_level_offset(iova, level);
>> +        flpte = vtd_get_pte(addr, offset);
>> +
>> +        if (flpte == (uint64_t)-1) {
>> +            if (level == vtd_get_iova_level(s, ce, pasid)) {
>> +                /* Invalid programming of context-entry */
>> +                return -VTD_FR_CONTEXT_ENTRY_INV;
>> +            } else {
>> +                return -VTD_FR_PAGING_ENTRY_INV;
>> +            }
>> +        }
>> +        if (!vtd_flpte_present(flpte)) {
>> +            *reads = false;
>> +            *writes = false;
>> +            return -VTD_FR_PAGING_ENTRY_INV;
>> +        }
>> +        *reads = true;
>> +        *writes = (*writes) && (flpte & VTD_FL_RW_MASK);
>> +        if (is_write && !(flpte & VTD_FL_RW_MASK)) {
>> +            return -VTD_FR_WRITE;
>> +        }
>> +        if (vtd_flpte_nonzero_rsvd(flpte, level)) {
>> +            error_report_once("%s: detected flpte reserved non-zero "
>> +                              "iova=0x%" PRIx64 ", level=0x%" PRIx32
>> +                              "flpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")",
>> +                              __func__, iova, level, flpte, pasid);
>> +            return -VTD_FR_PAGING_ENTRY_RSVD;
>> +        }
>> +
>> +        if (vtd_is_last_pte(flpte, level)) {
>> +            *flptep = flpte;
>> +            *flpte_level = level;
>> +            return 0;
>> +        }
>> +
>> +        addr = vtd_get_pte_addr(flpte, aw_bits);
>> +        level--;
>> +    }
>
>we also need to do the below check like the SL path.
>
>     /*
>      * From VT-d spec 3.14: Untranslated requests and translation
>      * requests that result in an address in the interrupt range will be
>      * blocked with condition code LGN.4 or SGN.8.
>      */

Seems unnecessary. We have a memory region as->iommu_ir for the interrupt range,
so we never receive a translation request in the interrupt range.

See assert(!vtd_is_interrupt_addr(addr)) in vtd_do_iommu_translate().

>
>> +}
>> +
>>   static void vtd_report_fault(IntelIOMMUState *s,
>>                                int err, bool is_fpd_set,
>>                                uint16_t source_id,
>> @@ -2010,8 +2128,13 @@ static bool
>vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>>           }
>>       }
>>
>> -    ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
>> -                               &reads, &writes, s->aw_bits, pasid);
>> +    if (s->scalable_modern && s->root_scalable) {
>> +        ret_fr = vtd_iova_to_flpte(s, &ce, addr, is_write, &pte, &level,
>> +                                   &reads, &writes, s->aw_bits, pasid);
>> +    } else {
>> +        ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
>> +                                   &reads, &writes, s->aw_bits, pasid);
>> +    }
>>       if (ret_fr) {
>>           vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
>>                            addr, is_write, pasid != PCI_NO_PASID, pasid);
>> @@ -4239,6 +4362,21 @@ static void vtd_init(IntelIOMMUState *s)
>>       vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s-
>>aw_bits,
>>                                                       x86_iommu->dt_supported);
>
>VT-d spec has dropped TM since 3.2 (2020 Oct). May have a patch to drop it
>in vIOMMU as well. :)

Good suggestion, will do.

>
>Change log in VT-d spec.
>
>"
>  Remove Transient Mapping (TM) field from second-level page-tables and
>treat the field
>as Reserved(0).
>"
>
>>
>> +    /*
>> +     * Rsvd field masks for fpte
>> +     */
>> +    vtd_fpte_rsvd[0] = ~0ULL;
>> +    vtd_fpte_rsvd[1] = VTD_FPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
>> +    vtd_fpte_rsvd[2] = VTD_FPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
>> +    vtd_fpte_rsvd[3] = VTD_FPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
>> +    vtd_fpte_rsvd[4] = VTD_FPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
>> +
>> +    vtd_fpte_rsvd_large[0] = ~0ULL;
>> +    vtd_fpte_rsvd_large[1] = ~0ULL;
>> +    vtd_fpte_rsvd_large[2] = VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(s-
>>aw_bits);
>> +    vtd_fpte_rsvd_large[3] = VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(s-
>>aw_bits);
>> +    vtd_fpte_rsvd_large[4] = ~0ULL;
>> +
>
>this looks to be different with the SL large definitions. Is it necessary
>to set the [0]/[1] and [4] as the large index should only be 2 or 3?

Yes, will remove those unnecessary part.

>BTW. Before patch 16 of this series, it's unclear whether FS1GP is
>supported or not, wondering if you want to add the 1G related definitions
>together with the FS1GP support in patch 16?

FS1GP is supported by default for scalable modern mode after this patch;
patch 16 only adds a knob to disable it.

Thanks
Zhenzhong

>
>>       if (s->scalable_mode || s->snoop_control) {
>>           vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
>>           vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
>
>--
>Regards,
>Yi Liu
Re: [PATCH v3 06/17] intel_iommu: Implement stage-1 translation
Posted by Jason Wang 1 month, 2 weeks ago
On Wed, Sep 11, 2024 at 1:26 PM Zhenzhong Duan <zhenzhong.duan@intel.com> wrote:
>
> From: Yi Liu <yi.l.liu@intel.com>
>
> This adds stage-1 page table walking to support stage-1 only
> transltion in scalable modern mode.
>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Co-developed-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
> Signed-off-by: Clément Mathieu--Drif <clement.mathieu--drif@eviden.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks