[PATCH rfcv1 17/23] intel_iommu: implement first level translation

Zhenzhong Duan posted 23 patches 10 months, 2 weeks ago
[PATCH rfcv1 17/23] intel_iommu: implement first level translation
Posted by Zhenzhong Duan 10 months, 2 weeks ago
From: Yi Liu <yi.l.liu@intel.com>

This adds stage-1 page table walking to support stage-1 only
translation in scalable mode.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_internal.h |  16 +++
 hw/i386/intel_iommu.c          | 242 ++++++++++++++++++++++++++++++++-
 hw/i386/trace-events           |   2 +
 3 files changed, 258 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index dcf1410fcf..41b958cd5d 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -598,6 +598,22 @@ typedef struct VTDPIOTLBInvInfo VTDPIOTLBInvInfo;
 #define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
 #define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
 
+#define VTD_PASID_IOTLB_MAX_SIZE       1024 /* Max size of the hash table */
+
+/* Paging Structure common */
+#define VTD_FL_PT_PAGE_SIZE_MASK    (1ULL << 7) /* Entry maps a large page */
+/* Number of iova bits that index into one paging-structure level */
+#define VTD_FL_LEVEL_BITS           9
+
+/* First Level Paging Structure */
+#define VTD_FL_PT_LEVEL             1   /* Lowest level, always a leaf */
+#define VTD_FL_PT_ENTRY_NR          512 /* Entries per paging structure */
+
+/* Masks for First Level Paging Entry */
+#define VTD_FL_RW_MASK              (1ULL << 1) /* Entry permits writes */
+#define VTD_FL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+#define VTD_PASID_ENTRY_FPD         (1ULL << 1) /* Fault Processing Disable */
+
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7c24f8f677..1c21f40ccd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -78,6 +78,10 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
                                  VTDPASIDCacheInfo *pc_info);
 static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
                                   PCIBus *bus, uint16_t devfn);
+static VTDPASIDAddressSpace *vtd_add_find_pasid_as(IntelIOMMUState *s,
+                                                   PCIBus *bus,
+                                                   int devfn,
+                                                   uint32_t pasid);
 
 static void vtd_panic_require_caching_mode(void)
 {
@@ -1888,6 +1892,114 @@ out:
     trace_vtd_pt_enable_fast_path(source_id, success);
 }
 
+/* Bit position in an addr where the index for paging @level starts */
+static inline uint32_t vtd_flpt_level_shift(uint32_t level)
+{
+    assert(level != 0); /* levels are 1-based; 0 would underflow below */
+    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_FL_LEVEL_BITS;
+}
+
+static inline uint64_t vtd_flpt_level_page_mask(uint32_t level)
+{
+    return ~((1ULL << vtd_flpt_level_shift(level)) - 1); /* page frame bits */
+}
+
+static inline uint32_t vtd_pe_get_flpt_level(VTDPASIDEntry *pe)
+{
+    return 4 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM); /* FLPM 0 => 4 */
+}
+
+/*
+ * Given an iova and the level of paging structure, return the index of
+ * the entry that the iova selects within that level's table.
+ */
+static inline uint32_t vtd_iova_fl_level_offset(uint64_t iova, uint32_t level)
+{
+    return (iova >> vtd_flpt_level_shift(level)) &
+            ((1ULL << VTD_FL_LEVEL_BITS) - 1);
+}
+
+/* Read entry @index of the table at @base_addr; all-ones on read failure */
+static uint64_t vtd_get_flpte(dma_addr_t base_addr, uint32_t index)
+{
+    uint64_t raw;
+    dma_addr_t entry_addr;
+
+    assert(index < VTD_FL_PT_ENTRY_NR);
+    entry_addr = base_addr + index * sizeof(raw);
+
+    if (dma_memory_read(&address_space_memory, entry_addr, &raw,
+                        sizeof(raw), MEMTXATTRS_UNSPECIFIED)) {
+        /* Sentinel value that callers compare against */
+        return (uint64_t)-1;
+    }
+    return le64_to_cpu(raw);
+}
+
+static inline bool vtd_flpte_present(uint64_t flpte)
+{
+    return !!(flpte & 0x1); /* bit 0 is tested as the present flag */
+}
+
+/* Whether @flpte maps a page frame rather than pointing to a next table */
+static inline bool vtd_is_last_flpte(uint64_t flpte, uint32_t level)
+{
+    return level == VTD_FL_PT_LEVEL || (flpte & VTD_FL_PT_PAGE_SIZE_MASK);
+}
+
+static inline uint64_t vtd_get_flpte_addr(uint64_t flpte, uint8_t aw)
+{
+    return flpte & VTD_FL_PT_BASE_ADDR_MASK(aw); /* strip non-address bits */
+}
+
+/*
+ * Walk @pe's first-level table for @iova. Returns 0 with the leaf entry in
+ * @flptep and its level (i.e. page size) in @flpte_level, else -VTD_FR_*.
+ */
+static int vtd_iova_to_flpte(VTDPASIDEntry *pe, uint64_t iova, bool is_write,
+                             uint64_t *flptep, uint32_t *flpte_level,
+                             bool *reads, bool *writes, uint8_t aw_bits)
+{
+    dma_addr_t addr = vtd_pe_get_flpt_base(pe);
+    uint32_t level = vtd_pe_get_flpt_level(pe);
+    uint32_t offset;
+    uint64_t flpte;
+
+    while (true) {
+        offset = vtd_iova_fl_level_offset(iova, level);
+        flpte = vtd_get_flpte(addr, offset);
+        if (flpte == (uint64_t)-1) {
+            if (level == vtd_pe_get_flpt_level(pe)) {
+                /* Top table unreadable: bad table base in pasid-entry */
+                return -VTD_FR_CONTEXT_ENTRY_INV;
+            } else {
+                return -VTD_FR_PAGING_ENTRY_INV;
+            }
+        }
+
+        if (!vtd_flpte_present(flpte)) {
+            *reads = false;
+            *writes = false;
+            return -VTD_FR_PAGING_ENTRY_INV;
+        }
+
+        *reads = true;
+        *writes = (*writes) && (flpte & VTD_FL_RW_MASK); /* AND over levels */
+        if (is_write && !(flpte & VTD_FL_RW_MASK)) {
+            return -VTD_FR_WRITE;
+        }
+
+        if (vtd_is_last_flpte(flpte, level)) {
+            *flptep = flpte;
+            *flpte_level = level;
+            return 0;
+        }
+
+        addr = vtd_get_flpte_addr(flpte, aw_bits);
+        level--;
+    }
+}
+
 static void vtd_report_fault(IntelIOMMUState *s,
                              int err, bool is_fpd_set,
                              uint16_t source_id,
@@ -1904,6 +2016,105 @@ static void vtd_report_fault(IntelIOMMUState *s,
     }
 }
 
+/*
+ * Map dev to its pasid-entry, then walk the first-level table for @addr.
+ *
+ * Called from RCU critical section.
+ *
+ * @vtd_as: The untranslated address space
+ * @bus: The PCI bus of the device
+ * @devfn: The devfn, which combines the device and function numbers
+ * @addr: The address (iova) to be translated
+ * @is_write: The access is a write operation
+ * @entry: IOMMUTLBEntry that is filled with the translation result
+ *
+ * Returns true if translation is successful, otherwise false.
+ */
+static bool vtd_do_iommu_fl_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
+                                      uint8_t devfn, hwaddr addr, bool is_write,
+                                      IOMMUTLBEntry *entry)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDContextEntry ce;
+    VTDPASIDEntry pe = { 0 }; /* val[0] is read even when the lookup fails */
+    uint8_t bus_num = pci_bus_num(bus);
+    uint64_t flpte, page_mask;
+    uint32_t level;
+    uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn);
+    int ret;
+    bool is_fpd_set = false;
+    bool reads = true;
+    bool writes = true;
+    uint8_t access_flags;
+
+    /*
+     * We have standalone memory region for interrupt addresses, we
+     * should never receive translation requests in this region.
+     */
+    assert(!vtd_is_interrupt_addr(addr));
+
+    ret = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (ret) {
+        error_report_once("%s: detected translation failure 1 "
+                          "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+                          __func__, bus_num,
+                          VTD_PCI_SLOT(devfn),
+                          VTD_PCI_FUNC(devfn),
+                          addr);
+        return false;
+    }
+
+    vtd_iommu_lock(s);
+
+    ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, PCI_NO_PASID);
+    is_fpd_set = pe.val[0] & VTD_PASID_ENTRY_FPD;
+    if (ret) {
+        vtd_report_fault(s, -ret, is_fpd_set, source_id, addr, is_write,
+                         false, PCI_NO_PASID);
+        goto error;
+    }
+
+    /*
+     * We don't need to translate for pass-through pasid entries.
+     * Also, let's ignore IOTLB caching as well for PT devices.
+     */
+    if (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT) {
+        entry->iova = addr & VTD_PAGE_MASK_4K;
+        entry->translated_addr = entry->iova;
+        entry->addr_mask = ~VTD_PAGE_MASK_4K;
+        entry->perm = IOMMU_RW;
+        vtd_iommu_unlock(s);
+        return true;
+    }
+
+    ret = vtd_iova_to_flpte(&pe, addr, is_write, &flpte, &level,
+                            &reads, &writes, s->aw_bits);
+    if (ret) {
+        vtd_report_fault(s, -ret, is_fpd_set, source_id, addr, is_write,
+                         false, PCI_NO_PASID);
+        goto error;
+    }
+
+    page_mask = vtd_flpt_level_page_mask(level);
+    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
+
+    vtd_iommu_unlock(s);
+
+    entry->iova = addr & page_mask;
+    entry->translated_addr = vtd_get_flpte_addr(flpte, s->aw_bits) & page_mask;
+    entry->addr_mask = ~page_mask;
+    entry->perm = access_flags;
+    return true;
+
+error:
+    vtd_iommu_unlock(s);
+    entry->iova = 0;
+    entry->translated_addr = 0;
+    entry->addr_mask = 0;
+    entry->perm = IOMMU_NONE;
+    return false;
+}
+
 /* Map dev to context-entry then do a paging-structures walk to do a iommu
  * translation.
  *
@@ -4516,10 +4727,37 @@ static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
         .target_as = &address_space_memory,
     };
     bool success;
+    VTDContextEntry ce;
+    VTDPASIDEntry pe;
+    int ret = 0;
 
     if (likely(s->dmar_enabled)) {
-        success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
-                                         addr, flag & IOMMU_WO, &iotlb);
+        if (s->root_scalable) {
+            ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                           vtd_as->devfn, &ce);
+            ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, PCI_NO_PASID);
+            if (ret) {
+                error_report_once("%s: detected translation failure 1 "
+                                  "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+                                  __func__, pci_bus_num(vtd_as->bus),
+                                  VTD_PCI_SLOT(vtd_as->devfn),
+                                  VTD_PCI_FUNC(vtd_as->devfn),
+                                  addr);
+                return iotlb;
+            }
+            if (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_FLT) {
+                success = vtd_do_iommu_fl_translate(vtd_as, vtd_as->bus,
+                                                    vtd_as->devfn, addr,
+                                                    flag & IOMMU_WO, &iotlb);
+            } else {
+                success = vtd_do_iommu_translate(vtd_as, vtd_as->bus,
+                                                 vtd_as->devfn, addr,
+                                                 flag & IOMMU_WO, &iotlb);
+            }
+        } else {
+            success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
+                                             addr, flag & IOMMU_WO, &iotlb);
+        }
     } else {
         /* DMAR disabled, passthrough, use 4k-page*/
         iotlb.iova = addr & VTD_PAGE_MASK_4K;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 66f7c1ba59..00b27bc5b1 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -33,6 +33,8 @@ vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
 vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
 vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
 vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_pe_hit(uint32_t pasid, uint64_t val0, uint32_t gen) "IOTLB pasid hit pasid %"PRIu32" val[0] 0x%"PRIx64" gen %"PRIu32
+vtd_iotlb_pe_update(uint32_t pasid, uint64_t val0, uint32_t gen1, uint32_t gen2) "IOTLB pasid update pasid %"PRIu32" val[0] 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
 vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
 vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
 vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
-- 
2.34.1