[PATCH v5 12/21] intel_iommu: Handle PASID entry addition

Zhenzhong Duan posted 21 patches 2 months, 3 weeks ago
Maintainers: "Michael S. Tsirkin" <mst@redhat.com>, Jason Wang <jasowang@redhat.com>, Yi Liu <yi.l.liu@intel.com>, "Clément Mathieu--Drif" <clement.mathieu--drif@eviden.com>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <richard.henderson@linaro.org>, Eduardo Habkost <eduardo@habkost.net>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, Eric Auger <eric.auger@redhat.com>, Zhenzhong Duan <zhenzhong.duan@intel.com>
[PATCH v5 12/21] intel_iommu: Handle PASID entry addition
Posted by Zhenzhong Duan 2 months, 3 weeks ago
When guest creates new PASID entries, QEMU will capture the guest pasid
selective pasid cache invalidation, walk through each passthrough device
and each pasid, when a match is found, identify an existing vtd_as or
create a new one and update its corresponding cached pasid entry.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_internal.h |   2 +
 hw/i386/intel_iommu.c          | 176 ++++++++++++++++++++++++++++++++-
 2 files changed, 175 insertions(+), 3 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index b9b76dd996..fb2a919e87 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -559,6 +559,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_CTX_ENTRY_LEGACY_SIZE     16
 #define VTD_CTX_ENTRY_SCALABLE_SIZE   32
 
+#define VTD_SM_CONTEXT_ENTRY_PDTS(x)        extract64((x)->val[0], 9, 3)
 #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
@@ -589,6 +590,7 @@ typedef struct VTDPASIDCacheInfo {
 #define VTD_PASID_TABLE_BITS_MASK     (0x3fULL)
 #define VTD_PASID_TABLE_INDEX(pasid)  ((pasid) & VTD_PASID_TABLE_BITS_MASK)
 #define VTD_PASID_ENTRY_FPD           (1ULL << 1) /* Fault Processing Disable */
+#define VTD_PASID_TBL_ENTRY_NUM       (1ULL << 6)
 
 /* PASID Granular Translation Type Mask */
 #define VTD_PASID_ENTRY_P              1ULL
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a2ee6d684e..7d2c9feae7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -826,6 +826,11 @@ static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
     }
 }
 
+static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
+{
+    return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
     return pdire->val & 1;
@@ -1647,9 +1652,9 @@ static gboolean vtd_find_as_by_sid_and_iommu_pasid(gpointer key, gpointer value,
 }
 
 /* Translate iommu pasid to vtd_as */
-static inline
-VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
-                                                uint16_t sid, uint32_t pasid)
+static VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
+                                                       uint16_t sid,
+                                                       uint32_t pasid)
 {
     struct vtd_as_raw_key key = {
         .sid = sid,
@@ -3220,10 +3225,172 @@ remove:
     return true;
 }
 
+/*
+ * This function walks over PASID range within [start, end) in a single
+ * PASID table for entries matching @info type/did, then retrieve/create
+ * vtd_as and fill associated pasid entry cache.
+ */
+static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
+                                        dma_addr_t pt_base,
+                                        int start,
+                                        int end,
+                                        VTDPASIDCacheInfo *info)
+{
+    VTDPASIDEntry pe;
+    int pasid = start;
+
+    while (pasid < end) {
+        if (!vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe)
+            && vtd_pe_present(&pe)) {
+            int bus_n = pci_bus_num(info->bus), devfn = info->devfn;
+            uint16_t sid = PCI_BUILD_BDF(bus_n, devfn);
+            VTDPASIDCacheEntry *pc_entry;
+            VTDAddressSpace *vtd_as;
+
+            vtd_iommu_lock(s);
+            /*
+             * When indexed by rid2pasid, vtd_as should have been created,
+             * e.g., by PCI subsystem. For other iommu pasid, we need to
+             * create vtd_as dynamically. Other iommu pasid is same value
+             * as PCI's pasid, so it's used as input of vtd_find_add_as().
+             */
+            vtd_as = vtd_as_from_iommu_pasid_locked(s, sid, pasid);
+            vtd_iommu_unlock(s);
+            if (!vtd_as) {
+                vtd_as = vtd_find_add_as(s, info->bus, devfn, pasid);
+            }
+
+            if ((info->type == VTD_PASID_CACHE_DOMSI ||
+                 info->type == VTD_PASID_CACHE_PASIDSI) &&
+                (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) {
+                /*
+                 * VTD_PASID_CACHE_DOMSI and VTD_PASID_CACHE_PASIDSI
+                 * requires domain id check. If domain id check fail,
+                 * go to next pasid.
+                 */
+                pasid++;
+                continue;
+            }
+
+            pc_entry = &vtd_as->pasid_cache_entry;
+            /*
+             * pasid cache update and clear are handled in
+             * vtd_flush_pasid_locked(), only care new pasid entry here.
+             */
+            if (!pc_entry->valid) {
+                pc_entry->pasid_entry = pe;
+                pc_entry->valid = true;
+            }
+        }
+        pasid++;
+    }
+}
+
+/*
+ * In VT-d scalable mode translation, PASID dir + PASID table is used.
+ * This function aims at looping over a range of PASIDs in the given
+ * two level table to identify the pasid config in guest.
+ */
+static void vtd_sm_pasid_table_walk(IntelIOMMUState *s,
+                                    dma_addr_t pdt_base,
+                                    int start, int end,
+                                    VTDPASIDCacheInfo *info)
+{
+    VTDPASIDDirEntry pdire;
+    int pasid = start;
+    int pasid_next;
+    dma_addr_t pt_base;
+
+    while (pasid < end) {
+        pasid_next =
+             (pasid + VTD_PASID_TBL_ENTRY_NUM) & ~(VTD_PASID_TBL_ENTRY_NUM - 1);
+        pasid_next = pasid_next < end ? pasid_next : end;
+
+        if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire)
+            && vtd_pdire_present(&pdire)) {
+            pt_base = pdire.val & VTD_PASID_TABLE_BASE_ADDR_MASK;
+            vtd_sm_pasid_table_walk_one(s, pt_base, pasid, pasid_next, info);
+        }
+        pasid = pasid_next;
+    }
+}
+
+static void vtd_replay_pasid_bind_for_dev(IntelIOMMUState *s,
+                                          int start, int end,
+                                          VTDPASIDCacheInfo *info)
+{
+    VTDContextEntry ce;
+
+    if (!vtd_dev_to_context_entry(s, pci_bus_num(info->bus), info->devfn,
+                                  &ce)) {
+        uint32_t max_pasid;
+
+        max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) * VTD_PASID_TBL_ENTRY_NUM;
+        if (end > max_pasid) {
+            end = max_pasid;
+        }
+        vtd_sm_pasid_table_walk(s,
+                                VTD_CE_GET_PASID_DIR_TABLE(&ce),
+                                start,
+                                end,
+                                info);
+    }
+}
+
+/*
+ * This function replays the guest pasid bindings by walking the two level
+ * guest PASID table. For each valid pasid entry, it finds or creates a
+ * vtd_as and caches pasid entry in vtd_as.
+ */
+static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
+                                            VTDPASIDCacheInfo *pc_info)
+{
+    /*
+     * Currently only Requests-without-PASID is supported, as vIOMMU doesn't
+     * support RPS(RID-PASID Support), pasid scope is fixed to [0, 1).
+     */
+    int start = 0, end = 1;
+    VTDHostIOMMUDevice *vtd_hiod;
+    VTDPASIDCacheInfo walk_info;
+    GHashTableIter as_it;
+
+    switch (pc_info->type) {
+    case VTD_PASID_CACHE_PASIDSI:
+        start = pc_info->pasid;
+        end = pc_info->pasid + 1;
+       /* fall through */
+    case VTD_PASID_CACHE_DOMSI:
+    case VTD_PASID_CACHE_GLOBAL_INV:
+        /* loop all assigned devices */
+        break;
+    default:
+        error_setg(&error_fatal, "invalid pc_info->type for replay");
+    }
+
+    /*
+     * In this replay, one only needs to care about the devices which are
+     * backed by host IOMMU. Those devices have a corresponding vtd_hiod
+     * in s->vtd_host_iommu_dev. For devices not backed by host IOMMU, it
+     * is not necessary to replay the bindings since their cache could be
+     * re-created in the future DMA address translation.
+     *
+     * VTD translation callback never accesses vtd_hiod and its corresponding
+     * cached pasid entry, so no iommu lock needed here.
+     */
+    walk_info = *pc_info;
+    g_hash_table_iter_init(&as_it, s->vtd_host_iommu_dev);
+    while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_hiod)) {
+        walk_info.bus = vtd_hiod->bus;
+        walk_info.devfn = vtd_hiod->devfn;
+        vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
+    }
+}
+
 /*
  * For a PASID cache invalidation, this function handles below scenarios:
  * a) a present cached pasid entry needs to be removed
  * b) a present cached pasid entry needs to be updated
+ * c) a present cached pasid entry needs to be created
  */
 static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
 {
@@ -3239,6 +3406,9 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
     g_hash_table_foreach_remove(s->vtd_address_spaces, vtd_flush_pasid_locked,
                                 pc_info);
     vtd_iommu_unlock(s);
+
+    /* c): loop all passthrough device for new pasid entries */
+    vtd_replay_guest_pasid_bindings(s, pc_info);
 }
 
 static bool vtd_process_pasid_desc(IntelIOMMUState *s,
-- 
2.47.1
Re: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
Posted by Yi Liu 2 months, 2 weeks ago
On 2025/8/22 14:40, Zhenzhong Duan wrote:
> When guest creates new PASID entries, QEMU will capture the guest pasid
> selective pasid cache invalidation, walk through each passthrough device
> and each pasid, when a match is found, identify an existing vtd_as or
> create a new one and update its corresponding cached pasid entry.
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>   hw/i386/intel_iommu_internal.h |   2 +
>   hw/i386/intel_iommu.c          | 176 ++++++++++++++++++++++++++++++++-
>   2 files changed, 175 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index b9b76dd996..fb2a919e87 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -559,6 +559,7 @@ typedef struct VTDRootEntry VTDRootEntry;
>   #define VTD_CTX_ENTRY_LEGACY_SIZE     16
>   #define VTD_CTX_ENTRY_SCALABLE_SIZE   32
>   
> +#define VTD_SM_CONTEXT_ENTRY_PDTS(x)        extract64((x)->val[0], 9, 3)
>   #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
>   #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
>   #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
> @@ -589,6 +590,7 @@ typedef struct VTDPASIDCacheInfo {
>   #define VTD_PASID_TABLE_BITS_MASK     (0x3fULL)
>   #define VTD_PASID_TABLE_INDEX(pasid)  ((pasid) & VTD_PASID_TABLE_BITS_MASK)
>   #define VTD_PASID_ENTRY_FPD           (1ULL << 1) /* Fault Processing Disable */
> +#define VTD_PASID_TBL_ENTRY_NUM       (1ULL << 6)
>   
>   /* PASID Granular Translation Type Mask */
>   #define VTD_PASID_ENTRY_P              1ULL
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index a2ee6d684e..7d2c9feae7 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -826,6 +826,11 @@ static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
>       }
>   }
>   
> +static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
> +{
> +    return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
> +}
> +
>   static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>   {
>       return pdire->val & 1;
> @@ -1647,9 +1652,9 @@ static gboolean vtd_find_as_by_sid_and_iommu_pasid(gpointer key, gpointer value,
>   }
>   
>   /* Translate iommu pasid to vtd_as */
> -static inline
> -VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
> -                                                uint16_t sid, uint32_t pasid)
> +static VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
> +                                                       uint16_t sid,
> +                                                       uint32_t pasid)
>   {
>       struct vtd_as_raw_key key = {
>           .sid = sid,

this hunk can be merged with the patch that introduces 
vtd_as_from_iommu_pasid_locked.

> @@ -3220,10 +3225,172 @@ remove:
>       return true;
>   }
>   
> +/*
> + * This function walks over PASID range within [start, end) in a single
> + * PASID table for entries matching @info type/did, then retrieve/create
> + * vtd_as and fill associated pasid entry cache.
> + */
> +static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
> +                                        dma_addr_t pt_base,
> +                                        int start,
> +                                        int end,
> +                                        VTDPASIDCacheInfo *info)
> +{
> +    VTDPASIDEntry pe;
> +    int pasid = start;
> +
> +    while (pasid < end) {
> +        if (!vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe)
> +            && vtd_pe_present(&pe)) {
> +            int bus_n = pci_bus_num(info->bus), devfn = info->devfn;
> +            uint16_t sid = PCI_BUILD_BDF(bus_n, devfn);
> +            VTDPASIDCacheEntry *pc_entry;
> +            VTDAddressSpace *vtd_as;
> +
> +            vtd_iommu_lock(s);
> +            /*
> +             * When indexed by rid2pasid, vtd_as should have been created,
> +             * e.g., by PCI subsystem. For other iommu pasid, we need to
> +             * create vtd_as dynamically. Other iommu pasid is same value
> +             * as PCI's pasid, so it's used as input of vtd_find_add_as().
> +             */
> +            vtd_as = vtd_as_from_iommu_pasid_locked(s, sid, pasid);
> +            vtd_iommu_unlock(s);
> +            if (!vtd_as) {
> +                vtd_as = vtd_find_add_as(s, info->bus, devfn, pasid);
> +            }
> +
> +            if ((info->type == VTD_PASID_CACHE_DOMSI ||
> +                 info->type == VTD_PASID_CACHE_PASIDSI) &&
> +                (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) {
> +                /*
> +                 * VTD_PASID_CACHE_DOMSI and VTD_PASID_CACHE_PASIDSI
> +                 * requires domain id check. If domain id check fail,
> +                 * go to next pasid.
> +                 */
> +                pasid++;
> +                continue;
> +            }
> +
> +            pc_entry = &vtd_as->pasid_cache_entry;
> +            /*
> +             * pasid cache update and clear are handled in
> +             * vtd_flush_pasid_locked(), only care new pasid entry here.
> +             */
> +            if (!pc_entry->valid) {
> +                pc_entry->pasid_entry = pe;
> +                pc_entry->valid = true;
> +            }
> +        }
> +        pasid++;
> +    }
> +}
> +
> +/*
> + * In VT-d scalable mode translation, PASID dir + PASID table is used.

remove translation.

> + * This function aims at looping over a range of PASIDs in the given
> + * two level table to identify the pasid config in guest.
> + */
> +static void vtd_sm_pasid_table_walk(IntelIOMMUState *s,
> +                                    dma_addr_t pdt_base,
> +                                    int start, int end,
> +                                    VTDPASIDCacheInfo *info)
> +{
> +    VTDPASIDDirEntry pdire;
> +    int pasid = start;
> +    int pasid_next;
> +    dma_addr_t pt_base;
> +
> +    while (pasid < end) {
> +        pasid_next =
> +             (pasid + VTD_PASID_TBL_ENTRY_NUM) & ~(VTD_PASID_TBL_ENTRY_NUM - 1);
> +        pasid_next = pasid_next < end ? pasid_next : end;
> +
> +        if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire)
> +            && vtd_pdire_present(&pdire)) {
> +            pt_base = pdire.val & VTD_PASID_TABLE_BASE_ADDR_MASK;
> +            vtd_sm_pasid_table_walk_one(s, pt_base, pasid, pasid_next, info);
> +        }
> +        pasid = pasid_next;
> +    }
> +}
> +
> +static void vtd_replay_pasid_bind_for_dev(IntelIOMMUState *s,
> +                                          int start, int end,
> +                                          VTDPASIDCacheInfo *info)
> +{
> +    VTDContextEntry ce;
> +
> +    if (!vtd_dev_to_context_entry(s, pci_bus_num(info->bus), info->devfn,
> +                                  &ce)) {
> +        uint32_t max_pasid;
> +
> +        max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) * VTD_PASID_TBL_ENTRY_NUM;
> +        if (end > max_pasid) {
> +            end = max_pasid;
> +        }
> +        vtd_sm_pasid_table_walk(s,
> +                                VTD_CE_GET_PASID_DIR_TABLE(&ce),
> +                                start,
> +                                end,
> +                                info);
> +    }
> +}
> +
> +/*
> + * This function replays the guest pasid bindings by walking the two level
> + * guest PASID table. For each valid pasid entry, it finds or creates a
> + * vtd_as and caches pasid entry in vtd_as.
> + */
> +static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
> +                                            VTDPASIDCacheInfo *pc_info)
> +{
> +    /*
> +     * Currently only Requests-without-PASID is supported, as vIOMMU doesn't
> +     * support RPS(RID-PASID Support), pasid scope is fixed to [0, 1).

RPS is not the reason for only requests-without-PASID. RPS is 0 means
rid_pasid is fixed to be 0. While if RPS is 1, rid_pasid can be other
values. This is different with requests-with-PASID. If only aims for
supporting requests-without-PASID when RPS is 0, we may want to re-org
the code here. No need to pretend to loop a range of PASIDs just use
pasid#0 as we know rid_pasid is 0.

To me, this series should be able to support non-rid_pasid PASIDs.

> +     */
> +    int start = 0, end = 1;
> +    VTDHostIOMMUDevice *vtd_hiod;
> +    VTDPASIDCacheInfo walk_info;
> +    GHashTableIter as_it;
> +
> +    switch (pc_info->type) {
> +    case VTD_PASID_CACHE_PASIDSI:
> +        start = pc_info->pasid;
> +        end = pc_info->pasid + 1;
> +       /* fall through */
> +    case VTD_PASID_CACHE_DOMSI:
> +    case VTD_PASID_CACHE_GLOBAL_INV:
> +        /* loop all assigned devices */
> +        break;
> +    default:
> +        error_setg(&error_fatal, "invalid pc_info->type for replay");
> +    }
> +
> +    /*
> +     * In this replay, one only needs to care about the devices which are
> +     * backed by host IOMMU. Those devices have a corresponding vtd_hiod
> +     * in s->vtd_host_iommu_dev. For devices not backed by host IOMMU, it
> +     * is not necessary to replay the bindings since their cache could be
> +     * re-created in the future DMA address translation.
> +     *
> +     * VTD translation callback never accesses vtd_hiod and its corresponding
> +     * cached pasid entry, so no iommu lock needed here.
> +     */
> +    walk_info = *pc_info;
> +    g_hash_table_iter_init(&as_it, s->vtd_host_iommu_dev);
> +    while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_hiod)) {
> +        walk_info.bus = vtd_hiod->bus;
> +        walk_info.devfn = vtd_hiod->devfn;
> +        vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
> +    }
> +}
> +
>   /*
>    * For a PASID cache invalidation, this function handles below scenarios:
>    * a) a present cached pasid entry needs to be removed
>    * b) a present cached pasid entry needs to be updated
> + * c) a present cached pasid entry needs to be created
>    */
>   static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
>   {
> @@ -3239,6 +3406,9 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
>       g_hash_table_foreach_remove(s->vtd_address_spaces, vtd_flush_pasid_locked,
>                                   pc_info);
>       vtd_iommu_unlock(s);
> +
> +    /* c): loop all passthrough device for new pasid entries */
> +    vtd_replay_guest_pasid_bindings(s, pc_info);
>   }
>   
>   static bool vtd_process_pasid_desc(IntelIOMMUState *s,
Re: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
Posted by Eric Auger 2 months, 2 weeks ago
Hi Zhenzhong,

On 8/22/25 8:40 AM, Zhenzhong Duan wrote:
> When guest creates new PASID entries, QEMU will capture the guest pasid
> selective pasid cache invalidation, walk through each passthrough device
> and each pasid, when a match is found, identify an existing vtd_as or
> create a new one and update its corresponding cached pasid entry.

You need to emphasize that the support is currently limited to
Requests-without-PASID
>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>  hw/i386/intel_iommu_internal.h |   2 +
>  hw/i386/intel_iommu.c          | 176 ++++++++++++++++++++++++++++++++-
>  2 files changed, 175 insertions(+), 3 deletions(-)
>
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index b9b76dd996..fb2a919e87 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -559,6 +559,7 @@ typedef struct VTDRootEntry VTDRootEntry;
>  #define VTD_CTX_ENTRY_LEGACY_SIZE     16
>  #define VTD_CTX_ENTRY_SCALABLE_SIZE   32
>  
> +#define VTD_SM_CONTEXT_ENTRY_PDTS(x)        extract64((x)->val[0], 9, 3)
>  #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
> @@ -589,6 +590,7 @@ typedef struct VTDPASIDCacheInfo {
>  #define VTD_PASID_TABLE_BITS_MASK     (0x3fULL)
>  #define VTD_PASID_TABLE_INDEX(pasid)  ((pasid) & VTD_PASID_TABLE_BITS_MASK)
>  #define VTD_PASID_ENTRY_FPD           (1ULL << 1) /* Fault Processing Disable */
> +#define VTD_PASID_TBL_ENTRY_NUM       (1ULL << 6)
>  
>  /* PASID Granular Translation Type Mask */
>  #define VTD_PASID_ENTRY_P              1ULL
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index a2ee6d684e..7d2c9feae7 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -826,6 +826,11 @@ static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
>      }
>  }
>  
> +static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
> +{
> +    return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
> +}
> +
>  static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>  {
>      return pdire->val & 1;
> @@ -1647,9 +1652,9 @@ static gboolean vtd_find_as_by_sid_and_iommu_pasid(gpointer key, gpointer value,
>  }
>  
>  /* Translate iommu pasid to vtd_as */
> -static inline
> -VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
> -                                                uint16_t sid, uint32_t pasid)
> +static VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
> +                                                       uint16_t sid,
> +                                                       uint32_t pasid)
>  {
>      struct vtd_as_raw_key key = {
>          .sid = sid,
> @@ -3220,10 +3225,172 @@ remove:
>      return true;
>  }
>  
> +/*
> + * This function walks over PASID range within [start, end) in a single
> + * PASID table for entries matching @info type/did, then retrieve/create
> + * vtd_as and fill associated pasid entry cache.
> + */
> +static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
> +                                        dma_addr_t pt_base,
> +                                        int start,
> +                                        int end,
> +                                        VTDPASIDCacheInfo *info)
> +{
> +    VTDPASIDEntry pe;
> +    int pasid = start;
> +
> +    while (pasid < end) {
> +        if (!vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe)
> +            && vtd_pe_present(&pe)) {
> +            int bus_n = pci_bus_num(info->bus), devfn = info->devfn;
> +            uint16_t sid = PCI_BUILD_BDF(bus_n, devfn);
> +            VTDPASIDCacheEntry *pc_entry;
> +            VTDAddressSpace *vtd_as;
> +
> +            vtd_iommu_lock(s);
> +            /*
> +             * When indexed by rid2pasid, vtd_as should have been created,
> +             * e.g., by PCI subsystem. For other iommu pasid, we need to
> +             * create vtd_as dynamically. Other iommu pasid is same value
since you don't support somthing else than rid2pasid, I would drop that
and simplify the code. See below.
> +             * as PCI's pasid, so it's used as input of vtd_find_add_as().
> +             */
> +            vtd_as = vtd_as_from_iommu_pasid_locked(s, sid, pasid);
> +            vtd_iommu_unlock(s);
> +            if (!vtd_as) {
> +                vtd_as = vtd_find_add_as(s, info->bus, devfn, pasid);
you could check the vtd_as already exists here per the rid2pasid support
limitation
> +            }
> +
> +            if ((info->type == VTD_PASID_CACHE_DOMSI ||
> +                 info->type == VTD_PASID_CACHE_PASIDSI) &&
> +                (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) {
> +                /*
> +                 * VTD_PASID_CACHE_DOMSI and VTD_PASID_CACHE_PASIDSI
> +                 * requires domain id check. If domain id check fail,
fails
> +                 * go to next pasid.
> +                 */
> +                pasid++;
> +                continue;
> +            }
> +
> +            pc_entry = &vtd_as->pasid_cache_entry;
> +            /*
> +             * pasid cache update and clear are handled in
> +             * vtd_flush_pasid_locked(), only care new pasid entry here.
> +             */
> +            if (!pc_entry->valid) {
> +                pc_entry->pasid_entry = pe;
> +                pc_entry->valid = true;
> +            }
> +        }
> +        pasid++;
> +    }
> +}
> +
> +/*
> + * In VT-d scalable mode translation, PASID dir + PASID table is used.
> + * This function aims at looping over a range of PASIDs in the given
> + * two level table to identify the pasid config in guest.
> + */
> +static void vtd_sm_pasid_table_walk(IntelIOMMUState *s,
> +                                    dma_addr_t pdt_base,
> +                                    int start, int end,
> +                                    VTDPASIDCacheInfo *info)
> +{
> +    VTDPASIDDirEntry pdire;
> +    int pasid = start;
> +    int pasid_next;
> +    dma_addr_t pt_base;
> +
> +    while (pasid < end) {
> +        pasid_next =
> +             (pasid + VTD_PASID_TBL_ENTRY_NUM) & ~(VTD_PASID_TBL_ENTRY_NUM - 1);
> +        pasid_next = pasid_next < end ? pasid_next : end;
> +
> +        if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire)
> +            && vtd_pdire_present(&pdire)) {
> +            pt_base = pdire.val & VTD_PASID_TABLE_BASE_ADDR_MASK;
> +            vtd_sm_pasid_table_walk_one(s, pt_base, pasid, pasid_next, info);
> +        }
> +        pasid = pasid_next;
> +    }
> +}
> +
> +static void vtd_replay_pasid_bind_for_dev(IntelIOMMUState *s,
> +                                          int start, int end,
> +                                          VTDPASIDCacheInfo *info)
> +{
> +    VTDContextEntry ce;
> +
> +    if (!vtd_dev_to_context_entry(s, pci_bus_num(info->bus), info->devfn,
> +                                  &ce)) {
> +        uint32_t max_pasid;
> +
> +        max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) * VTD_PASID_TBL_ENTRY_NUM;
> +        if (end > max_pasid) {
> +            end = max_pasid;
> +        }
> +        vtd_sm_pasid_table_walk(s,
> +                                VTD_CE_GET_PASID_DIR_TABLE(&ce),
> +                                start,
> +                                end,
> +                                info);
> +    }
> +}
> +
> +/*
> + * This function replays the guest pasid bindings by walking the two level
> + * guest PASID table. For each valid pasid entry, it finds or creates a
> + * vtd_as and caches pasid entry in vtd_as.
> + */
> +static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
> +                                            VTDPASIDCacheInfo *pc_info)
> +{
> +    /*
> +     * Currently only Requests-without-PASID is supported, as vIOMMU doesn't
> +     * support RPS(RID-PASID Support), pasid scope is fixed to [0, 1).
> +     */
> +    int start = 0, end = 1;
> +    VTDHostIOMMUDevice *vtd_hiod;
> +    VTDPASIDCacheInfo walk_info;
> +    GHashTableIter as_it;
> +
> +    switch (pc_info->type) {
> +    case VTD_PASID_CACHE_PASIDSI:
> +        start = pc_info->pasid;
> +        end = pc_info->pasid + 1;
if you never replay a range, you could simplify the code for now because
some code paths are not properly tested
> +       /* fall through */
> +    case VTD_PASID_CACHE_DOMSI:
Why can't we have other invalidation types along with request without
PASID? It is not obvious to me at least why it couldn't be used by the
guest. Would deserve a comment in the commit desc I think.
> +    case VTD_PASID_CACHE_GLOBAL_INV:
> +        /* loop all assigned devices */
> +        break;
> +    default:
> +        error_setg(&error_fatal, "invalid pc_info->type for replay");
> +    }
> +
> +    /*
> +     * In this replay, one only needs to care about the devices which are
> +     * backed by host IOMMU. Those devices have a corresponding vtd_hiod
> +     * in s->vtd_host_iommu_dev. For devices not backed by host IOMMU, it
> +     * is not necessary to replay the bindings since their cache could be
> +     * re-created in the future DMA address translation.
> +     *
> +     * VTD translation callback never accesses vtd_hiod and its corresponding
> +     * cached pasid entry, so no iommu lock needed here.
> +     */
> +    walk_info = *pc_info;
> +    g_hash_table_iter_init(&as_it, s->vtd_host_iommu_dev);
> +    while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_hiod)) {
> +        walk_info.bus = vtd_hiod->bus;
> +        walk_info.devfn = vtd_hiod->devfn;
> +        vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
> +    }
> +}
> +
>  /*
>   * For a PASID cache invalidation, this function handles below scenarios:
>   * a) a present cached pasid entry needs to be removed
>   * b) a present cached pasid entry needs to be updated
> + * c) a present cached pasid entry needs to be created
>   */
>  static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
>  {
> @@ -3239,6 +3406,9 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
>      g_hash_table_foreach_remove(s->vtd_address_spaces, vtd_flush_pasid_locked,
>                                  pc_info);
>      vtd_iommu_unlock(s);
> +
> +    /* c): loop all passthrough device for new pasid entries */
> +    vtd_replay_guest_pasid_bindings(s, pc_info);
>  }
>  
>  static bool vtd_process_pasid_desc(IntelIOMMUState *s,
Thanks

Eric
RE: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
Posted by Duan, Zhenzhong 2 months, 2 weeks ago
Hi Eric,

>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Subject: Re: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
>
>Hi Zhenzhong,
>
>On 8/22/25 8:40 AM, Zhenzhong Duan wrote:
>> When guest creates new PASID entries, QEMU will capture the guest pasid
>> selective pasid cache invalidation, walk through each passthrough device
>> and each pasid, when a match is found, identify an existing vtd_as or
>> create a new one and update its corresponding cached pasid entry.
>
>You need to emphasize that the support is currently limited to
>Requests-without-PASID

OK.

>>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>  hw/i386/intel_iommu_internal.h |   2 +
>>  hw/i386/intel_iommu.c          | 176
>++++++++++++++++++++++++++++++++-
>>  2 files changed, 175 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu_internal.h
>b/hw/i386/intel_iommu_internal.h
>> index b9b76dd996..fb2a919e87 100644
>> --- a/hw/i386/intel_iommu_internal.h
>> +++ b/hw/i386/intel_iommu_internal.h
>> @@ -559,6 +559,7 @@ typedef struct VTDRootEntry VTDRootEntry;
>>  #define VTD_CTX_ENTRY_LEGACY_SIZE     16
>>  #define VTD_CTX_ENTRY_SCALABLE_SIZE   32
>>
>> +#define VTD_SM_CONTEXT_ENTRY_PDTS(x)        extract64((x)->val[0],
>9, 3)
>>  #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
>>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL |
>~VTD_HAW_MASK(aw))
>>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1
>0xffffffffffe00000ULL
>> @@ -589,6 +590,7 @@ typedef struct VTDPASIDCacheInfo {
>>  #define VTD_PASID_TABLE_BITS_MASK     (0x3fULL)
>>  #define VTD_PASID_TABLE_INDEX(pasid)  ((pasid) &
>VTD_PASID_TABLE_BITS_MASK)
>>  #define VTD_PASID_ENTRY_FPD           (1ULL << 1) /* Fault
>Processing Disable */
>> +#define VTD_PASID_TBL_ENTRY_NUM       (1ULL << 6)
>>
>>  /* PASID Granular Translation Type Mask */
>>  #define VTD_PASID_ENTRY_P              1ULL
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index a2ee6d684e..7d2c9feae7 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -826,6 +826,11 @@ static inline bool
>vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
>>      }
>>  }
>>
>> +static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry
>*ce)
>> +{
>> +    return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
>> +}
>> +
>>  static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>>  {
>>      return pdire->val & 1;
>> @@ -1647,9 +1652,9 @@ static gboolean
>vtd_find_as_by_sid_and_iommu_pasid(gpointer key, gpointer value,
>>  }
>>
>>  /* Translate iommu pasid to vtd_as */
>> -static inline
>> -VTDAddressSpace *vtd_as_from_iommu_pasid_locked(IntelIOMMUState
>*s,
>> -                                                uint16_t sid,
>uint32_t pasid)
>> +static VTDAddressSpace
>*vtd_as_from_iommu_pasid_locked(IntelIOMMUState *s,
>> +
>uint16_t sid,
>> +
>uint32_t pasid)
>>  {
>>      struct vtd_as_raw_key key = {
>>          .sid = sid,
>> @@ -3220,10 +3225,172 @@ remove:
>>      return true;
>>  }
>>
>> +/*
>> + * This function walks over PASID range within [start, end) in a single
>> + * PASID table for entries matching @info type/did, then retrieve/create
>> + * vtd_as and fill associated pasid entry cache.
>> + */
>> +static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
>> +                                        dma_addr_t pt_base,
>> +                                        int start,
>> +                                        int end,
>> +                                        VTDPASIDCacheInfo
>*info)
>> +{
>> +    VTDPASIDEntry pe;
>> +    int pasid = start;
>> +
>> +    while (pasid < end) {
>> +        if (!vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe)
>> +            && vtd_pe_present(&pe)) {
>> +            int bus_n = pci_bus_num(info->bus), devfn = info->devfn;
>> +            uint16_t sid = PCI_BUILD_BDF(bus_n, devfn);
>> +            VTDPASIDCacheEntry *pc_entry;
>> +            VTDAddressSpace *vtd_as;
>> +
>> +            vtd_iommu_lock(s);
>> +            /*
>> +             * When indexed by rid2pasid, vtd_as should have been
>created,
>> +             * e.g., by PCI subsystem. For other iommu pasid, we need
>to
>> +             * create vtd_as dynamically. Other iommu pasid is same
>value
>since you don't support somthing else than rid2pasid, I would drop that
>and simplify the code. See below.
>> +             * as PCI's pasid, so it's used as input of vtd_find_add_as().
>> +             */
>> +            vtd_as = vtd_as_from_iommu_pasid_locked(s, sid, pasid);
>> +            vtd_iommu_unlock(s);
>> +            if (!vtd_as) {
>> +                vtd_as = vtd_find_add_as(s, info->bus, devfn, pasid);
>you could check the vtd_as already exists here per the rid2pasid support
>limitation

In this series, I do include some basic codes for non-rid2pasid because they share some common code with rid2pasid and we already have emulated rid2pasid support in vIOMMU for a long time, it's not bad to accumulate some supporting code for non-rid2pasid for passthrough device. But I can do the factor out if you insist to have only rid_pasid code.

>> +            }
>> +
>> +            if ((info->type == VTD_PASID_CACHE_DOMSI ||
>> +                 info->type == VTD_PASID_CACHE_PASIDSI) &&
>> +                (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) {
>> +                /*
>> +                 * VTD_PASID_CACHE_DOMSI and
>VTD_PASID_CACHE_PASIDSI
>> +                 * requires domain id check. If domain id check fail,
>fails
>> +                 * go to next pasid.
>> +                 */
>> +                pasid++;
>> +                continue;
>> +            }
>> +
>> +            pc_entry = &vtd_as->pasid_cache_entry;
>> +            /*
>> +             * pasid cache update and clear are handled in
>> +             * vtd_flush_pasid_locked(), only care new pasid entry
>here.
>> +             */
>> +            if (!pc_entry->valid) {
>> +                pc_entry->pasid_entry = pe;
>> +                pc_entry->valid = true;
>> +            }
>> +        }
>> +        pasid++;
>> +    }
>> +}
>> +
>> +/*
>> + * In VT-d scalable mode translation, PASID dir + PASID table is used.
>> + * This function aims at looping over a range of PASIDs in the given
>> + * two level table to identify the pasid config in guest.
>> + */
>> +static void vtd_sm_pasid_table_walk(IntelIOMMUState *s,
>> +                                    dma_addr_t pdt_base,
>> +                                    int start, int end,
>> +                                    VTDPASIDCacheInfo *info)
>> +{
>> +    VTDPASIDDirEntry pdire;
>> +    int pasid = start;
>> +    int pasid_next;
>> +    dma_addr_t pt_base;
>> +
>> +    while (pasid < end) {
>> +        pasid_next =
>> +             (pasid + VTD_PASID_TBL_ENTRY_NUM) &
>~(VTD_PASID_TBL_ENTRY_NUM - 1);
>> +        pasid_next = pasid_next < end ? pasid_next : end;
>> +
>> +        if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire)
>> +            && vtd_pdire_present(&pdire)) {
>> +            pt_base = pdire.val &
>VTD_PASID_TABLE_BASE_ADDR_MASK;
>> +            vtd_sm_pasid_table_walk_one(s, pt_base, pasid,
>pasid_next, info);
>> +        }
>> +        pasid = pasid_next;
>> +    }
>> +}
>> +
>> +static void vtd_replay_pasid_bind_for_dev(IntelIOMMUState *s,
>> +                                          int start, int end,
>> +                                          VTDPASIDCacheInfo
>*info)
>> +{
>> +    VTDContextEntry ce;
>> +
>> +    if (!vtd_dev_to_context_entry(s, pci_bus_num(info->bus),
>info->devfn,
>> +                                  &ce)) {
>> +        uint32_t max_pasid;
>> +
>> +        max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) *
>VTD_PASID_TBL_ENTRY_NUM;
>> +        if (end > max_pasid) {
>> +            end = max_pasid;
>> +        }
>> +        vtd_sm_pasid_table_walk(s,
>> +
>VTD_CE_GET_PASID_DIR_TABLE(&ce),
>> +                                start,
>> +                                end,
>> +                                info);
>> +    }
>> +}
>> +
>> +/*
>> + * This function replays the guest pasid bindings by walking the two level
>> + * guest PASID table. For each valid pasid entry, it finds or creates a
>> + * vtd_as and caches pasid entry in vtd_as.
>> + */
>> +static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
>> +                                            VTDPASIDCacheInfo
>*pc_info)
>> +{
>> +    /*
>> +     * Currently only Requests-without-PASID is supported, as vIOMMU
>doesn't
>> +     * support RPS(RID-PASID Support), pasid scope is fixed to [0, 1).
>> +     */
>> +    int start = 0, end = 1;
>> +    VTDHostIOMMUDevice *vtd_hiod;
>> +    VTDPASIDCacheInfo walk_info;
>> +    GHashTableIter as_it;
>> +
>> +    switch (pc_info->type) {
>> +    case VTD_PASID_CACHE_PASIDSI:
>> +        start = pc_info->pasid;
>> +        end = pc_info->pasid + 1;
>if you never replay a range, you could simplify the code for now because
>some code paths are not properly tested

OK. Instead of assignment of start and end variable, maybe just an assert(!pc_info->pasid).

>> +       /* fall through */
>> +    case VTD_PASID_CACHE_DOMSI:
>Why can't we have other invalidation types along with request without
>PASID? It is not obvious to me at least why it couldn't be used by the
>guest. Would deserve a comment in the commit desc I think.

Other invalidation types are indeed used, just in pasid scope [0, 1), because [start, end) are already initialized to [0, 1), nothing more here, so just break.

Thanks
Zhenzhong

>> +    case VTD_PASID_CACHE_GLOBAL_INV:
>> +        /* loop all assigned devices */
>> +        break;
>> +    default:
>> +        error_setg(&error_fatal, "invalid pc_info->type for replay");
>> +    }
>> +
>> +    /*
>> +     * In this replay, one only needs to care about the devices which are
>> +     * backed by host IOMMU. Those devices have a corresponding
>vtd_hiod
>> +     * in s->vtd_host_iommu_dev. For devices not backed by host
>IOMMU, it
>> +     * is not necessary to replay the bindings since their cache could be
>> +     * re-created in the future DMA address translation.
>> +     *
>> +     * VTD translation callback never accesses vtd_hiod and its
>corresponding
>> +     * cached pasid entry, so no iommu lock needed here.
>> +     */
>> +    walk_info = *pc_info;
>> +    g_hash_table_iter_init(&as_it, s->vtd_host_iommu_dev);
>> +    while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_hiod)) {
>> +        walk_info.bus = vtd_hiod->bus;
>> +        walk_info.devfn = vtd_hiod->devfn;
>> +        vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
>> +    }
>> +}
>> +
>>  /*
>>   * For a PASID cache invalidation, this function handles below scenarios:
>>   * a) a present cached pasid entry needs to be removed
>>   * b) a present cached pasid entry needs to be updated
>> + * c) a present cached pasid entry needs to be created
>>   */
>>  static void vtd_pasid_cache_sync(IntelIOMMUState *s,
>VTDPASIDCacheInfo *pc_info)
>>  {
>> @@ -3239,6 +3406,9 @@ static void
>vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
>>      g_hash_table_foreach_remove(s->vtd_address_spaces,
>vtd_flush_pasid_locked,
>>                                  pc_info);
>>      vtd_iommu_unlock(s);
>> +
>> +    /* c): loop all passthrough device for new pasid entries */
>> +    vtd_replay_guest_pasid_bindings(s, pc_info);
>>  }
>>
>>  static bool vtd_process_pasid_desc(IntelIOMMUState *s,
>Thanks
>
>Eric

Re: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
Posted by Yi Liu 2 months, 1 week ago
On 2025/9/1 17:03, Duan, Zhenzhong wrote:

>>>   }
>>>
>>> +/*
>>> + * This function walks over PASID range within [start, end) in a single
>>> + * PASID table for entries matching @info type/did, then retrieve/create
>>> + * vtd_as and fill associated pasid entry cache.
>>> + */
>>> +static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
>>> +                                        dma_addr_t pt_base,
>>> +                                        int start,
>>> +                                        int end,
>>> +                                        VTDPASIDCacheInfo
>> *info)
>>> +{
>>> +    VTDPASIDEntry pe;
>>> +    int pasid = start;
>>> +
>>> +    while (pasid < end) {
>>> +        if (!vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe)
>>> +            && vtd_pe_present(&pe)) {
>>> +            int bus_n = pci_bus_num(info->bus), devfn = info->devfn;
>>> +            uint16_t sid = PCI_BUILD_BDF(bus_n, devfn);
>>> +            VTDPASIDCacheEntry *pc_entry;
>>> +            VTDAddressSpace *vtd_as;
>>> +
>>> +            vtd_iommu_lock(s);
>>> +            /*
>>> +             * When indexed by rid2pasid, vtd_as should have been
>> created,
>>> +             * e.g., by PCI subsystem. For other iommu pasid, we need
>> to
>>> +             * create vtd_as dynamically. Other iommu pasid is same
>> value
>> since you don't support somthing else than rid2pasid, I would drop that
>> and simplify the code. See below.
>>> +             * as PCI's pasid, so it's used as input of vtd_find_add_as().
>>> +             */
>>> +            vtd_as = vtd_as_from_iommu_pasid_locked(s, sid, pasid);
>>> +            vtd_iommu_unlock(s);
>>> +            if (!vtd_as) {
>>> +                vtd_as = vtd_find_add_as(s, info->bus, devfn, pasid);
>> you could check the vtd_as already exists here per the rid2pasid support
>> limitation
> 
> In this series, I do include some basic codes for non-rid2pasid because they share some common code with rid2pasid and we already have emulated rid2pasid support in vIOMMU for a long time, it's not bad to accumulate some supporting code for non-rid2pasid for passthrough device. But I can do the factor out if you insist to have only rid_pasid code.

I think it's a reasonable ask. :)

> 
>>> +            }
>>> +
>>> +            if ((info->type == VTD_PASID_CACHE_DOMSI ||
>>> +                 info->type == VTD_PASID_CACHE_PASIDSI) &&
>>> +                (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) {
>>> +                /*
>>> +                 * VTD_PASID_CACHE_DOMSI and
>> VTD_PASID_CACHE_PASIDSI
>>> +                 * requires domain id check. If domain id check fail,
>> fails
>>> +                 * go to next pasid.
>>> +                 */
>>> +                pasid++;
>>> +                continue;
>>> +            }
>>> +
>>> +            pc_entry = &vtd_as->pasid_cache_entry;
>>> +            /*
>>> +             * pasid cache update and clear are handled in
>>> +             * vtd_flush_pasid_locked(), only care new pasid entry
>> here.
>>> +             */
>>> +            if (!pc_entry->valid) {
>>> +                pc_entry->pasid_entry = pe;
>>> +                pc_entry->valid = true;
>>> +            }
>>> +        }
>>> +        pasid++;
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * In VT-d scalable mode translation, PASID dir + PASID table is used.
>>> + * This function aims at looping over a range of PASIDs in the given
>>> + * two level table to identify the pasid config in guest.
>>> + */
>>> +static void vtd_sm_pasid_table_walk(IntelIOMMUState *s,
>>> +                                    dma_addr_t pdt_base,
>>> +                                    int start, int end,
>>> +                                    VTDPASIDCacheInfo *info)
>>> +{
>>> +    VTDPASIDDirEntry pdire;
>>> +    int pasid = start;
>>> +    int pasid_next;
>>> +    dma_addr_t pt_base;
>>> +
>>> +    while (pasid < end) {
>>> +        pasid_next =
>>> +             (pasid + VTD_PASID_TBL_ENTRY_NUM) &
>> ~(VTD_PASID_TBL_ENTRY_NUM - 1);
>>> +        pasid_next = pasid_next < end ? pasid_next : end;
>>> +
>>> +        if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire)
>>> +            && vtd_pdire_present(&pdire)) {
>>> +            pt_base = pdire.val &
>> VTD_PASID_TABLE_BASE_ADDR_MASK;
>>> +            vtd_sm_pasid_table_walk_one(s, pt_base, pasid,
>> pasid_next, info);
>>> +        }
>>> +        pasid = pasid_next;
>>> +    }
>>> +}
>>> +
>>> +static void vtd_replay_pasid_bind_for_dev(IntelIOMMUState *s,
>>> +                                          int start, int end,
>>> +                                          VTDPASIDCacheInfo
>> *info)
>>> +{
>>> +    VTDContextEntry ce;
>>> +
>>> +    if (!vtd_dev_to_context_entry(s, pci_bus_num(info->bus),
>> info->devfn,
>>> +                                  &ce)) {
>>> +        uint32_t max_pasid;
>>> +
>>> +        max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) *
>> VTD_PASID_TBL_ENTRY_NUM;
>>> +        if (end > max_pasid) {
>>> +            end = max_pasid;
>>> +        }
>>> +        vtd_sm_pasid_table_walk(s,
>>> +
>> VTD_CE_GET_PASID_DIR_TABLE(&ce),
>>> +                                start,
>>> +                                end,
>>> +                                info);
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * This function replays the guest pasid bindings by walking the two level
>>> + * guest PASID table. For each valid pasid entry, it finds or creates a
>>> + * vtd_as and caches pasid entry in vtd_as.
>>> + */
>>> +static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
>>> +                                            VTDPASIDCacheInfo
>> *pc_info)
>>> +{
>>> +    /*
>>> +     * Currently only Requests-without-PASID is supported, as vIOMMU
>> doesn't
>>> +     * support RPS(RID-PASID Support), pasid scope is fixed to [0, 1).
>>> +     */
>>> +    int start = 0, end = 1;
>>> +    VTDHostIOMMUDevice *vtd_hiod;
>>> +    VTDPASIDCacheInfo walk_info;
>>> +    GHashTableIter as_it;
>>> +
>>> +    switch (pc_info->type) {
>>> +    case VTD_PASID_CACHE_PASIDSI:
>>> +        start = pc_info->pasid;
>>> +        end = pc_info->pasid + 1;
>> if you never replay a range, you could simplify the code for now because
>> some code paths are not properly tested
> 
> OK. Instead of assignment of start and end variable, maybe just an assert(!pc_info->pasid).

I think there are two reasons for this range replay.

1) as a preparation for patch 16 of this series.
2) support domain selective or global pasid cache invalidation

> 
>>> +       /* fall through */
>>> +    case VTD_PASID_CACHE_DOMSI:
>> Why can't we have other invalidation types along with request without
>> PASID? It is not obvious to me at least why it couldn't be used by the
>> guest. Would deserve a comment in the commit desc I think.
> 
> Other invalidation types are indeed used, just in pasid scope [0, 1), because [start, end) are already initialized to [0, 1), nothing more here, so just break.

hmmm. The fixed scope makes the range replay a fake one. It's better
holding the range replay logic for now and add it when there is
non-rid_pasid support for passthrough devices.

Regards,
Yi Liu
RE: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
Posted by Duan, Zhenzhong 2 months, 1 week ago

>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH v5 12/21] intel_iommu: Handle PASID entry addition
>
>On 2025/9/1 17:03, Duan, Zhenzhong wrote:
>
>>>>   }
>>>>
>>>> +/*
>>>> + * This function walks over PASID range within [start, end) in a single
>>>> + * PASID table for entries matching @info type/did, then retrieve/create
>>>> + * vtd_as and fill associated pasid entry cache.
>>>> + */
>>>> +static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
>>>> +                                        dma_addr_t pt_base,
>>>> +                                        int start,
>>>> +                                        int end,
>>>> +                                        VTDPASIDCacheInfo
>>> *info)
>>>> +{
>>>> +    VTDPASIDEntry pe;
>>>> +    int pasid = start;
>>>> +
>>>> +    while (pasid < end) {
>>>> +        if (!vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe)
>>>> +            && vtd_pe_present(&pe)) {
>>>> +            int bus_n = pci_bus_num(info->bus), devfn = info->devfn;
>>>> +            uint16_t sid = PCI_BUILD_BDF(bus_n, devfn);
>>>> +            VTDPASIDCacheEntry *pc_entry;
>>>> +            VTDAddressSpace *vtd_as;
>>>> +
>>>> +            vtd_iommu_lock(s);
>>>> +            /*
>>>> +             * When indexed by rid2pasid, vtd_as should have been
>>> created,
>>>> +             * e.g., by PCI subsystem. For other iommu pasid, we
>need
>>> to
>>>> +             * create vtd_as dynamically. Other iommu pasid is same
>>> value
>>> since you don't support somthing else than rid2pasid, I would drop that
>>> and simplify the code. See below.
>>>> +             * as PCI's pasid, so it's used as input of
>vtd_find_add_as().
>>>> +             */
>>>> +            vtd_as = vtd_as_from_iommu_pasid_locked(s, sid, pasid);
>>>> +            vtd_iommu_unlock(s);
>>>> +            if (!vtd_as) {
>>>> +                vtd_as = vtd_find_add_as(s, info->bus, devfn, pasid);
>>> you could check the vtd_as already exists here per the rid2pasid support
>>> limitation
>>
>> In this series, I do include some basic codes for non-rid2pasid because they
>share some common code with rid2pasid and we already have emulated
>rid2pasid support in vIOMMU for a long time, it's not bad to accumulate some
>supporting code for non-rid2pasid for passthrough device. But I can do the
>factor out if you insist to have only rid_pasid code.
>
>I think it's a reasonable ask. :)

OK, will do.

>
>>
>>>> +            }
>>>> +
>>>> +            if ((info->type == VTD_PASID_CACHE_DOMSI ||
>>>> +                 info->type == VTD_PASID_CACHE_PASIDSI) &&
>>>> +                (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) {
>>>> +                /*
>>>> +                 * VTD_PASID_CACHE_DOMSI and
>>> VTD_PASID_CACHE_PASIDSI
>>>> +                 * requires domain id check. If domain id check fail,
>>> fails
>>>> +                 * go to next pasid.
>>>> +                 */
>>>> +                pasid++;
>>>> +                continue;
>>>> +            }
>>>> +
>>>> +            pc_entry = &vtd_as->pasid_cache_entry;
>>>> +            /*
>>>> +             * pasid cache update and clear are handled in
>>>> +             * vtd_flush_pasid_locked(), only care new pasid entry
>>> here.
>>>> +             */
>>>> +            if (!pc_entry->valid) {
>>>> +                pc_entry->pasid_entry = pe;
>>>> +                pc_entry->valid = true;
>>>> +            }
>>>> +        }
>>>> +        pasid++;
>>>> +    }
>>>> +}
>>>> +
>>>> +/*
>>>> + * In VT-d scalable mode translation, PASID dir + PASID table is used.
>>>> + * This function aims at looping over a range of PASIDs in the given
>>>> + * two level table to identify the pasid config in guest.
>>>> + */
>>>> +static void vtd_sm_pasid_table_walk(IntelIOMMUState *s,
>>>> +                                    dma_addr_t pdt_base,
>>>> +                                    int start, int end,
>>>> +                                    VTDPASIDCacheInfo *info)
>>>> +{
>>>> +    VTDPASIDDirEntry pdire;
>>>> +    int pasid = start;
>>>> +    int pasid_next;
>>>> +    dma_addr_t pt_base;
>>>> +
>>>> +    while (pasid < end) {
>>>> +        pasid_next =
>>>> +             (pasid + VTD_PASID_TBL_ENTRY_NUM) &
>>> ~(VTD_PASID_TBL_ENTRY_NUM - 1);
>>>> +        pasid_next = pasid_next < end ? pasid_next : end;
>>>> +
>>>> +        if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire)
>>>> +            && vtd_pdire_present(&pdire)) {
>>>> +            pt_base = pdire.val &
>>> VTD_PASID_TABLE_BASE_ADDR_MASK;
>>>> +            vtd_sm_pasid_table_walk_one(s, pt_base, pasid,
>>> pasid_next, info);
>>>> +        }
>>>> +        pasid = pasid_next;
>>>> +    }
>>>> +}
>>>> +
>>>> +static void vtd_replay_pasid_bind_for_dev(IntelIOMMUState *s,
>>>> +                                          int start, int end,
>>>> +                                          VTDPASIDCacheInfo
>>> *info)
>>>> +{
>>>> +    VTDContextEntry ce;
>>>> +
>>>> +    if (!vtd_dev_to_context_entry(s, pci_bus_num(info->bus),
>>> info->devfn,
>>>> +                                  &ce)) {
>>>> +        uint32_t max_pasid;
>>>> +
>>>> +        max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) *
>>> VTD_PASID_TBL_ENTRY_NUM;
>>>> +        if (end > max_pasid) {
>>>> +            end = max_pasid;
>>>> +        }
>>>> +        vtd_sm_pasid_table_walk(s,
>>>> +
>>> VTD_CE_GET_PASID_DIR_TABLE(&ce),
>>>> +                                start,
>>>> +                                end,
>>>> +                                info);
>>>> +    }
>>>> +}
>>>> +
>>>> +/*
>>>> + * This function replays the guest pasid bindings by walking the two level
>>>> + * guest PASID table. For each valid pasid entry, it finds or creates a
>>>> + * vtd_as and caches pasid entry in vtd_as.
>>>> + */
>>>> +static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
>>>> +
>VTDPASIDCacheInfo
>>> *pc_info)
>>>> +{
>>>> +    /*
>>>> +     * Currently only Requests-without-PASID is supported, as
>vIOMMU
>>> doesn't
>>>> +     * support RPS(RID-PASID Support), pasid scope is fixed to [0, 1).
>>>> +     */
>>>> +    int start = 0, end = 1;
>>>> +    VTDHostIOMMUDevice *vtd_hiod;
>>>> +    VTDPASIDCacheInfo walk_info;
>>>> +    GHashTableIter as_it;
>>>> +
>>>> +    switch (pc_info->type) {
>>>> +    case VTD_PASID_CACHE_PASIDSI:
>>>> +        start = pc_info->pasid;
>>>> +        end = pc_info->pasid + 1;
>>> if you never replay a range, you could simplify the code for now because
>>> some code paths are not properly tested
>>
>> OK. Instead of assignment of start and end variable, maybe just an
>assert(!pc_info->pasid).
>
>I think there are two reasons for this range replay.
>
>1) as a preparation for patch 16 of this series.
>2) support domain selective or global pasid cache invalidation

Exactly.

>
>>
>>>> +       /* fall through */
>>>> +    case VTD_PASID_CACHE_DOMSI:
>>> Why can't we have other invalidation types along with request without
>>> PASID? It is not obvious to me at least why it couldn't be used by the
>>> guest. Would deserve a comment in the commit desc I think.
>>
>> Other invalidation types are indeed used, just in pasid scope [0, 1), because
>[start, end) are already initialized to [0, 1), nothing more here, so just break.
>
>hmmm. The fixed scope makes the range replay a fake one. It's better
>holding the range replay logic for now and add it when there is
>non-rid_pasid support for passthrough devices.

Yes, will delete the non-rid_pasid code.

Thanks
Zhenzhong