[v1] intel_iommu: Enable stage-1 translation for passthrough device

[PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Zhenzhong Duan 5 months, 4 weeks ago

This captures the guest PASID table entry modifications and
propagates the changes to host to attach a hwpt with type determined
per guest PGTT configuration.

When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
the hwpt on host side is a nested page table.

The guest page table is configured as stage-1 page table (gIOVA->GPA)
whose translation result would further go through host VT-d stage-2
page table(GPA->HPA) under nested translation mode. This is the key
to support gIOVA over stage-1 page table for Intel VT-d in
virtualization environment.

Stage-2 page table could be shared by different devices if there is
no conflict and devices link to same iommufd object, i.e. devices
under same host IOMMU can share same stage-2 page table. If there
is conflict, i.e. there is one device under non cache coherency
mode which is different from others, it requires a separate
stage-2 page table in non-CC mode.

See below example diagram:

      IntelIOMMUState
             |
             V
    .------------------.    .------------------.
    | VTDIOASContainer |--->| VTDIOASContainer |--->...
    |    (iommufd0)    |    |    (iommufd1)    |
    .------------------.    .------------------.
             |                       |
             |                       .-->...
             V
      .-------------------.    .-------------------.
      |   VTDS2Hwpt(CC)   |--->| VTDS2Hwpt(non-CC) |-->...
      .-------------------.    .-------------------.
          |            |               |
          |            |               |
    .-----------.  .-----------.  .------------.
    | IOMMUFD   |  | IOMMUFD   |  | IOMMUFD    |
    | Device(CC)|  | Device(CC)|  | Device     |
    | (iommufd0)|  | (iommufd0)|  | (non-CC)   |
    |           |  |           |  | (iommufd0) |
    .-----------.  .-----------.  .------------.

Co-Authored-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_internal.h |  11 +
 include/hw/i386/intel_iommu.h  |  24 ++
 hw/i386/intel_iommu.c          | 581 +++++++++++++++++++++++++++++++--
 hw/i386/trace-events           |   8 +
 4 files changed, 604 insertions(+), 20 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 5e5583d94a..e76f43bb8f 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -563,6 +563,13 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
 
+typedef enum VTDPASIDOp {
+    VTD_PASID_BIND,
+    VTD_PASID_UPDATE,
+    VTD_PASID_UNBIND,
+    VTD_OP_NUM
+} VTDPASIDOp;
+
 typedef enum VTDPCInvType {
     /* Force reset all */
     VTD_PASID_CACHE_FORCE_RESET = 0,
@@ -578,6 +585,7 @@ typedef struct VTDPASIDCacheInfo {
     uint32_t pasid;
     PCIBus *bus;
     uint16_t devfn;
+    bool error_happened;
 } VTDPASIDCacheInfo;
 
 /* PASID Table Related Definitions */
@@ -606,6 +614,9 @@ typedef struct VTDPASIDCacheInfo {
 
 #define VTD_SM_PASID_ENTRY_FLPM          3ULL
 #define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_SRE_BIT(val)  (!!((val) & 1ULL))
+#define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
+#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
 
 /* First Level Paging Structure */
 /* Masks for First Level Paging Entry */
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index fbc9da903a..594281c1d3 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -100,10 +100,32 @@ typedef struct VTDPASIDCacheEntry {
     bool cache_filled;
 } VTDPASIDCacheEntry;
 
+typedef struct VTDIOASContainer {
+    struct IOMMUFDBackend *iommufd;
+    uint32_t ioas_id;
+    MemoryListener listener;
+    QLIST_HEAD(, VTDS2Hwpt) s2_hwpt_list;
+    QLIST_ENTRY(VTDIOASContainer) next;
+    Error *error;
+} VTDIOASContainer;
+
+typedef struct VTDS2Hwpt {
+    uint32_t users;
+    uint32_t hwpt_id;
+    VTDIOASContainer *container;
+    QLIST_ENTRY(VTDS2Hwpt) next;
+} VTDS2Hwpt;
+
+typedef struct VTDHwpt {
+    uint32_t hwpt_id;
+    VTDS2Hwpt *s2_hwpt;
+} VTDHwpt;
+
 struct VTDAddressSpace {
     PCIBus *bus;
     uint8_t devfn;
     uint32_t pasid;
+    VTDHwpt hwpt;
     AddressSpace as;
     IOMMUMemoryRegion iommu;
     MemoryRegion root;          /* The root container of the device */
@@ -303,6 +325,8 @@ struct IntelIOMMUState {
 
     GHashTable *vtd_host_iommu_dev;             /* VTDHostIOMMUDevice */
 
+    QLIST_HEAD(, VTDIOASContainer) containers;
+
     /* interrupt remapping */
     bool intr_enabled;              /* Whether guest enabled IR */
     dma_addr_t intr_root;           /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 050b0d3ca2..3269a66ac7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -20,6 +20,7 @@
  */
 
 #include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qapi/error.h"
@@ -40,6 +41,9 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 #include "system/iommufd.h"
+#ifdef CONFIG_IOMMUFD
+#include <linux/iommufd.h>
+#endif
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -838,11 +842,40 @@ static inline uint16_t vtd_pe_get_did(VTDPASIDEntry *pe)
     return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
 }
 
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+    return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+}
+
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+    return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
+}
+
+static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
+}
+
+/* check if pgtt is first stage translation */
+static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
     return pdire->val & 1;
 }
 
+static inline void pasid_cache_info_set_error(VTDPASIDCacheInfo *pc_info)
+{
+    if (pc_info->error_happened) {
+        return;
+    }
+    pc_info->error_happened = true;
+}
+
 /**
  * Caller of this function should check present bit if wants
  * to use pdir entry for further usage except for fpd bit check.
@@ -1776,7 +1809,7 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
              */
             return false;
         }
-        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+        return vtd_pe_pgtt_is_pt(&pe);
     }
 
     return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
@@ -2403,6 +2436,497 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
     vtd_iommu_replay_all(s);
 }
 
+#ifdef CONFIG_IOMMUFD
+static bool iommufd_listener_skipped_section(MemoryRegionSection *section)
+{
+    return !memory_region_is_ram(section->mr) ||
+           memory_region_is_protected(section->mr) ||
+           /*
+            * Sizing an enabled 64-bit BAR can cause spurious mappings to
+            * addresses in the upper part of the 64-bit address space.  These
+            * are never accessed by the CPU and beyond the address width of
+            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
+            */
+           section->offset_within_address_space & (1ULL << 63);
+}
+
+static void iommufd_listener_region_add_s2domain(MemoryListener *listener,
+                                                 MemoryRegionSection *section)
+{
+    VTDIOASContainer *container = container_of(listener,
+                                               VTDIOASContainer, listener);
+    IOMMUFDBackend *iommufd = container->iommufd;
+    uint32_t ioas_id = container->ioas_id;
+    hwaddr iova;
+    Int128 llend, llsize;
+    void *vaddr;
+    Error *err = NULL;
+    int ret;
+
+    if (iommufd_listener_skipped_section(section)) {
+        return;
+    }
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+    llsize = int128_sub(llend, int128_make64(iova));
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    memory_region_ref(section->mr);
+
+    ret = iommufd_backend_map_dma(iommufd, ioas_id, iova, int128_get64(llsize),
+                                  vaddr, section->readonly);
+    if (!ret) {
+        return;
+    }
+
+    error_setg(&err,
+               "iommufd_listener_region_add_s2domain(%p, 0x%"HWADDR_PRIx", "
+               "0x%"HWADDR_PRIx", %p) = %d (%s)",
+               container, iova, int128_get64(llsize), vaddr, ret,
+               strerror(-ret));
+
+    if (memory_region_is_ram_device(section->mr)) {
+        /* Allow unexpected mappings not to be fatal for RAM devices */
+        error_report_err(err);
+        return;
+    }
+
+    if (!container->error) {
+        error_propagate_prepend(&container->error, err, "Region %s: ",
+                                memory_region_name(section->mr));
+    } else {
+        error_free(err);
+    }
+}
+
+static void iommufd_listener_region_del_s2domain(MemoryListener *listener,
+                                                 MemoryRegionSection *section)
+{
+    VTDIOASContainer *container = container_of(listener,
+                                               VTDIOASContainer, listener);
+    IOMMUFDBackend *iommufd = container->iommufd;
+    uint32_t ioas_id = container->ioas_id;
+    hwaddr iova;
+    Int128 llend, llsize;
+    int ret;
+
+    if (iommufd_listener_skipped_section(section)) {
+        return;
+    }
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+    llsize = int128_sub(llend, int128_make64(iova));
+
+    ret = iommufd_backend_unmap_dma(iommufd, ioas_id,
+                                    iova, int128_get64(llsize));
+    if (ret) {
+        error_report("iommufd_listener_region_del_s2domain(%p, "
+                     "0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx") = %d (%s)",
+                     container, iova, int128_get64(llsize), ret,
+                     strerror(-ret));
+    }
+
+    memory_region_unref(section->mr);
+}
+
+static const MemoryListener iommufd_s2domain_memory_listener = {
+    .name = "iommufd_s2domain",
+    .priority = 1000,
+    .region_add = iommufd_listener_region_add_s2domain,
+    .region_del = iommufd_listener_region_del_s2domain,
+};
+
+static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
+                                  VTDPASIDEntry *pe)
+{
+    memset(vtd, 0, sizeof(*vtd));
+
+    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
+                                        IOMMU_VTD_S1_SRE : 0) |
+                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe->val[2]) ?
+                                        IOMMU_VTD_S1_WPE : 0) |
+                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
+                                        IOMMU_VTD_S1_EAFE : 0);
+    vtd->addr_width = vtd_pe_get_fl_aw(pe);
+    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
+}
+
+static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+                              VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
+                              VTDPASIDEntry *pe, Error **errp)
+{
+    struct iommu_hwpt_vtd_s1 vtd;
+    uint32_t hwpt_id, s2_hwpt_id = s2_hwpt->hwpt_id;
+
+    vtd_init_s1_hwpt_data(&vtd, pe);
+
+    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
+                                    s2_hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
+                                    sizeof(vtd), &vtd, &hwpt_id, errp)) {
+        return -EINVAL;
+    }
+
+    hwpt->hwpt_id = hwpt_id;
+
+    return 0;
+}
+
+static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev, VTDHwpt *hwpt)
+{
+    iommufd_backend_free_id(idev->iommufd, hwpt->hwpt_id);
+}
+
+static VTDS2Hwpt *vtd_ioas_container_get_s2_hwpt(VTDIOASContainer *container,
+                                                 uint32_t hwpt_id)
+{
+    VTDS2Hwpt *s2_hwpt;
+
+    QLIST_FOREACH(s2_hwpt, &container->s2_hwpt_list, next) {
+        if (s2_hwpt->hwpt_id == hwpt_id) {
+            return s2_hwpt;
+        }
+    }
+
+    s2_hwpt = g_malloc0(sizeof(*s2_hwpt));
+
+    s2_hwpt->hwpt_id = hwpt_id;
+    s2_hwpt->container = container;
+    QLIST_INSERT_HEAD(&container->s2_hwpt_list, s2_hwpt, next);
+
+    return s2_hwpt;
+}
+
+static void vtd_ioas_container_put_s2_hwpt(VTDS2Hwpt *s2_hwpt)
+{
+    VTDIOASContainer *container = s2_hwpt->container;
+
+    if (s2_hwpt->users) {
+        return;
+    }
+
+    QLIST_REMOVE(s2_hwpt, next);
+    iommufd_backend_free_id(container->iommufd, s2_hwpt->hwpt_id);
+    g_free(s2_hwpt);
+}
+
+static void vtd_ioas_container_destroy(VTDIOASContainer *container)
+{
+    if (!QLIST_EMPTY(&container->s2_hwpt_list)) {
+        return;
+    }
+
+    QLIST_REMOVE(container, next);
+    memory_listener_unregister(&container->listener);
+    iommufd_backend_free_id(container->iommufd, container->ioas_id);
+    g_free(container);
+}
+
+static int vtd_device_attach_hwpt(VTDHostIOMMUDevice *vtd_hiod,
+                                  uint32_t pasid, VTDPASIDEntry *pe,
+                                  VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
+                                  Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    int ret;
+
+    if (vtd_pe_pgtt_is_flt(pe)) {
+        ret = vtd_create_s1_hwpt(idev, s2_hwpt, hwpt, pe, errp);
+        if (ret) {
+            return ret;
+        }
+    } else {
+        hwpt->hwpt_id = s2_hwpt->hwpt_id;
+    }
+
+    ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt->hwpt_id, errp);
+    trace_vtd_device_attach_hwpt(idev->devid, pasid, hwpt->hwpt_id, ret);
+    if (ret) {
+        if (vtd_pe_pgtt_is_flt(pe)) {
+            vtd_destroy_s1_hwpt(idev, hwpt);
+        }
+        hwpt->hwpt_id = 0;
+        error_report("devid %d pasid %d failed to attach hwpt %d",
+                     idev->devid, pasid, hwpt->hwpt_id);
+        return ret;
+    }
+
+    s2_hwpt->users++;
+    hwpt->s2_hwpt = s2_hwpt;
+
+    return 0;
+}
+
+static void vtd_device_detach_hwpt(VTDHostIOMMUDevice *vtd_hiod,
+                                   uint32_t pasid, VTDPASIDEntry *pe,
+                                   VTDHwpt *hwpt, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    int ret;
+
+    if (vtd_hiod->iommu_state->dmar_enabled) {
+        ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
+        trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
+    } else {
+        ret = !host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, errp);
+        trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
+                                           ret);
+    }
+
+    if (ret) {
+        error_report("devid %d pasid %d failed to attach hwpt %d",
+                     idev->devid, pasid, hwpt->hwpt_id);
+    }
+
+    if (vtd_pe_pgtt_is_flt(pe)) {
+        vtd_destroy_s1_hwpt(idev, hwpt);
+    }
+
+    hwpt->s2_hwpt->users--;
+    hwpt->s2_hwpt = NULL;
+    hwpt->hwpt_id = 0;
+}
+
+static int vtd_device_attach_container(VTDHostIOMMUDevice *vtd_hiod,
+                                       VTDIOASContainer *container,
+                                       uint32_t pasid, VTDPASIDEntry *pe,
+                                       VTDHwpt *hwpt, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    IOMMUFDBackend *iommufd = idev->iommufd;
+    VTDS2Hwpt *s2_hwpt;
+    uint32_t s2_hwpt_id;
+    Error *err = NULL;
+    int ret;
+
+    /* try to attach to an existing hwpt in this container */
+    QLIST_FOREACH(s2_hwpt, &container->s2_hwpt_list, next) {
+        ret = vtd_device_attach_hwpt(vtd_hiod, pasid, pe, s2_hwpt, hwpt, &err);
+        if (ret) {
+            const char *msg = error_get_pretty(err);
+
+            trace_vtd_device_fail_attach_existing_hwpt(msg);
+            error_free(err);
+            err = NULL;
+        } else {
+            goto found_hwpt;
+        }
+    }
+
+    if (!iommufd_backend_alloc_hwpt(iommufd, idev->devid, container->ioas_id,
+                                    IOMMU_HWPT_ALLOC_NEST_PARENT,
+                                    IOMMU_HWPT_DATA_NONE, 0, NULL,
+                                    &s2_hwpt_id, errp)) {
+        return -EINVAL;
+    }
+
+    s2_hwpt = vtd_ioas_container_get_s2_hwpt(container, s2_hwpt_id);
+
+    /* Attach vtd device to a new allocated hwpt within iommufd */
+    ret = vtd_device_attach_hwpt(vtd_hiod, pasid, pe, s2_hwpt, hwpt, errp);
+    if (ret) {
+        goto err_attach_hwpt;
+    }
+
+found_hwpt:
+    trace_vtd_device_attach_container(iommufd->fd, idev->devid, pasid,
+                                      container->ioas_id, hwpt->hwpt_id);
+    return 0;
+
+err_attach_hwpt:
+    vtd_ioas_container_put_s2_hwpt(s2_hwpt);
+    return ret;
+}
+
+static void vtd_device_detach_container(VTDHostIOMMUDevice *vtd_hiod,
+                                        uint32_t pasid, VTDPASIDEntry *pe,
+                                        VTDHwpt *hwpt, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    IOMMUFDBackend *iommufd = idev->iommufd;
+    VTDS2Hwpt *s2_hwpt = hwpt->s2_hwpt;
+
+    trace_vtd_device_detach_container(iommufd->fd, idev->devid, pasid);
+    vtd_device_detach_hwpt(vtd_hiod, pasid, pe, hwpt, errp);
+    vtd_ioas_container_put_s2_hwpt(s2_hwpt);
+}
+
+static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+                                     uint32_t pasid, VTDPASIDEntry *pe,
+                                     VTDHwpt *hwpt, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    IOMMUFDBackend *iommufd = idev->iommufd;
+    IntelIOMMUState *s = vtd_hiod->iommu_state;
+    VTDIOASContainer *container;
+    Error *err = NULL;
+    uint32_t ioas_id;
+    int ret;
+
+    /* try to attach to an existing container in this space */
+    QLIST_FOREACH(container, &s->containers, next) {
+        if (container->iommufd != iommufd) {
+            continue;
+        }
+
+        if (vtd_device_attach_container(vtd_hiod, container, pasid, pe, hwpt,
+                                        &err)) {
+            const char *msg = error_get_pretty(err);
+
+            trace_vtd_device_fail_attach_existing_container(msg);
+            error_free(err);
+            err = NULL;
+        } else {
+            return 0;
+        }
+    }
+
+    /* Need to allocate a new dedicated container */
+    ret = iommufd_backend_alloc_ioas(iommufd, &ioas_id, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    trace_vtd_device_alloc_ioas(iommufd->fd, ioas_id);
+
+    container = g_malloc0(sizeof(*container));
+    container->iommufd = iommufd;
+    container->ioas_id = ioas_id;
+    QLIST_INIT(&container->s2_hwpt_list);
+
+    if (vtd_device_attach_container(vtd_hiod, container, pasid, pe, hwpt,
+                                    errp)) {
+        goto err_attach_container;
+    }
+
+    container->listener = iommufd_s2domain_memory_listener;
+    memory_listener_register(&container->listener, &address_space_memory);
+
+    if (container->error) {
+        ret = -1;
+        error_propagate_prepend(errp, container->error,
+                                "memory listener initialization failed: ");
+        goto err_listener_register;
+    }
+
+    QLIST_INSERT_HEAD(&s->containers, container, next);
+
+    return 0;
+
+err_listener_register:
+    vtd_device_detach_container(vtd_hiod, pasid, pe, hwpt, errp);
+err_attach_container:
+    iommufd_backend_free_id(iommufd, container->ioas_id);
+    g_free(container);
+    return ret;
+}
+
+static void vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+                                      uint32_t pasid, VTDPASIDEntry *pe,
+                                      VTDHwpt *hwpt, Error **errp)
+{
+    VTDIOASContainer *container = hwpt->s2_hwpt->container;
+
+    vtd_device_detach_container(vtd_hiod, pasid, pe, hwpt, errp);
+    vtd_ioas_container_destroy(container);
+}
+
+static int vtd_device_attach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
+                                   VTDAddressSpace *vtd_as, VTDPASIDEntry *pe)
+{
+    /*
+     * If pe->gptt != FLT, should be go ahead to do bind as host only
+     * accepts guest FLT under nesting. If pe->pgtt==PT, should setup
+     * the pasid with GPA page table. Otherwise should return failure.
+     */
+    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
+        return -EINVAL;
+    }
+
+    /* Should fail if the FLPT base is 0 */
+    if (vtd_pe_pgtt_is_flt(pe) && !vtd_pe_get_flpt_base(pe)) {
+        return -EINVAL;
+    }
+
+    return vtd_device_attach_iommufd(vtd_hiod, vtd_as->pasid, pe,
+                                     &vtd_as->hwpt, &error_abort);
+}
+
+static int vtd_device_detach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
+                                   VTDAddressSpace *vtd_as)
+{
+    VTDPASIDEntry *cached_pe = vtd_as->pasid_cache_entry.cache_filled ?
+                       &vtd_as->pasid_cache_entry.pasid_entry : NULL;
+
+    if (!cached_pe ||
+        (!vtd_pe_pgtt_is_flt(cached_pe) && !vtd_pe_pgtt_is_pt(cached_pe))) {
+        return 0;
+    }
+
+    vtd_device_detach_iommufd(vtd_hiod, vtd_as->pasid, cached_pe,
+                              &vtd_as->hwpt, &error_abort);
+
+    return 0;
+}
+
+/**
+ * Caller should hold iommu_lock.
+ */
+static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
+                                VTDPASIDEntry *pe, VTDPASIDOp op)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDHostIOMMUDevice *vtd_hiod;
+    int devfn = vtd_as->devfn;
+    int ret = -EINVAL;
+    struct vtd_as_key key = {
+        .bus = vtd_as->bus,
+        .devfn = devfn,
+    };
+
+    vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+    if (!vtd_hiod || !vtd_hiod->hiod) {
+        /* means no need to go further, e.g. for emulated devices */
+        return 0;
+    }
+
+    if (vtd_as->pasid != PCI_NO_PASID) {
+        error_report("Non-rid_pasid %d not supported yet", vtd_as->pasid);
+        return ret;
+    }
+
+    switch (op) {
+    case VTD_PASID_UPDATE:
+    case VTD_PASID_BIND:
+    {
+        ret = vtd_device_attach_pgtbl(vtd_hiod, vtd_as, pe);
+        break;
+    }
+    case VTD_PASID_UNBIND:
+    {
+        ret = vtd_device_detach_pgtbl(vtd_hiod, vtd_as);
+        break;
+    }
+    default:
+        error_report_once("Unknown VTDPASIDOp!!!\n");
+        break;
+    }
+
+    return ret;
+}
+#else
+static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
+                                VTDPASIDEntry *pe, VTDPASIDOp op)
+{
+    return 0;
+}
+#endif
+
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field after shifting
  */
@@ -3145,21 +3669,27 @@ static bool vtd_pasid_entry_compare(VTDPASIDEntry *p1, VTDPASIDEntry *p2)
  * This function fills in the pasid entry in &vtd_as. Caller
  * of this function should hold iommu_lock.
  */
-static void vtd_fill_pe_in_cache(IntelIOMMUState *s, VTDAddressSpace *vtd_as,
-                                 VTDPASIDEntry *pe)
+static int vtd_fill_pe_in_cache(IntelIOMMUState *s, VTDAddressSpace *vtd_as,
+                                VTDPASIDEntry *pe)
 {
     VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
+    int ret;
 
-    if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
-        /* No need to go further as cached pasid entry is latest */
-        return;
+    if (pc_entry->cache_filled) {
+        if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
+            /* No need to go further as cached pasid entry is latest */
+            return 0;
+        }
+        ret = vtd_bind_guest_pasid(vtd_as, pe, VTD_PASID_UPDATE);
+    } else {
+        ret = vtd_bind_guest_pasid(vtd_as, pe, VTD_PASID_BIND);
     }
 
-    pc_entry->pasid_entry = *pe;
-    pc_entry->cache_filled = true;
-    /*
-     * TODO: send pasid bind to host for passthru devices
-     */
+    if (!ret) {
+        pc_entry->pasid_entry = *pe;
+        pc_entry->cache_filled = true;
+    }
+    return ret;
 }
 
 /*
@@ -3225,14 +3755,20 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer value,
         goto remove;
     }
 
-    vtd_fill_pe_in_cache(s, vtd_as, &pe);
+    if (vtd_fill_pe_in_cache(s, vtd_as, &pe)) {
+        pasid_cache_info_set_error(pc_info);
+    }
     return false;
 
 remove:
-    /*
-     * TODO: send pasid unbind to host for passthru devices
-     */
-    pc_entry->cache_filled = false;
+    if (pc_entry->cache_filled) {
+        if (vtd_bind_guest_pasid(vtd_as, NULL, VTD_PASID_UNBIND)) {
+            pasid_cache_info_set_error(pc_info);
+            return false;
+        } else {
+            pc_entry->cache_filled = false;
+        }
+    }
 
     /*
      * Don't remove address space of PCI_NO_PASID which is created by PCI
@@ -3247,7 +3783,7 @@ remove:
 /* Caller of this function should hold iommu_lock */
 static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s)
 {
-    VTDPASIDCacheInfo pc_info;
+    VTDPASIDCacheInfo pc_info = { .error_happened = false, };
 
     trace_vtd_pasid_cache_reset();
 
@@ -3308,7 +3844,9 @@ static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
                 pasid = pasid_next;
                 continue;
             }
-            vtd_fill_pe_in_cache(s, vtd_as, &pe);
+            if (vtd_fill_pe_in_cache(s, vtd_as, &pe)) {
+                pasid_cache_info_set_error(info);
+            }
         }
         pasid = pasid_next;
     }
@@ -3416,6 +3954,9 @@ static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
         walk_info.devfn = vtd_hiod->devfn;
         vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
     }
+    if (walk_info.error_happened) {
+        pasid_cache_info_set_error(pc_info);
+    }
 }
 
 /*
@@ -3488,9 +4029,9 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
 static bool vtd_process_pasid_desc(IntelIOMMUState *s,
                                    VTDInvDesc *inv_desc)
 {
+    VTDPASIDCacheInfo pc_info = { .error_happened = false, };
     uint16_t domain_id;
     uint32_t pasid;
-    VTDPASIDCacheInfo pc_info;
     uint64_t mask[4] = {VTD_INV_DESC_PASIDC_RSVD_VAL0, VTD_INV_DESC_ALL_ONE,
                         VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE};
 
@@ -3529,7 +4070,7 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
     }
 
     vtd_pasid_cache_sync(s, &pc_info);
-    return true;
+    return !pc_info.error_happened ? true : false;
 }
 
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index c8a936eb46..de903a0033 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -73,6 +73,14 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
 vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
 vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
 vtd_reset_exit(void) ""
+vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
+vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d pasid %d ret: %d"
+vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
+vtd_device_fail_attach_existing_hwpt(const char *msg) " %s"
+vtd_device_attach_container(int fd, uint32_t dev_id, uint32_t pasid, uint32_t ioas_id, uint32_t hwpt_id) "iommufd %d dev_id %d pasid %d ioas_id %d hwpt_id %d"
+vtd_device_detach_container(int fd, uint32_t dev_id, uint32_t pasid) "iommufd %d dev_id %d pasid %d"
+vtd_device_fail_attach_existing_container(const char *msg) " %s"
+vtd_device_alloc_ioas(int fd, uint32_t ioas_id) "iommufd %d ioas_id %d"
 
 # amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
-- 
2.34.1

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months, 3 weeks ago

On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
> +static const MemoryListener iommufd_s2domain_memory_listener = {
> +    .name = "iommufd_s2domain",
> +    .priority = 1000,
> +    .region_add = iommufd_listener_region_add_s2domain,
> +    .region_del = iommufd_listener_region_del_s2domain,
> +};

Would you mind elaborating When and how vtd does all S2 mappings?

On ARM, the default vfio_memory_listener could capture the entire
guest RAM and add to the address space. So what we do is basically
reusing the vfio_memory_listener:
https://lore.kernel.org/qemu-devel/20250311141045.66620-13-shameerali.kolothum.thodi@huawei.com/

The thing is that when a VFIO device is attached to the container
upon a nesting configuration, the ->get_address_space op should
return the system address space as S1 nested HWPT isn't allocated
yet. Then all the iommu as routines in vfio_listener_region_add()
would be skipped, ending up with mapping the guest RAM in S2 HWPT
correctly. Not until the S1 nested HWPT is allocated by the guest
OS (after guest boots), can the ->get_address_space op return the
iommu address space.

With this address space shift, S2 mappings can be simply captured
and done by vfio_memory_listener. Then, such an s2domain listener
would be largely redundant.

So the second question is:
Does vtd have to own this iommufd_s2domain_memory_listener? IOW,
does vtd_host_dma_iommu() have to return the iommu address space
all the time?

> +static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> +                              VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
> +                              VTDPASIDEntry *pe, Error **errp)
> +{
> +    struct iommu_hwpt_vtd_s1 vtd;
> +    uint32_t hwpt_id, s2_hwpt_id = s2_hwpt->hwpt_id;
> +
> +    vtd_init_s1_hwpt_data(&vtd, pe);
> +
> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                    s2_hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
> +                                    sizeof(vtd), &vtd, &hwpt_id, errp)) {
> +        return -EINVAL;
> +    }
> +
> +    hwpt->hwpt_id = hwpt_id;
> +
> +    return 0;
> +}
> +
> +static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev, VTDHwpt *hwpt)
> +{
> +    iommufd_backend_free_id(idev->iommufd, hwpt->hwpt_id);
> +}

I think you did some substantial work to isolate the get_hw_info
part inside the iommufd backend code, which looks nice and clean
as the vIOMMU code simply does iodc->get_cap().

However, that then makes these direct raw backend function calls
very awkward :-/

In my view, the way to make sense is either:
* We don't do any isolation, but just call raw backend functions
  in vIOMMU code
* We do every isolation, and never call raw backend functions in
  vIOMMU code
?

Thanks
Nicolin

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months, 3 weeks ago

Hey Nic,

On 2025/5/22 06:49, Nicolin Chen wrote:
> On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
>> +static const MemoryListener iommufd_s2domain_memory_listener = {
>> +    .name = "iommufd_s2domain",
>> +    .priority = 1000,
>> +    .region_add = iommufd_listener_region_add_s2domain,
>> +    .region_del = iommufd_listener_region_del_s2domain,
>> +};
> 
> Would you mind elaborating When and how vtd does all S2 mappings?
> 
> On ARM, the default vfio_memory_listener could capture the entire
> guest RAM and add to the address space. So what we do is basically
> reusing the vfio_memory_listener:
> https://lore.kernel.org/qemu-devel/20250311141045.66620-13-shameerali.kolothum.thodi@huawei.com/

in concept yes, all the guest ram. but due to an errata, we need
to skip the RO mappings.

> The thing is that when a VFIO device is attached to the container
> upon a nesting configuration, the ->get_address_space op should
> return the system address space as S1 nested HWPT isn't allocated
> yet. Then all the iommu as routines in vfio_listener_region_add()
> would be skipped, ending up with mapping the guest RAM in S2 HWPT
> correctly. Not until the S1 nested HWPT is allocated by the guest
> OS (after guest boots), can the ->get_address_space op return the
> iommu address space.

This seems a bit different between ARM and VT-d emulation. The VT-d
emulation code returns the iommu address space regardless of what
translation mode guest configured. But the MR of the address space
has two overlapped subregions, one is nodmar, another one is iommu.
As the naming shows, the nodmar is aliased to the system MR. And before
the guest enables iommu and set PGTT to a non-PT mode (e.g. S1 or S2),
the effective MR alias is the nodmar, hence the mapping this address
space holds are the GPA mappings in the beginning. If guest set PGTT to S2,
then the iommu MR is enabled, hence the mapping is gIOVA mappings
accordingly. So in VT-d emulation, the address space switch is more the MR
alias switching.

In this series, we mainly want to support S1 translation type for guest.
And it is based on nested translation, which needs a S2 domain that holds
the GPA mappings. Besides S1 translation type, PT is also supported. Both
the two types need a S2 domain which already holds GPA mappings. So we have
this internal listener. Also, we want to skip RO mappings on S2, so that's
another reason for it.  @Zhenzhong, perhaps, it can be described in the
commit message why an internal listener is introduced.

> 
> With this address space shift, S2 mappings can be simply captured
> and done by vfio_memory_listener. Then, such an s2domain listener
> would be largely redundant.

hope above addressed your question.

> So the second question is:
> Does vtd have to own this iommufd_s2domain_memory_listener? IOW,

yes based on the current design. when guest GPTT==PT, attach device
to S2 hwpt, when it goes to S1, then attach it to a S1 hwpt whose
parent is the aforementioned S2 hwpt. This S2 hwpt is always there
for use.

> does vtd_host_dma_iommu() have to return the iommu address space
> all the time?

yes, all the time.

-- 
Regards,
Yi Liu

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months, 3 weeks ago

Hey,

Thanks for the reply.

Just want to say that I am asking a lot to understand why VT-d is
different than ARM, so as to decide whether ARM should follow VT-d
implementing a separate listener or just use the VFIO listener.

On Fri, May 23, 2025 at 02:22:15PM +0800, Yi Liu wrote:
> Hey Nic,
> 
> On 2025/5/22 06:49, Nicolin Chen wrote:
> > On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
> > > +static const MemoryListener iommufd_s2domain_memory_listener = {
> > > +    .name = "iommufd_s2domain",
> > > +    .priority = 1000,
> > > +    .region_add = iommufd_listener_region_add_s2domain,
> > > +    .region_del = iommufd_listener_region_del_s2domain,
> > > +};
> > 
> > Would you mind elaborating When and how vtd does all S2 mappings?
> > 
> > On ARM, the default vfio_memory_listener could capture the entire
> > guest RAM and add to the address space. So what we do is basically
> > reusing the vfio_memory_listener:
> > https://lore.kernel.org/qemu-devel/20250311141045.66620-13-shameerali.kolothum.thodi@huawei.com/
> 
> in concept yes, all the guest ram. but due to an errata, we need
> to skip the RO mappings.

Mind elaborating what are RO mappings? Can those be possible within
the range of the RAM?

> > The thing is that when a VFIO device is attached to the container
> > upon a nesting configuration, the ->get_address_space op should
> > return the system address space as S1 nested HWPT isn't allocated
> > yet. Then all the iommu as routines in vfio_listener_region_add()
> > would be skipped, ending up with mapping the guest RAM in S2 HWPT
> > correctly. Not until the S1 nested HWPT is allocated by the guest
> > OS (after guest boots), can the ->get_address_space op return the
> > iommu address space.
> 
> This seems a bit different between ARM and VT-d emulation. The VT-d
> emulation code returns the iommu address space regardless of what
> translation mode guest configured. But the MR of the address space
> has two overlapped subregions, one is nodmar, another one is iommu.
> As the naming shows, the nodmar is aliased to the system MR.

OK. But why two overlapped subregions v.s. two separate two ASs?

> And before
> the guest enables iommu and set PGTT to a non-PT mode (e.g. S1 or S2),
> the effective MR alias is the nodmar, hence the mapping this address
> space holds are the GPA mappings in the beginning.

I think this is same on ARM, where get_address_space() may return
system address space. And for VT-d, it actually returns the range
of the system address space (just though a sub MR of an iommu AS),
right? 

> If guest set PGTT to S2,
> then the iommu MR is enabled, hence the mapping is gIOVA mappings
> accordingly. So in VT-d emulation, the address space switch is more the MR
> alias switching.

Zhenzhong said that there is no shadow page table for the nesting
setup, i.e. gIOVA=>gPA mappings are entirely done by the guest OS.

Then, why does VT-d need to switch to the iommu MR here?

> In this series, we mainly want to support S1 translation type for guest.
> And it is based on nested translation, which needs a S2 domain that holds
> the GPA mappings. Besides S1 translation type, PT is also supported. Both
> the two types need a S2 domain which already holds GPA mappings. So we have
> this internal listener.

Hmm, the reasoning to the last "so" doesn't sound enough. The VFIO
listener could do the same...

> Also, we want to skip RO mappings on S2, so that's
> another reason for it.  @Zhenzhong, perhaps, it can be described in the
> commit message why an internal listener is introduced.

OK. I think that can be a good reason to have an internal listener,
only if VFIO can't skip the RO mappings.

> > So the second question is:
> > Does vtd have to own this iommufd_s2domain_memory_listener? IOW,
> 
> yes based on the current design. when guest GPTT==PT, attach device
> to S2 hwpt, when it goes to S1, then attach it to a S1 hwpt whose
> parent is the aforementioned S2 hwpt. This S2 hwpt is always there
> for use.

ARM is doing the same thing. And the exact point "this S2 hwpt is
always there for use" has been telling me that the device can just
stay at the S2 address space (system), since the guest kernel will
take care of the S1 address space (iommu).

Overall, the questions here have been two-fold:

1.Why does VT-d need an internal listener?

  I can see the (only) reason is for the RO mappings.

  Yet, Is there anything that we can do to the VFIO listener to
  bypass these RO mappings?

2.Why not return the system AS all the time when nesting is on?
  Why switch to the iommu AS when device attaches to S1 HWPT?

  For ARM, MSI requires a translation so it has to; but MSI on
  VT-d doesn't. So, I couldn't see why VT-d needs to return the
  iommu AS via get_address_space().

  However, combining the question-1, my gut feeling is that, VT-d
  needs to skip RO mappings for errata, while the VFIO listener
  can't do that. So, VT-d has to create its own listener. And to
  avoid duplicated mappings on the same address space, it has to
  bypass the VFIO listener by working it around with an IOMMU AS.
  Is this correct?

Thanks
Nic

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months, 3 weeks ago

On 2025/5/24 05:12, Nicolin Chen wrote:
> Hey,
> 
> Thanks for the reply.
> 
> Just want to say that I am asking a lot to understand why VT-d is
> different than ARM, so as to decide whether ARM should follow VT-d
> implementing a separate listener or just use the VFIO listener.
> 
> On Fri, May 23, 2025 at 02:22:15PM +0800, Yi Liu wrote:
>> Hey Nic,
>>
>> On 2025/5/22 06:49, Nicolin Chen wrote:
>>> On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
>>>> +static const MemoryListener iommufd_s2domain_memory_listener = {
>>>> +    .name = "iommufd_s2domain",
>>>> +    .priority = 1000,
>>>> +    .region_add = iommufd_listener_region_add_s2domain,
>>>> +    .region_del = iommufd_listener_region_del_s2domain,
>>>> +};
>>>
>>> Would you mind elaborating When and how vtd does all S2 mappings?
>>>
>>> On ARM, the default vfio_memory_listener could capture the entire
>>> guest RAM and add to the address space. So what we do is basically
>>> reusing the vfio_memory_listener:
>>> https://lore.kernel.org/qemu-devel/20250311141045.66620-13-shameerali.kolothum.thodi@huawei.com/
>>
>> in concept yes, all the guest ram. but due to an errata, we need
>> to skip the RO mappings.
> 
> Mind elaborating what are RO mappings? Can those be possible within
> the range of the RAM?

Below are RO regions when booting Q35 machine (this is the pcie capable 
platform and also vIOMMU capable), 4GB memory. For the bios and rom
regions, it looks reasonable. I'm not quite sure why there is RO RAM yet.
But it seems to be the fact we need to face.

vfio_listener_region_add, section->mr->name: pc.bios, iova: fffc0000, size: 
40000, vaddr: 7fb314200000, RO
vfio_listener_region_add, section->mr->name: pc.rom, iova: c0000, size: 
20000, vaddr: 7fb206c00000, RO
vfio_listener_region_add, section->mr->name: pc.bios, iova: e0000, size: 
20000, vaddr: 7fb314220000, RO
vfio_listener_region_add, section->mr->name: pc.rom, iova: d8000, size: 
8000, vaddr: 7fb206c18000, RO
vfio_listener_region_add, section->mr->name: pc.bios, iova: e0000, size: 
10000, vaddr: 7fb314220000, RO
vfio_listener_region_add, section->mr->name: vga.rom, iova: febc0000, size: 
10000, vaddr: 7fb205800000, RO
vfio_listener_region_add, section->mr->name: virtio-net-pci.rom, iova: 
feb80000, size: 40000, vaddr: 7fb205600000, RO
vfio_listener_region_add, section->mr->name: pc.ram, iova: c0000, size: 
b000, vaddr: 7fb207ec0000, RO
vfio_listener_region_add, section->mr->name: pc.ram, iova: ce000, size: 
a000, vaddr: 7fb207ece000, RO
vfio_listener_region_add, section->mr->name: pc.ram, iova: f0000, size: 
10000, vaddr: 7fb207ef0000, RO
vfio_listener_region_add, section->mr->name: pc.ram, iova: ce000, size: 
1a000, vaddr: 7fb207ece000, RO

>>> The thing is that when a VFIO device is attached to the container
>>> upon a nesting configuration, the ->get_address_space op should
>>> return the system address space as S1 nested HWPT isn't allocated
>>> yet. Then all the iommu as routines in vfio_listener_region_add()
>>> would be skipped, ending up with mapping the guest RAM in S2 HWPT
>>> correctly. Not until the S1 nested HWPT is allocated by the guest
>>> OS (after guest boots), can the ->get_address_space op return the
>>> iommu address space.
>>
>> This seems a bit different between ARM and VT-d emulation. The VT-d
>> emulation code returns the iommu address space regardless of what
>> translation mode guest configured. But the MR of the address space
>> has two overlapped subregions, one is nodmar, another one is iommu.
>> As the naming shows, the nodmar is aliased to the system MR.
> 
> OK. But why two overlapped subregions v.s. two separate two ASs?

TBH. I don't have the exact reason about it. +Cc Peter if he remembers
it or not.

IMHO. At least for vfio devices, I can see only one get_address_space()
call. So even there are two ASs, how should the vfio be notified when the
AS changed? Since vIOMMU is the source of map/umap requests, it looks fine
to always return iommu AS and handle the AS switch by switching the enabled
subregions according to the guest vIOMMU translation types.

> 
>> And before
>> the guest enables iommu and set PGTT to a non-PT mode (e.g. S1 or S2),
>> the effective MR alias is the nodmar, hence the mapping this address
>> space holds are the GPA mappings in the beginning.
> 
> I think this is same on ARM, where get_address_space() may return
> system address space. And for VT-d, it actually returns the range
> of the system address space (just though a sub MR of an iommu AS),
> right?

hmmm, I'm not quite getting why it is similar. As I replied, the VT-d
emulation code returns iommu AS in get_address_space(). I didn't see
where it returns address_space_memory (the system address space).

> 
>> If guest set PGTT to S2,
>> then the iommu MR is enabled, hence the mapping is gIOVA mappings
>> accordingly. So in VT-d emulation, the address space switch is more the MR
>> alias switching.
> 
> Zhenzhong said that there is no shadow page table for the nesting
> setup, i.e. gIOVA=>gPA mappings are entirely done by the guest OS.
> 
> Then, why does VT-d need to switch to the iommu MR here?

what I described in prior email is the general idea of the AS switching
before this series. nesting for sure does not need this switching just like
PT.

>> In this series, we mainly want to support S1 translation type for guest.
>> And it is based on nested translation, which needs a S2 domain that holds
>> the GPA mappings. Besides S1 translation type, PT is also supported. Both
>> the two types need a S2 domain which already holds GPA mappings. So we have
>> this internal listener.
> 
> Hmm, the reasoning to the last "so" doesn't sound enough. The VFIO
> listener could do the same...

yes. I just realized that RO mappings should be allowed for the normal
S2 domains. Only the nested parent S2 domain should skip the RO mappings.

> 
>> Also, we want to skip RO mappings on S2, so that's
>> another reason for it.  @Zhenzhong, perhaps, it can be described in the
>> commit message why an internal listener is introduced.
> 
> OK. I think that can be a good reason to have an internal listener,
> only if VFIO can't skip the RO mappings.
> 
>>> So the second question is:
>>> Does vtd have to own this iommufd_s2domain_memory_listener? IOW,
>>
>> yes based on the current design. when guest GPTT==PT, attach device
>> to S2 hwpt, when it goes to S1, then attach it to a S1 hwpt whose
>> parent is the aforementioned S2 hwpt. This S2 hwpt is always there
>> for use.
> 
> ARM is doing the same thing. And the exact point "this S2 hwpt is
> always there for use" has been telling me that the device can just
> stay at the S2 address space (system), since the guest kernel will
> take care of the S1 address space (iommu).
> 
> Overall, the questions here have been two-fold:
> 
> 1.Why does VT-d need an internal listener?
> 
>    I can see the (only) reason is for the RO mappings.
> 
>    Yet, Is there anything that we can do to the VFIO listener to
>    bypass these RO mappings?
> 
> 2.Why not return the system AS all the time when nesting is on?
>    Why switch to the iommu AS when device attaches to S1 HWPT?

no switch if going to setup nesting.

Just got a question on ARM side. IIUC. The ARM emulation code will return
the system address space in the get_address_space() op before guest enables
vIOMMU. Hence the IOAS in the vfio side is GPA IOAS. When guest enables
vIOMMU, the emulation will return iommu address space. Hence, the vfio side
needs switch to gIOVA IOAS? My question is if guest is setting S1
translation, and the emulation code figures out it is going to set up
nested translation, will the get_address_space() op return the iommu
address space as well? If so, where is the GPA IOAS locates? In this
series, the VT-d emulation code actually has an internal GPA IOAS which
skips RO mappings.

-- 
Regards,
Yi Liu

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months, 3 weeks ago

OK. Let me clarify this at the top as I see the gap here now:

First, the vSMMU model is based on Zhenzhong's older series that
keeps an ioas_id in the HostIOMMUDeviceIOMMUFD structure, which
now it only keeps an hwpt_id in this RFCv3 series. This ioas_id
is allocated when a passthrough cdev attaches to a VFIO container.

Second, the vSMMU model reuses the default IOAS via that ioas_id.
Since the VFIO container doesn't allocate a nesting parent S2 HWPT
(maybe it could?), so the vSMMU allocates another S2 HWPT in the
vIOMMU code.

Third, the vSMMU model, for invalidation efficiency and HW Queue
support, isolates all emulated devices out of the nesting-enabled
vSMMU instance, suggested by Jason. So, only passthrough devices
would use the nesting-enabled vSMMU instance, meaning there is no
need of IOMMU_NOTIFIER_IOTLB_EVENTS:
 - MAP is not needed as there is no shadow page table. QEMU only
   traps the page table pointer and forwards it to host kernel.
 - UNMAP is not needed as QEMU only traps invalidation requests
   and forwards them to host kernel.

(let's forget about the "address space switch" for MSI for now.)

So, in the vSMMU model, there is actually no need for the iommu
AS. And there is only one IOAS in the VM instance allocated by the
VFIO container. And this IOAS manages the GPA->PA mappings. So,
get_address_space() returns the system AS for passthrough devices.

On the other hand, the VT-d model is a bit different. It's a giant
vIOMMU for all devices (either passthrough or emualted). For all
emulated devices, it needs IOMMU_NOTIFIER_IOTLB_EVENTS, i.e. the
iommu address space returned via get_address_space().

That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
for passthrough devices, right?

IIUIC, in the VT-d model, a passthrough device also gets attached
to the VFIO container via iommufd_cdev_attach, allocating an IOAS.
But it returns the iommu address space, treating them like those
emulated devices, although the underlying MR of the returned IOMMU
AS is backed by a nodmar MR (that is essentially a system AS).

This seems to completely ignore the default IOAS owned by the VFIO
container, because it needs to bypass those RO mappings(?)

Then for passthrough devices, the VT-d model allocates an internal
IOAS that further requires an internal S2 listener, which seems an
large duplication of what the VFIO container already does..

So, here are things that I want us to conclude:
 1) Since the VFIO container already has an IOAS for a passthrough
    device, and IOMMU_NOTIFIER_IOTLB_EVENTS isn't seemingly needed,
    why not setup this default IOAS to manage gPA=>PA mappings by
    returning the system AS via get_address_space() for passthrough
    devices?

    I got that the VT-d model might have some concern against this,
    as the default listener would map those RO regions. Yet, maybe
    the right approach is to figure out a way to bypass RO regions
    in the core v.s. duplicating another ioas_alloc()/map() and S2
    listener?

 2) If (1) makes sense, I think we can further simplify the routine
    by allocating a nesting parent HWPT in iommufd_cdev_attach(),
    as long as the attaching device is identified as "passthrough"
    and there is "iommufd" in its "-device" string?

    After all, IOMMU_HWPT_ALLOC_NEST_PARENT is a common flag.

On Mon, May 26, 2025 at 03:24:50PM +0800, Yi Liu wrote:
> vfio_listener_region_add, section->mr->name: pc.bios, iova: fffc0000, size:
> 40000, vaddr: 7fb314200000, RO
> vfio_listener_region_add, section->mr->name: pc.rom, iova: c0000, size:
> 20000, vaddr: 7fb206c00000, RO
..
> vfio_listener_region_add, section->mr->name: pc.ram, iova: ce000, size:
> 1a000, vaddr: 7fb207ece000, RO

OK. They look like memory carveouts for FWs. "iova" is gPA right?

And they can be in the range of a guest RAM..

Mind elaborating why they shouldn't be mapped onto nesting parent
S2?

> IMHO. At least for vfio devices, I can see only one get_address_space()
> call. So even there are two ASs, how should the vfio be notified when the
> AS changed? Since vIOMMU is the source of map/umap requests, it looks fine
> to always return iommu AS and handle the AS switch by switching the enabled
> subregions according to the guest vIOMMU translation types.

No, VFIO doesn't get notified when the AS changes.

The vSMMU model wants VFIO to stay in the system AS since the VFIO
container manages the S2 mappings for guest PA.

The "switch" in vSMMU model is only needed by KVM for MSI doorbell
translation. By thinking it carefully, maybe it shouldn't switch AS
because VFIO might be confused if it somehow does get_address_space
again in the future..

Thanks
Nic

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months, 3 weeks ago


>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>OK. Let me clarify this at the top as I see the gap here now:
>
>First, the vSMMU model is based on Zhenzhong's older series that
>keeps an ioas_id in the HostIOMMUDeviceIOMMUFD structure, which
>now it only keeps an hwpt_id in this RFCv3 series. This ioas_id
>is allocated when a passthrough cdev attaches to a VFIO container.
>
>Second, the vSMMU model reuses the default IOAS via that ioas_id.
>Since the VFIO container doesn't allocate a nesting parent S2 HWPT
>(maybe it could?), so the vSMMU allocates another S2 HWPT in the
>vIOMMU code.
>
>Third, the vSMMU model, for invalidation efficiency and HW Queue
>support, isolates all emulated devices out of the nesting-enabled
>vSMMU instance, suggested by Jason. So, only passthrough devices
>would use the nesting-enabled vSMMU instance, meaning there is no
>need of IOMMU_NOTIFIER_IOTLB_EVENTS:

I see, then you need to check if there is emulated device under nesting-enabled vSMMU and fail if there is.

> - MAP is not needed as there is no shadow page table. QEMU only
>   traps the page table pointer and forwards it to host kernel.
> - UNMAP is not needed as QEMU only traps invalidation requests
>   and forwards them to host kernel.
>
>(let's forget about the "address space switch" for MSI for now.)
>
>So, in the vSMMU model, there is actually no need for the iommu
>AS. And there is only one IOAS in the VM instance allocated by the
>VFIO container. And this IOAS manages the GPA->PA mappings. So,
>get_address_space() returns the system AS for passthrough devices.
>
>On the other hand, the VT-d model is a bit different. It's a giant
>vIOMMU for all devices (either passthrough or emualted). For all
>emulated devices, it needs IOMMU_NOTIFIER_IOTLB_EVENTS, i.e. the
>iommu address space returned via get_address_space().
>
>That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
>for passthrough devices, right?

No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
e.g., with kernel cmdline intel_iommu=on,sm_off

So before guest run, we don't know which kind of page table either stage1 or stage2
for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
if guest choose stage2.

>
>IIUIC, in the VT-d model, a passthrough device also gets attached
>to the VFIO container via iommufd_cdev_attach, allocating an IOAS.
>But it returns the iommu address space, treating them like those
>emulated devices, although the underlying MR of the returned IOMMU
>AS is backed by a nodmar MR (that is essentially a system AS).
>
>This seems to completely ignore the default IOAS owned by the VFIO
>container, because it needs to bypass those RO mappings(?)
>
>Then for passthrough devices, the VT-d model allocates an internal
>IOAS that further requires an internal S2 listener, which seems an
>large duplication of what the VFIO container already does..
>
>So, here are things that I want us to conclude:
> 1) Since the VFIO container already has an IOAS for a passthrough
>    device, and IOMMU_NOTIFIER_IOTLB_EVENTS isn't seemingly needed,
>    why not setup this default IOAS to manage gPA=>PA mappings by
>    returning the system AS via get_address_space() for passthrough
>    devices?
>
>    I got that the VT-d model might have some concern against this,
>    as the default listener would map those RO regions. Yet, maybe
>    the right approach is to figure out a way to bypass RO regions
>    in the core v.s. duplicating another ioas_alloc()/map() and S2
>    listener?
>
> 2) If (1) makes sense, I think we can further simplify the routine
>    by allocating a nesting parent HWPT in iommufd_cdev_attach(),
>    as long as the attaching device is identified as "passthrough"
>    and there is "iommufd" in its "-device" string?
>
>    After all, IOMMU_HWPT_ALLOC_NEST_PARENT is a common flag.
>
>On Mon, May 26, 2025 at 03:24:50PM +0800, Yi Liu wrote:
>> vfio_listener_region_add, section->mr->name: pc.bios, iova: fffc0000, size:
>> 40000, vaddr: 7fb314200000, RO
>> vfio_listener_region_add, section->mr->name: pc.rom, iova: c0000, size:
>> 20000, vaddr: 7fb206c00000, RO
>..
>> vfio_listener_region_add, section->mr->name: pc.ram, iova: ce000, size:
>> 1a000, vaddr: 7fb207ece000, RO
>
>OK. They look like memory carveouts for FWs. "iova" is gPA right?
>
>And they can be in the range of a guest RAM..
>
>Mind elaborating why they shouldn't be mapped onto nesting parent
>S2?
>
>> IMHO. At least for vfio devices, I can see only one get_address_space()
>> call. So even there are two ASs, how should the vfio be notified when the
>> AS changed? Since vIOMMU is the source of map/umap requests, it looks fine
>> to always return iommu AS and handle the AS switch by switching the enabled
>> subregions according to the guest vIOMMU translation types.
>
>No, VFIO doesn't get notified when the AS changes.
>
>The vSMMU model wants VFIO to stay in the system AS since the VFIO
>container manages the S2 mappings for guest PA.
>
>The "switch" in vSMMU model is only needed by KVM for MSI doorbell
>translation. By thinking it carefully, maybe it shouldn't switch AS
>because VFIO might be confused if it somehow does get_address_space
>again in the future..
>
>Thanks
>Nic

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months ago

Sorry for a late reply.

On Wed, May 28, 2025 at 07:12:25AM +0000, Duan, Zhenzhong wrote:
> >Third, the vSMMU model, for invalidation efficiency and HW Queue
> >support, isolates all emulated devices out of the nesting-enabled
> >vSMMU instance, suggested by Jason. So, only passthrough devices
> >would use the nesting-enabled vSMMU instance, meaning there is no
> >need of IOMMU_NOTIFIER_IOTLB_EVENTS:
> 
> I see, then you need to check if there is emulated device under nesting-enabled vSMMU and fail if there is.

Shameer is working on a multi-vSMMU model in the QEMU. This gives
each VM different instances to attach devices. And we do not plan
to support emulated devices on an nesting enabled vSMMU instance,
which is a bit different than the VT-d model.

> >On the other hand, the VT-d model is a bit different. It's a giant
> >vIOMMU for all devices (either passthrough or emualted). For all
> >emulated devices, it needs IOMMU_NOTIFIER_IOTLB_EVENTS, i.e. the
> >iommu address space returned via get_address_space().
> >
> >That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
> >for passthrough devices, right?
> 
> No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
> supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
> e.g., with kernel cmdline intel_iommu=on,sm_off
> 
> So before guest run, we don't know which kind of page table either stage1 or stage2
> for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
> if guest choose stage2.

IIUIC, the guest kernel cmdline can switch the mode between the
stage1 (nesting) and stage2 (legacy/emulated VT-d), right?

Thanks
Nicolin

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months ago


>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>Sorry for a late reply.
>
>On Wed, May 28, 2025 at 07:12:25AM +0000, Duan, Zhenzhong wrote:
>> >Third, the vSMMU model, for invalidation efficiency and HW Queue
>> >support, isolates all emulated devices out of the nesting-enabled
>> >vSMMU instance, suggested by Jason. So, only passthrough devices
>> >would use the nesting-enabled vSMMU instance, meaning there is no
>> >need of IOMMU_NOTIFIER_IOTLB_EVENTS:
>>
>> I see, then you need to check if there is emulated device under nesting-enabled
>vSMMU and fail if there is.
>
>Shameer is working on a multi-vSMMU model in the QEMU. This gives
>each VM different instances to attach devices. And we do not plan
>to support emulated devices on an nesting enabled vSMMU instance,
>which is a bit different than the VT-d model.

I see.

>
>> >On the other hand, the VT-d model is a bit different. It's a giant
>> >vIOMMU for all devices (either passthrough or emualted). For all
>> >emulated devices, it needs IOMMU_NOTIFIER_IOTLB_EVENTS, i.e. the
>> >iommu address space returned via get_address_space().
>> >
>> >That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
>> >for passthrough devices, right?
>>
>> No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
>> supports stage-1 translation, guest still can choose to run in legacy
>mode(stage2),
>> e.g., with kernel cmdline intel_iommu=on,sm_off
>>
>> So before guest run, we don't know which kind of page table either stage1 or
>stage2
>> for this VFIO device by guest. So we have to use iommu AS to catch stage2's
>MAP event
>> if guest choose stage2.
>
>IIUIC, the guest kernel cmdline can switch the mode between the
>stage1 (nesting) and stage2 (legacy/emulated VT-d), right?

Right. E.g., kexec from "intel_iommu=on,sm_on" to "intel_iommu=on,sm_off",
Then first kernel will run in scalable mode and use stage1(nesting) and second kernel will run in legacy mode and use stage2.

Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months ago

On Mon, Jun 16, 2025 at 08:15:11AM +0000, Duan, Zhenzhong wrote:
> >IIUIC, the guest kernel cmdline can switch the mode between the
> >stage1 (nesting) and stage2 (legacy/emulated VT-d), right?
> 
> Right. E.g., kexec from "intel_iommu=on,sm_on" to "intel_iommu=on,sm_off",
> Then first kernel will run in scalable mode and use stage1(nesting) and
> second kernel will run in legacy mode and use stage2.

In scalable mode, guest kernel has a stage1 (nested) domain and
host kernel has a stage2 (nesting parent) domain. In this case,
the VFIO container IOAS could be the system AS corresponding to
the kernel-managed stage2 domain.

In legacy mode, guest kernel has a stage2 (normal) domain while
host kernel has a stage2 (shadow) domain? In this case, the VFIO
container IOAS should be the iommu AS corresponding to the kernel
guest-level stage2 domain (or should it be shadow)?

The ARM model that Shameer is proposing only allows a nested SMMU
when such a legacy mode is off. This simplifies a lot of things.
But the difficulty of the VT-d model is that it has to rely on a
guest bootcmd during runtime..

Nicolin

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Jason Gunthorpe 5 months ago

On Mon, Jun 16, 2025 at 08:14:27PM -0700, Nicolin Chen wrote:
> On Mon, Jun 16, 2025 at 08:15:11AM +0000, Duan, Zhenzhong wrote:
> > >IIUIC, the guest kernel cmdline can switch the mode between the
> > >stage1 (nesting) and stage2 (legacy/emulated VT-d), right?
> > 
> > Right. E.g., kexec from "intel_iommu=on,sm_on" to "intel_iommu=on,sm_off",
> > Then first kernel will run in scalable mode and use stage1(nesting) and
> > second kernel will run in legacy mode and use stage2.
> 
> In scalable mode, guest kernel has a stage1 (nested) domain and
> host kernel has a stage2 (nesting parent) domain. In this case,
> the VFIO container IOAS could be the system AS corresponding to
> the kernel-managed stage2 domain.
> 
> In legacy mode, guest kernel has a stage2 (normal) domain while
> host kernel has a stage2 (shadow) domain? In this case, the VFIO
> container IOAS should be the iommu AS corresponding to the kernel
> guest-level stage2 domain (or should it be shadow)?

What you want is to disable HW support for legacy mode in qemu so the
kernel rejects sm_off operation.

The HW spec is really goofy, we get an ecap_slts but it only applies
to a PASID table entry (scalable mode). So the HW has to support
second stage for legacy always but can turn it off for PASID?

IMHO the intention was to allow the VMM to not support shadowing, but
it seems the execution was mangled.

I suggest fixing the Linux driver to refuse to run in sm_on mode if
the HW supports scalable mode and ecap_slts = false. That may not be
100% spec compliant but it seems like a reasonable approach.

> The ARM model that Shameer is proposing only allows a nested SMMU
> when such a legacy mode is off. This simplifies a lot of things.
> But the difficulty of the VT-d model is that it has to rely on a
> guest bootcmd during runtime..

ARM is cleaner because it doesn't have these drivers issues. qemu can
reliably say not to use the S2 and all the existing guest kernels will
obey that.

AMD has the same issues, BTW, arguably even worse as I didn't notice
any way to specify if the v1 page table is supported :\

Jason

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/6/17 20:37, Jason Gunthorpe wrote:
> On Mon, Jun 16, 2025 at 08:14:27PM -0700, Nicolin Chen wrote:
>> On Mon, Jun 16, 2025 at 08:15:11AM +0000, Duan, Zhenzhong wrote:
>>>> IIUIC, the guest kernel cmdline can switch the mode between the
>>>> stage1 (nesting) and stage2 (legacy/emulated VT-d), right?
>>>
>>> Right. E.g., kexec from "intel_iommu=on,sm_on" to "intel_iommu=on,sm_off",
>>> Then first kernel will run in scalable mode and use stage1(nesting) and
>>> second kernel will run in legacy mode and use stage2.
>>
>> In scalable mode, guest kernel has a stage1 (nested) domain and
>> host kernel has a stage2 (nesting parent) domain. In this case,
>> the VFIO container IOAS could be the system AS corresponding to
>> the kernel-managed stage2 domain.
>>
>> In legacy mode, guest kernel has a stage2 (normal) domain while
>> host kernel has a stage2 (shadow) domain? In this case, the VFIO
>> container IOAS should be the iommu AS corresponding to the kernel
>> guest-level stage2 domain (or should it be shadow)?
> 
> What you want is to disable HW support for legacy mode in qemu so the
> kernel rejects sm_off operation.

that can be the future. :)

> The HW spec is really goofy, we get an ecap_slts but it only applies
> to a PASID table entry (scalable mode). So the HW has to support
> second stage for legacy always but can turn it off for PASID?

yes. legacy mode (page table following second stage format) is anyhow
supported.

> IMHO the intention was to allow the VMM to not support shadowing, but
> it seems the execution was mangled.
> 
> I suggest fixing the Linux driver to refuse to run in sm_on mode if
> the HW supports scalable mode and ecap_slts = false. That may not be
> 100% spec compliant but it seems like a reasonable approach.

running sm_on with only ecap_flts==true is what we want here. We want
the guest use stage-1 page table hence it can be used by hw under the
nested translation mode. While this page table is only available in sm_on
mode.

If we want to drop the legacy mode usage in virtualization environment, we
might let linux iommu driver refuse running legacy mode while ecap_slts is
false. I suppose HW is going to advertise both ecap_slts and ecap_flts. So
this will just let guest get rid of using legacy mode.

But this is not necessary so far. As the discussion going here, we intend
to reuse the GPA HWPT allocated by VFIO container as well.[1] This is now
aligned with Nic and Shameer.

[1] 
https://lore.kernel.org/qemu-devel/b3d31287-4de5-4e0e-a81b-99f82edd5bcc@intel.com/

>> The ARM model that Shameer is proposing only allows a nested SMMU
>> when such a legacy mode is off. This simplifies a lot of things.
>> But the difficulty of the VT-d model is that it has to rely on a
>> guest bootcmd during runtime..
> 
> ARM is cleaner because it doesn't have these drivers issues. qemu can
> reliably say not to use the S2 and all the existing guest kernels will
> obey that.

out of curious, does SMMU have legacy mode or a given version of SMMU
only supports either legacy mode or newer mode?

> AMD has the same issues, BTW, arguably even worse as I didn't notice
> any way to specify if the v1 page table is supported :\
> 
> Jason

-- 
Regards,
Yi Liu

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Jason Gunthorpe 5 months ago

On Tue, Jun 17, 2025 at 09:03:32PM +0800, Yi Liu wrote:
> > I suggest fixing the Linux driver to refuse to run in sm_on mode if
> > the HW supports scalable mode and ecap_slts = false. That may not be
> > 100% spec compliant but it seems like a reasonable approach.
> 
> running sm_on with only ecap_flts==true is what we want here. We want
> the guest use stage-1 page table hence it can be used by hw under the
> nested translation mode. While this page table is only available in sm_on
> mode.
> 
> If we want to drop the legacy mode usage in virtualization environment, we
> might let linux iommu driver refuse running legacy mode while ecap_slts is
> false. I suppose HW is going to advertise both ecap_slts and ecap_flts. So
> this will just let guest get rid of using legacy mode.
> 
> But this is not necessary so far. As the discussion going here, we intend
> to reuse the GPA HWPT allocated by VFIO container as well.[1] This is now
> aligned with Nic and Shameer.

I think it is an issue, nobody really wants to accidently start
supporting and using shadow mode just because the VM is misconfigured.

What is desirable is to make this automatic and ensure we stay in the
nesting configuration only.

> > ARM is cleaner because it doesn't have these drivers issues. qemu can
> > reliably say not to use the S2 and all the existing guest kernels will
> > obey that.
> 
> out of curious, does SMMU have legacy mode or a given version of SMMU
> only supports either legacy mode or newer mode?

The SMMUv3 spec started out with definitions for S1 and S2 as well as
capability bits for them at day 0. So it never had this backward
compatible problem where we want to remove something that was
a mandatory part of the specification.

Jason

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/6/17 21:11, Jason Gunthorpe wrote:
> On Tue, Jun 17, 2025 at 09:03:32PM +0800, Yi Liu wrote:
>>> I suggest fixing the Linux driver to refuse to run in sm_on mode if
>>> the HW supports scalable mode and ecap_slts = false. That may not be
>>> 100% spec compliant but it seems like a reasonable approach.
>>
>> running sm_on with only ecap_flts==true is what we want here. We want
>> the guest use stage-1 page table hence it can be used by hw under the
>> nested translation mode. While this page table is only available in sm_on
>> mode.
>>
>> If we want to drop the legacy mode usage in virtualization environment, we
>> might let linux iommu driver refuse running legacy mode while ecap_slts is
>> false. I suppose HW is going to advertise both ecap_slts and ecap_flts. So
>> this will just let guest get rid of using legacy mode.
>>
>> But this is not necessary so far. As the discussion going here, we intend
>> to reuse the GPA HWPT allocated by VFIO container as well.[1] This is now
>> aligned with Nic and Shameer.
> 
> I think it is an issue, nobody really wants to accidently start
> supporting and using shadow mode just because the VM is misconfigured.

hmmm. intel iommu driver makes sm_on by default since v5.15. So if guest
configs sm_off, that means it wants it. For the kernel <5.15, yes it will
use legacy mode if it has not configured sm_on explicitly. So this seems
not an issue.

Actually, as I explained in the first hunk of [1], there is no issue with
the legacy mode support. :)

[1] 
https://lore.kernel.org/qemu-devel/20250521111452.3316354-1-zhenzhong.duan@intel.com/T/#m4c8fa70742001d4c22b3c297e240a2151d2c617f

> What is desirable is to make this automatic and ensure we stay in the
> nesting configuration only.

yes, once QEMU supports nested translation based vIOMMU, it's better to use
sm_on instead of legacy. I think for the kernels >= 5.15, this automation
has already been achieved since sm is default on.

>>> ARM is cleaner because it doesn't have these drivers issues. qemu can
>>> reliably say not to use the S2 and all the existing guest kernels will
>>> obey that.
>>
>> out of curious, does SMMU have legacy mode or a given version of SMMU
>> only supports either legacy mode or newer mode?
> 
> The SMMUv3 spec started out with definitions for S1 and S2 as well as
> capability bits for them at day 0. So it never had this backward
> compatible problem where we want to remove something that was
> a mandatory part of the specification.

got it. yes, it's all about backward compatible support.

Regards,
Yi Liu

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Jason Gunthorpe 5 months ago

On Wed, Jun 18, 2025 at 11:40:38AM +0800, Yi Liu wrote:

> Actually, as I explained in the first hunk of [1], there is no issue with
> the legacy mode support. :)
> 
> [1] https://lore.kernel.org/qemu-devel/20250521111452.3316354-1-zhenzhong.duan@intel.com/T/#m4c8fa70742001d4c22b3c297e240a2151d2c617f

My feeling is that it is undesirable to have the shadowing code in the
VMM at all, as it increases the attack surface/complexity/etc.

There should be a way to fully inhibit legacy mode, and if that means
old kernels don't work that's just how it is.

Jason

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months ago


>-----Original Message-----
>From: Jason Gunthorpe <jgg@nvidia.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>On Tue, Jun 17, 2025 at 09:03:32PM +0800, Yi Liu wrote:
>> > I suggest fixing the Linux driver to refuse to run in sm_on mode if
>> > the HW supports scalable mode and ecap_slts = false. That may not be
>> > 100% spec compliant but it seems like a reasonable approach.
>>
>> running sm_on with only ecap_flts==true is what we want here. We want
>> the guest use stage-1 page table hence it can be used by hw under the
>> nested translation mode. While this page table is only available in sm_on
>> mode.
>>
>> If we want to drop the legacy mode usage in virtualization environment, we
>> might let linux iommu driver refuse running legacy mode while ecap_slts is
>> false. I suppose HW is going to advertise both ecap_slts and ecap_flts. So
>> this will just let guest get rid of using legacy mode.
>>
>> But this is not necessary so far. As the discussion going here, we intend
>> to reuse the GPA HWPT allocated by VFIO container as well.[1] This is now
>> aligned with Nic and Shameer.
>
>I think it is an issue, nobody really wants to accidently start
>supporting and using shadow mode just because the VM is misconfigured.
>
>What is desirable is to make this automatic and ensure we stay in the
>nesting configuration only.

This will break QEMU's back compatibility. Current QEMU supports legacy
mode when ecap_flts==true, but a newer QEMU not?

Thanks
Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/5/28 15:12, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Nicolin Chen <nicolinc@nvidia.com>
>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>> host
>>
>> OK. Let me clarify this at the top as I see the gap here now:
>>
>> First, the vSMMU model is based on Zhenzhong's older series that
>> keeps an ioas_id in the HostIOMMUDeviceIOMMUFD structure, which
>> now it only keeps an hwpt_id in this RFCv3 series. This ioas_id
>> is allocated when a passthrough cdev attaches to a VFIO container.
>>
>> Second, the vSMMU model reuses the default IOAS via that ioas_id.
>> Since the VFIO container doesn't allocate a nesting parent S2 HWPT
>> (maybe it could?), so the vSMMU allocates another S2 HWPT in the
>> vIOMMU code.
>>
>> Third, the vSMMU model, for invalidation efficiency and HW Queue
>> support, isolates all emulated devices out of the nesting-enabled
>> vSMMU instance, suggested by Jason. So, only passthrough devices
>> would use the nesting-enabled vSMMU instance, meaning there is no
>> need of IOMMU_NOTIFIER_IOTLB_EVENTS:
> 
> I see, then you need to check if there is emulated device under nesting-enabled vSMMU and fail if there is.
> 
>> - MAP is not needed as there is no shadow page table. QEMU only
>>    traps the page table pointer and forwards it to host kernel.
>> - UNMAP is not needed as QEMU only traps invalidation requests
>>    and forwards them to host kernel.
>>
>> (let's forget about the "address space switch" for MSI for now.)
>>
>> So, in the vSMMU model, there is actually no need for the iommu
>> AS. And there is only one IOAS in the VM instance allocated by the
>> VFIO container. And this IOAS manages the GPA->PA mappings. So,
>> get_address_space() returns the system AS for passthrough devices.
>>
>> On the other hand, the VT-d model is a bit different. It's a giant
>> vIOMMU for all devices (either passthrough or emualted). For all
>> emulated devices, it needs IOMMU_NOTIFIER_IOTLB_EVENTS, i.e. the
>> iommu address space returned via get_address_space().
>>
>> That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
>> for passthrough devices, right?
> 
> No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
> supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
> e.g., with kernel cmdline intel_iommu=on,sm_off
> 
> So before guest run, we don't know which kind of page table either stage1 or stage2
> for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
> if guest choose stage2.

@Zheznzhong, if guest decides to use legacy mode then vIOMMU should switch
the MRs of the device's AS, hence the IOAS created by VFIO container would
be switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
switched to IOMMU MR. So it should be able to support shadowing the guest
IO page table. Hence, this should not be a problem.

@Nicolin, I think your major point is making the VFIO container IOAS as a
GPA IOAS (always return system AS in get_address_space op) and reusing it
when setting nested translation. Is it? I think it should work if:
1) we can let the vfio memory listener filter out the RO pages per vIOMMU's
    request. But I don't want the get_address_space op always return system
    AS as the reason mentioned by Zhenzhong above.
2) we can disallow emulated/passthru devices behind the same pcie-pci
    bridge[1]. For emulated devices, AS should switch to iommu MR, while for
    passthru devices, it needs the AS stick with the system MR hence be able
    to keep the VFIO container IOAS as a GPA IOAS. To support this, let AS
    switch to iommu MR and have a separate GPA IOAS is needed. This separate
    GPA IOAS can be shared by all the passthru devices.

[1] 
https://lore.kernel.org/all/SJ0PR11MB6744E2BA00BBE677B2B49BE99265A@SJ0PR11MB6744.namprd11.prod.outlook.com/#t

So basically, we are ok with your idea. But we should decide if it is 
necessary to support the topology in 2). I think this is a general
question. TBH. I don't have much information to judge if it is valuable.
Perhaps, let's hear from more people.

>>
>> IIUIC, in the VT-d model, a passthrough device also gets attached
>> to the VFIO container via iommufd_cdev_attach, allocating an IOAS.
>> But it returns the iommu address space, treating them like those
>> emulated devices, although the underlying MR of the returned IOMMU
>> AS is backed by a nodmar MR (that is essentially a system AS).
>>
>> This seems to completely ignore the default IOAS owned by the VFIO
>> container, because it needs to bypass those RO mappings(?)
>>
>> Then for passthrough devices, the VT-d model allocates an internal
>> IOAS that further requires an internal S2 listener, which seems an
>> large duplication of what the VFIO container already does..
>>
>> So, here are things that I want us to conclude:
>> 1) Since the VFIO container already has an IOAS for a passthrough
>>     device, and IOMMU_NOTIFIER_IOTLB_EVENTS isn't seemingly needed,
>>     why not setup this default IOAS to manage gPA=>PA mappings by
>>     returning the system AS via get_address_space() for passthrough
>>     devices?
>>
>>     I got that the VT-d model might have some concern against this,
>>     as the default listener would map those RO regions. Yet, maybe
>>     the right approach is to figure out a way to bypass RO regions
>>     in the core v.s. duplicating another ioas_alloc()/map() and S2
>>     listener?
>>
>> 2) If (1) makes sense, I think we can further simplify the routine
>>     by allocating a nesting parent HWPT in iommufd_cdev_attach(),
>>     as long as the attaching device is identified as "passthrough"
>>     and there is "iommufd" in its "-device" string?
>>
>>     After all, IOMMU_HWPT_ALLOC_NEST_PARENT is a common flag.
>>
>> On Mon, May 26, 2025 at 03:24:50PM +0800, Yi Liu wrote:
>>> vfio_listener_region_add, section->mr->name: pc.bios, iova: fffc0000, size:
>>> 40000, vaddr: 7fb314200000, RO
>>> vfio_listener_region_add, section->mr->name: pc.rom, iova: c0000, size:
>>> 20000, vaddr: 7fb206c00000, RO
>> ..
>>> vfio_listener_region_add, section->mr->name: pc.ram, iova: ce000, size:
>>> 1a000, vaddr: 7fb207ece000, RO
>>
>> OK. They look like memory carveouts for FWs. "iova" is gPA right?
>>
>> And they can be in the range of a guest RAM..
>>
>> Mind elaborating why they shouldn't be mapped onto nesting parent
>> S2?

@Nicolin, It's due to ERRATA_772415.

>>> IMHO. At least for vfio devices, I can see only one get_address_space()
>>> call. So even there are two ASs, how should the vfio be notified when the
>>> AS changed? Since vIOMMU is the source of map/umap requests, it looks fine
>>> to always return iommu AS and handle the AS switch by switching the enabled
>>> subregions according to the guest vIOMMU translation types.
>>
>> No, VFIO doesn't get notified when the AS changes.
>>
>> The vSMMU model wants VFIO to stay in the system AS since the VFIO
>> container manages the S2 mappings for guest PA.
>>
>> The "switch" in vSMMU model is only needed by KVM for MSI doorbell
>> translation. By thinking it carefully, maybe it shouldn't switch AS
>> because VFIO might be confused if it somehow does get_address_space
>> again in the future..

@Nicolin, not quite get the detailed logic for the MSI stuff on SMMU. But I
agree with the last sentence. get_address_space should return a consistent
AS.

-- 
Regards,
Yi Liu

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months ago

On Thu, Jun 12, 2025 at 08:53:40PM +0800, Yi Liu wrote:
> > > That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
> > > for passthrough devices, right?
> > 
> > No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
> > supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
> > e.g., with kernel cmdline intel_iommu=on,sm_off
> > 
> > So before guest run, we don't know which kind of page table either stage1 or stage2
> > for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
> > if guest choose stage2.
> 
> @Zheznzhong, if guest decides to use legacy mode then vIOMMU should switch
> the MRs of the device's AS, hence the IOAS created by VFIO container would
> be switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
> switched to IOMMU MR. So it should be able to support shadowing the guest
> IO page table. Hence, this should not be a problem.
> 
> @Nicolin, I think your major point is making the VFIO container IOAS as a
> GPA IOAS (always return system AS in get_address_space op) and reusing it
> when setting nested translation. Is it? I think it should work if:
> 1) we can let the vfio memory listener filter out the RO pages per vIOMMU's
>    request.

Yes.

> But I don't want the get_address_space op always return system
>    AS as the reason mentioned by Zhenzhong above.

So, you mean the VT-d model would need a runtime notification to
switch the address space of the VFIO ioas?

TBH, I am still unclear how many cases the VT-d model would need
support here :-/

> 2) we can disallow emulated/passthru devices behind the same pcie-pci
>    bridge[1]. For emulated devices, AS should switch to iommu MR, while for
>    passthru devices, it needs the AS stick with the system MR hence be able
>    to keep the VFIO container IOAS as a GPA IOAS. To support this, let AS
>    switch to iommu MR and have a separate GPA IOAS is needed. This separate
>    GPA IOAS can be shared by all the passthru devices.

Yea, ARM is doing in a similar way.

> So basically, we are ok with your idea. But we should decide if it is
> necessary to support the topology in 2). I think this is a general
> question. TBH. I don't have much information to judge if it is valuable.
> Perhaps, let's hear from more people.

I would be okay if VT-d decides to move on with its own listener,
if it turns out to be the relatively better case. But for ARM, I'd
like to see we can reuse the VFIO container IOAS.

Thanks
Nicolin

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/6/16 13:59, Nicolin Chen wrote:
> On Thu, Jun 12, 2025 at 08:53:40PM +0800, Yi Liu wrote:
>>>> That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
>>>> for passthrough devices, right?
>>>
>>> No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
>>> supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
>>> e.g., with kernel cmdline intel_iommu=on,sm_off
>>>
>>> So before guest run, we don't know which kind of page table either stage1 or stage2
>>> for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
>>> if guest choose stage2.
>>
>> @Zheznzhong, if guest decides to use legacy mode then vIOMMU should switch
>> the MRs of the device's AS, hence the IOAS created by VFIO container would
>> be switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
>> switched to IOMMU MR. So it should be able to support shadowing the guest
>> IO page table. Hence, this should not be a problem.
>>
>> @Nicolin, I think your major point is making the VFIO container IOAS as a
>> GPA IOAS (always return system AS in get_address_space op) and reusing it
>> when setting nested translation. Is it? I think it should work if:
>> 1) we can let the vfio memory listener filter out the RO pages per vIOMMU's
>>     request.
> 
> Yes.
> 
>> But I don't want the get_address_space op always return system
>>     AS as the reason mentioned by Zhenzhong above.
> 
> So, you mean the VT-d model would need a runtime notification to
> switch the address space of the VFIO ioas?

It's not a notification. It's done by switching AS. Detail can be found
in vtd_switch_address_space().

> TBH, I am still unclear how many cases the VT-d model would need
> support here :-/
 >
>> 2) we can disallow emulated/passthru devices behind the same pcie-pci
>>     bridge[1]. For emulated devices, AS should switch to iommu MR, while for
>>     passthru devices, it needs the AS stick with the system MR hence be able
>>     to keep the VFIO container IOAS as a GPA IOAS. To support this, let AS
>>     switch to iommu MR and have a separate GPA IOAS is needed. This separate
>>     GPA IOAS can be shared by all the passthru devices.
> 
> Yea, ARM is doing in a similar way.
> 
>> So basically, we are ok with your idea. But we should decide if it is
>> necessary to support the topology in 2). I think this is a general
>> question. TBH. I don't have much information to judge if it is valuable.
>> Perhaps, let's hear from more people.
> 
> I would be okay if VT-d decides to move on with its own listener,
> if it turns out to be the relatively better case. But for ARM, I'd
> like to see we can reuse the VFIO container IOAS.

I didn't see a problem so far on this part. Have you seen any?

-- 
Regards,
Yi Liu

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months ago

On Mon, Jun 16, 2025 at 03:38:26PM +0800, Yi Liu wrote:
> On 2025/6/16 13:59, Nicolin Chen wrote:
> > On Thu, Jun 12, 2025 at 08:53:40PM +0800, Yi Liu wrote:
> > > > > That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
> > > > > for passthrough devices, right?
> > > > 
> > > > No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
> > > > supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
> > > > e.g., with kernel cmdline intel_iommu=on,sm_off
> > > > 
> > > > So before guest run, we don't know which kind of page table either stage1 or stage2
> > > > for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
> > > > if guest choose stage2.
> > > 
> > > @Zheznzhong, if guest decides to use legacy mode then vIOMMU should switch
> > > the MRs of the device's AS, hence the IOAS created by VFIO container would
> > > be switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
> > > switched to IOMMU MR. So it should be able to support shadowing the guest
> > > IO page table. Hence, this should not be a problem.
> > > 
> > > @Nicolin, I think your major point is making the VFIO container IOAS as a
> > > GPA IOAS (always return system AS in get_address_space op) and reusing it
> > > when setting nested translation. Is it? I think it should work if:
> > > 1) we can let the vfio memory listener filter out the RO pages per vIOMMU's
> > >     request.
> > 
> > Yes.
> > 
> > > But I don't want the get_address_space op always return system
> > >     AS as the reason mentioned by Zhenzhong above.
> > 
> > So, you mean the VT-d model would need a runtime notification to
> > switch the address space of the VFIO ioas?
> 
> It's not a notification. It's done by switching AS. Detail can be found
> in vtd_switch_address_space().

OK. I got confused about the "switch", thinking that was about
the get_address_space() call.

> > TBH, I am still unclear how many cases the VT-d model would need
> > support here :-/
> >
> > > 2) we can disallow emulated/passthru devices behind the same pcie-pci
> > >     bridge[1]. For emulated devices, AS should switch to iommu MR, while for
> > >     passthru devices, it needs the AS stick with the system MR hence be able
> > >     to keep the VFIO container IOAS as a GPA IOAS. To support this, let AS
> > >     switch to iommu MR and have a separate GPA IOAS is needed. This separate
> > >     GPA IOAS can be shared by all the passthru devices.
> > 
> > Yea, ARM is doing in a similar way.
> > 
> > > So basically, we are ok with your idea. But we should decide if it is
> > > necessary to support the topology in 2). I think this is a general
> > > question. TBH. I don't have much information to judge if it is valuable.
> > > Perhaps, let's hear from more people.
> > 
> > I would be okay if VT-d decides to move on with its own listener,
> > if it turns out to be the relatively better case. But for ARM, I'd
> > like to see we can reuse the VFIO container IOAS.
> 
> I didn't see a problem so far on this part. Have you seen any?

Probably no functional problem with that internal listener. ARM
could work using one like that as well. The only problem is code
duplication. It's not ideal for everybody to have an internal S2
listener while wasting the VFIO one.

But given that VT-d has more complicated use cases like runtime
guest-level configuration that switches between nesting and non-
nesting modes, perhaps having an internal listener is a better
idea?

Thanks
Nicolin

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/6/17 11:22, Nicolin Chen wrote:
> On Mon, Jun 16, 2025 at 03:38:26PM +0800, Yi Liu wrote:
>> On 2025/6/16 13:59, Nicolin Chen wrote:
>>> On Thu, Jun 12, 2025 at 08:53:40PM +0800, Yi Liu wrote:
>>>>>> That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
>>>>>> for passthrough devices, right?
>>>>>
>>>>> No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
>>>>> supports stage-1 translation, guest still can choose to run in legacy mode(stage2),
>>>>> e.g., with kernel cmdline intel_iommu=on,sm_off
>>>>>
>>>>> So before guest run, we don't know which kind of page table either stage1 or stage2
>>>>> for this VFIO device by guest. So we have to use iommu AS to catch stage2's MAP event
>>>>> if guest choose stage2.
>>>>
>>>> @Zheznzhong, if guest decides to use legacy mode then vIOMMU should switch
>>>> the MRs of the device's AS, hence the IOAS created by VFIO container would
>>>> be switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
>>>> switched to IOMMU MR. So it should be able to support shadowing the guest
>>>> IO page table. Hence, this should not be a problem.
>>>>
>>>> @Nicolin, I think your major point is making the VFIO container IOAS as a
>>>> GPA IOAS (always return system AS in get_address_space op) and reusing it
>>>> when setting nested translation. Is it? I think it should work if:
>>>> 1) we can let the vfio memory listener filter out the RO pages per vIOMMU's
>>>>      request.
>>>
>>> Yes.
>>>
>>>> But I don't want the get_address_space op always return system
>>>>      AS as the reason mentioned by Zhenzhong above.
>>>
>>> So, you mean the VT-d model would need a runtime notification to
>>> switch the address space of the VFIO ioas?
>>
>> It's not a notification. It's done by switching AS. Detail can be found
>> in vtd_switch_address_space().
> 
> OK. I got confused about the "switch", thinking that was about
> the get_address_space() call.

yeah, not that call. The all magic is the MR enable/disable. This will
switch to iommu MR hence the vfio_listener_region_add() will see the
MR is iommu MR and register iommu notifier.

>>> TBH, I am still unclear how many cases the VT-d model would need
>>> support here :-/
>>>
>>>> 2) we can disallow emulated/passthru devices behind the same pcie-pci
>>>>      bridge[1]. For emulated devices, AS should switch to iommu MR, while for
>>>>      passthru devices, it needs the AS stick with the system MR hence be able
>>>>      to keep the VFIO container IOAS as a GPA IOAS. To support this, let AS
>>>>      switch to iommu MR and have a separate GPA IOAS is needed. This separate
>>>>      GPA IOAS can be shared by all the passthru devices.
>>>
>>> Yea, ARM is doing in a similar way.
>>>
>>>> So basically, we are ok with your idea. But we should decide if it is
>>>> necessary to support the topology in 2). I think this is a general
>>>> question. TBH. I don't have much information to judge if it is valuable.
>>>> Perhaps, let's hear from more people.
>>>
>>> I would be okay if VT-d decides to move on with its own listener,
>>> if it turns out to be the relatively better case. But for ARM, I'd
>>> like to see we can reuse the VFIO container IOAS.
>>
>> I didn't see a problem so far on this part. Have you seen any?
> 
> Probably no functional problem with that internal listener. ARM
> could work using one like that as well. The only problem is code
> duplication. It's not ideal for everybody to have an internal S2
> listener while wasting the VFIO one.
> 
> But given that VT-d has more complicated use cases like runtime
> guest-level configuration that switches between nesting and non-
> nesting modes, perhaps having an internal listener is a better
> idea?

I noticed there is quite a few duplication now (container/ioas/hwpt). let's
see if anyone wants to put the emulated device and passthru devices under
the same pci bridge. If no, let's avoid duplicating code.

-- 
Regards,
Yi Liu

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months ago


>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>On 2025/5/28 15:12, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Nicolin Chen <nicolinc@nvidia.com>
>>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table
>to
>>> host
>>>
>>> OK. Let me clarify this at the top as I see the gap here now:
>>>
>>> First, the vSMMU model is based on Zhenzhong's older series that
>>> keeps an ioas_id in the HostIOMMUDeviceIOMMUFD structure, which
>>> now it only keeps an hwpt_id in this RFCv3 series. This ioas_id
>>> is allocated when a passthrough cdev attaches to a VFIO container.
>>>
>>> Second, the vSMMU model reuses the default IOAS via that ioas_id.
>>> Since the VFIO container doesn't allocate a nesting parent S2 HWPT
>>> (maybe it could?), so the vSMMU allocates another S2 HWPT in the
>>> vIOMMU code.
>>>
>>> Third, the vSMMU model, for invalidation efficiency and HW Queue
>>> support, isolates all emulated devices out of the nesting-enabled
>>> vSMMU instance, suggested by Jason. So, only passthrough devices
>>> would use the nesting-enabled vSMMU instance, meaning there is no
>>> need of IOMMU_NOTIFIER_IOTLB_EVENTS:
>>
>> I see, then you need to check if there is emulated device under nesting-enabled
>vSMMU and fail if there is.
>>
>>> - MAP is not needed as there is no shadow page table. QEMU only
>>>    traps the page table pointer and forwards it to host kernel.
>>> - UNMAP is not needed as QEMU only traps invalidation requests
>>>    and forwards them to host kernel.
>>>
>>> (let's forget about the "address space switch" for MSI for now.)
>>>
>>> So, in the vSMMU model, there is actually no need for the iommu
>>> AS. And there is only one IOAS in the VM instance allocated by the
>>> VFIO container. And this IOAS manages the GPA->PA mappings. So,
>>> get_address_space() returns the system AS for passthrough devices.
>>>
>>> On the other hand, the VT-d model is a bit different. It's a giant
>>> vIOMMU for all devices (either passthrough or emualted). For all
>>> emulated devices, it needs IOMMU_NOTIFIER_IOTLB_EVENTS, i.e. the
>>> iommu address space returned via get_address_space().
>>>
>>> That being said, IOMMU_NOTIFIER_IOTLB_EVENTS should not be needed
>>> for passthrough devices, right?
>>
>> No, even if x-flts=on is configured in QEMU cmdline, that only mean virtual vtd
>> supports stage-1 translation, guest still can choose to run in legacy
>mode(stage2),
>> e.g., with kernel cmdline intel_iommu=on,sm_off
>>
>> So before guest run, we don't know which kind of page table either stage1 or
>stage2
>> for this VFIO device by guest. So we have to use iommu AS to catch stage2's
>MAP event
>> if guest choose stage2.
>
>@Zheznzhong, if guest decides to use legacy mode then vIOMMU should switch
>the MRs of the device's AS, hence the IOAS created by VFIO container would
>be switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
>switched to IOMMU MR. So it should be able to support shadowing the guest
>IO page table. Hence, this should not be a problem.
>
>@Nicolin, I think your major point is making the VFIO container IOAS as a
>GPA IOAS (always return system AS in get_address_space op) and reusing it
>when setting nested translation. Is it? I think it should work if:
>1) we can let the vfio memory listener filter out the RO pages per vIOMMU's
>    request. But I don't want the get_address_space op always return system
>    AS as the reason mentioned by Zhenzhong above.
>2) we can disallow emulated/passthru devices behind the same pcie-pci
>    bridge[1]. For emulated devices, AS should switch to iommu MR, while for
>    passthru devices, it needs the AS stick with the system MR hence be able
>    to keep the VFIO container IOAS as a GPA IOAS. To support this, let AS
>    switch to iommu MR and have a separate GPA IOAS is needed. This separate
>    GPA IOAS can be shared by all the passthru devices.
>
>[1]
>https://lore.kernel.org/all/SJ0PR11MB6744E2BA00BBE677B2B49BE99265A@SJ0
>PR11MB6744.namprd11.prod.outlook.com/#t
>
>So basically, we are ok with your idea. But we should decide if it is
>necessary to support the topology in 2). I think this is a general
>question. TBH. I don't have much information to judge if it is valuable.
>Perhaps, let's hear from more people.

Hi @Liu, Yi L @Nicolin Chen, for emulated/passthru devices behind the same pcie-pci bridge, I think of an idea, adding a new PCI callback:

AddressSpace * (*get_address_space_extend)(PCIBus *bus, void *opaque, int devfn, bool accel_dev);

which pass in real bus/devfn and a new param accel_dev which is true for vfio device.
Vtd implements this callback and return separate AS for vfio device if it's under an pcie-pci bridge and flts=on;
otherwise it fallback to call .get_address_space(). This way emulated devices and passthru devices behind the same pcie-pci bridge can have different AS.

If above idea is acceptable, then only obstacle is ERRATA_772415, maybe we can let VFIO check this errata and bypass RO mapping from beginning?
Or we just block this VFIO device running with flts=on if ERRATA_772415 and suggesting running with flts=off?

Thanks
Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months ago

On Mon, Jun 16, 2025 at 03:24:06AM +0000, Duan, Zhenzhong wrote:
> Hi @Liu, Yi L @Nicolin Chen, for emulated/passthru devices
> behind the same pcie-pci bridge, I think of an idea, adding
> a new PCI callback:
> 
> AddressSpace * (*get_address_space_extend)(PCIBus *bus, 
> void *opaque, int devfn, bool accel_dev);
>
> which pass in real bus/devfn and a new param accel_dev which
> is true for vfio device.

Just =y for all vfio (passthrough) devices?

ARM tentatively does this for get_address_space using Shameer's
trick to detect if the device is a passthrough VFIO one:

    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
    bool has_iommufd = !!object_property_find(OBJECT(pdev), "iommufd");

    if (smmu->nested && ... && has_iommufd) {
        return &sdev->as_sysmem;
    }

So, I guess "accel_dev" could be just:
    !!object_property_find(OBJECT(pdev), "iommufd")
?

> Vtd implements this callback and return separate AS for vfio
> device if it's under an pcie-pci bridge and flts=on;
> otherwise it fallback to call .get_address_space(). This way
> emulated devices and passthru devices behind the same pcie-pci
> bridge can have different AS.

Again, if "vfio-device" tag with "iommufd" property is enough to
identify devices to separate their address spaces, perhaps the
existing get_address_space is enough.

> If above idea is acceptable, then only obstacle is ERRATA_772415,
> maybe we can let VFIO check this errata and bypass RO mapping from
> beginning?

Yes. There can be some communication between vIOMMU and the VFIO
core.

> Or we just block this VFIO device running with flts=on if
> ERRATA_772415 and suggesting running with flts=off?

That sounds like a simpler solution, so long as nobody complains
about this limitation :)

Thanks
Nicolin

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months ago


>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>On Mon, Jun 16, 2025 at 03:24:06AM +0000, Duan, Zhenzhong wrote:
>> Hi @Liu, Yi L @Nicolin Chen, for emulated/passthru devices
>> behind the same pcie-pci bridge, I think of an idea, adding
>> a new PCI callback:
>>
>> AddressSpace * (*get_address_space_extend)(PCIBus *bus,
>> void *opaque, int devfn, bool accel_dev);
>>
>> which pass in real bus/devfn and a new param accel_dev which
>> is true for vfio device.
>
>Just =y for all vfio (passthrough) devices?
>
>ARM tentatively does this for get_address_space using Shameer's
>trick to detect if the device is a passthrough VFIO one:
>
>    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>    bool has_iommufd = !!object_property_find(OBJECT(pdev), "iommufd");
>
>    if (smmu->nested && ... && has_iommufd) {
>        return &sdev->as_sysmem;
>    }
>
>So, I guess "accel_dev" could be just:
>    !!object_property_find(OBJECT(pdev), "iommufd")
>?

You are right, we don't need param accel_dev. Below should work:

object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)

>
>> Vtd implements this callback and return separate AS for vfio
>> device if it's under an pcie-pci bridge and flts=on;
>> otherwise it fallback to call .get_address_space(). This way
>> emulated devices and passthru devices behind the same pcie-pci
>> bridge can have different AS.
>
>Again, if "vfio-device" tag with "iommufd" property is enough to
>identify devices to separate their address spaces, perhaps the
>existing get_address_space is enough.

We need get_address_space_extend() to pass real BDF.
get_address_space pass group's BDF which made pci_find_device return wrong device.

>
>> If above idea is acceptable, then only obstacle is ERRATA_772415,
>> maybe we can let VFIO check this errata and bypass RO mapping from
>> beginning?
>
>Yes. There can be some communication between vIOMMU and the VFIO
>core.
>
>> Or we just block this VFIO device running with flts=on if
>> ERRATA_772415 and suggesting running with flts=off?
>
>That sounds like a simpler solution, so long as nobody complains
>about this limitation :)

I plan to apply this simpler solution except there is objection, because
I don't want to bring complexity to VFIO just for an Errata. I remember
ERRATA_772415 exists only on old SPR, @Liu, Yi L can correct me if I'm wrong.

We can also introduce a new vtd option ignore_errata to force ignore the errata
with a warning message in case user care more on performance over security.

With these two solutions, vtd can reuse VFIO container's IOAS and HWPT just
like ARM, so we are common on this part.

Comments welcome! If no objection, I will apply these solutions in v2.

Thanks
Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/6/16 16:54, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Nicolin Chen <nicolinc@nvidia.com>
>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>> host
>>
>> On Mon, Jun 16, 2025 at 03:24:06AM +0000, Duan, Zhenzhong wrote:
>>> Hi @Liu, Yi L @Nicolin Chen, for emulated/passthru devices
>>> behind the same pcie-pci bridge, I think of an idea, adding
>>> a new PCI callback:
>>>
>>> AddressSpace * (*get_address_space_extend)(PCIBus *bus,
>>> void *opaque, int devfn, bool accel_dev);
>>>
>>> which pass in real bus/devfn and a new param accel_dev which
>>> is true for vfio device.
>>
>> Just =y for all vfio (passthrough) devices?
>>

TBH. It's a bit hacky to me in concept. It may be more cleaner to detect
and block such topology.

BTW. @Nic, I suppose nesting vSMMUv3 does not have this concern since
you will put the passthru devices under a separate vIOMMU which should
ensure that the emulated devices won't share AS with passthrough device.
right?

>> ARM tentatively does this for get_address_space using Shameer's
>> trick to detect if the device is a passthrough VFIO one:
>>
>>     PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>>     bool has_iommufd = !!object_property_find(OBJECT(pdev), "iommufd");
>>
>>     if (smmu->nested && ... && has_iommufd) {
>>         return &sdev->as_sysmem;
>>     }
>>
>> So, I guess "accel_dev" could be just:
>>     !!object_property_find(OBJECT(pdev), "iommufd")
>> ?
> 
> You are right, we don't need param accel_dev. Below should work:
> 
> object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)
> 
>>
>>> Vtd implements this callback and return separate AS for vfio
>>> device if it's under an pcie-pci bridge and flts=on;
>>> otherwise it fallback to call .get_address_space(). This way
>>> emulated devices and passthru devices behind the same pcie-pci
>>> bridge can have different AS.
>>
>> Again, if "vfio-device" tag with "iommufd" property is enough to
>> identify devices to separate their address spaces, perhaps the
>> existing get_address_space is enough.
> 
> We need get_address_space_extend() to pass real BDF.
> get_address_space pass group's BDF which made pci_find_device return wrong device.
>
>>
>>> If above idea is acceptable, then only obstacle is ERRATA_772415,
>>> maybe we can let VFIO check this errata and bypass RO mapping from
>>> beginning?
>>
>> Yes. There can be some communication between vIOMMU and the VFIO
>> core.
>>
>>> Or we just block this VFIO device running with flts=on if
>>> ERRATA_772415 and suggesting running with flts=off?
>>
>> That sounds like a simpler solution, so long as nobody complains
>> about this limitation :)
> 
> I plan to apply this simpler solution except there is objection, because
> I don't want to bring complexity to VFIO just for an Errata. I remember
> ERRATA_772415 exists only on old SPR, @Liu, Yi L can correct me if I'm wrong.

hmmm. I'm fine to pass some info to vfio hence let vfio skip RO mappings.
Is there other info that VFIO needs to get from vIOMMU? Hope start adding
such mechanism with normal requirement. :)

-- 
Regards,
Yi Liu

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months ago


>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>On 2025/6/16 16:54, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Nicolin Chen <nicolinc@nvidia.com>
>>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table
>to
>>> host
>>>
>>> On Mon, Jun 16, 2025 at 03:24:06AM +0000, Duan, Zhenzhong wrote:
>>>> Hi @Liu, Yi L @Nicolin Chen, for emulated/passthru devices
>>>> behind the same pcie-pci bridge, I think of an idea, adding
>>>> a new PCI callback:
>>>>
>>>> AddressSpace * (*get_address_space_extend)(PCIBus *bus,
>>>> void *opaque, int devfn, bool accel_dev);
>>>>
>>>> which pass in real bus/devfn and a new param accel_dev which
>>>> is true for vfio device.
>>>
>>> Just =y for all vfio (passthrough) devices?
>>>
>
>TBH. It's a bit hacky to me in concept. It may be more cleaner to detect
>and block such topology.

OK, then we don't need get_address_space_extend(). Will do in v2.

>
>BTW. @Nic, I suppose nesting vSMMUv3 does not have this concern since
>you will put the passthru devices under a separate vIOMMU which should
>ensure that the emulated devices won't share AS with passthrough device.
>right?
>
>>> ARM tentatively does this for get_address_space using Shameer's
>>> trick to detect if the device is a passthrough VFIO one:
>>>
>>>     PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>>>     bool has_iommufd = !!object_property_find(OBJECT(pdev), "iommufd");
>>>
>>>     if (smmu->nested && ... && has_iommufd) {
>>>         return &sdev->as_sysmem;
>>>     }
>>>
>>> So, I guess "accel_dev" could be just:
>>>     !!object_property_find(OBJECT(pdev), "iommufd")
>>> ?
>>
>> You are right, we don't need param accel_dev. Below should work:
>>
>> object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)
>>
>>>
>>>> Vtd implements this callback and return separate AS for vfio
>>>> device if it's under an pcie-pci bridge and flts=on;
>>>> otherwise it fallback to call .get_address_space(). This way
>>>> emulated devices and passthru devices behind the same pcie-pci
>>>> bridge can have different AS.
>>>
>>> Again, if "vfio-device" tag with "iommufd" property is enough to
>>> identify devices to separate their address spaces, perhaps the
>>> existing get_address_space is enough.
>>
>> We need get_address_space_extend() to pass real BDF.
>> get_address_space pass group's BDF which made pci_find_device return wrong
>device.
>>
>>>
>>>> If above idea is acceptable, then only obstacle is ERRATA_772415,
>>>> maybe we can let VFIO check this errata and bypass RO mapping from
>>>> beginning?
>>>
>>> Yes. There can be some communication between vIOMMU and the VFIO
>>> core.
>>>
>>>> Or we just block this VFIO device running with flts=on if
>>>> ERRATA_772415 and suggesting running with flts=off?
>>>
>>> That sounds like a simpler solution, so long as nobody complains
>>> about this limitation :)
>>
>> I plan to apply this simpler solution except there is objection, because
>> I don't want to bring complexity to VFIO just for an Errata. I remember
>> ERRATA_772415 exists only on old SPR, @Liu, Yi L can correct me if I'm wrong.
>
>hmmm. I'm fine to pass some info to vfio hence let vfio skip RO mappings.
>Is there other info that VFIO needs to get from vIOMMU? Hope start adding
>such mechanism with normal requirement. :)

I can think of ERRATA_772415 and NESTED capability. NESTED used for creating
VFIO default HWPT in stage2 mode.

Thanks
Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months ago

On 2025/6/16 18:16, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>> host
>>
>> On 2025/6/16 16:54, Duan, Zhenzhong wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Nicolin Chen <nicolinc@nvidia.com>
>>>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table
>> to
>>>> host
>>>>
>>>> On Mon, Jun 16, 2025 at 03:24:06AM +0000, Duan, Zhenzhong wrote:
>>>>> Hi @Liu, Yi L @Nicolin Chen, for emulated/passthru devices
>>>>> behind the same pcie-pci bridge, I think of an idea, adding
>>>>> a new PCI callback:
>>>>>
>>>>> AddressSpace * (*get_address_space_extend)(PCIBus *bus,
>>>>> void *opaque, int devfn, bool accel_dev);
>>>>>
>>>>> which pass in real bus/devfn and a new param accel_dev which
>>>>> is true for vfio device.
>>>>
>>>> Just =y for all vfio (passthrough) devices?
>>>>
>>
>> TBH. It's a bit hacky to me in concept. It may be more cleaner to detect
>> and block such topology.
> 
> OK, then we don't need get_address_space_extend(). Will do in v2.
> 
>>
>> BTW. @Nic, I suppose nesting vSMMUv3 does not have this concern since
>> you will put the passthru devices under a separate vIOMMU which should
>> ensure that the emulated devices won't share AS with passthrough device.
>> right?
>>
>>>> ARM tentatively does this for get_address_space using Shameer's
>>>> trick to detect if the device is a passthrough VFIO one:
>>>>
>>>>      PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>>>>      bool has_iommufd = !!object_property_find(OBJECT(pdev), "iommufd");
>>>>
>>>>      if (smmu->nested && ... && has_iommufd) {
>>>>          return &sdev->as_sysmem;
>>>>      }
>>>>
>>>> So, I guess "accel_dev" could be just:
>>>>      !!object_property_find(OBJECT(pdev), "iommufd")
>>>> ?
>>>
>>> You are right, we don't need param accel_dev. Below should work:
>>>
>>> object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)
>>>
>>>>
>>>>> Vtd implements this callback and return separate AS for vfio
>>>>> device if it's under an pcie-pci bridge and flts=on;
>>>>> otherwise it fallback to call .get_address_space(). This way
>>>>> emulated devices and passthru devices behind the same pcie-pci
>>>>> bridge can have different AS.
>>>>
>>>> Again, if "vfio-device" tag with "iommufd" property is enough to
>>>> identify devices to separate their address spaces, perhaps the
>>>> existing get_address_space is enough.
>>>
>>> We need get_address_space_extend() to pass real BDF.
>>> get_address_space pass group's BDF which made pci_find_device return wrong
>> device.
>>>
>>>>
>>>>> If above idea is acceptable, then only obstacle is ERRATA_772415,
>>>>> maybe we can let VFIO check this errata and bypass RO mapping from
>>>>> beginning?
>>>>
>>>> Yes. There can be some communication between vIOMMU and the VFIO
>>>> core.
>>>>
>>>>> Or we just block this VFIO device running with flts=on if
>>>>> ERRATA_772415 and suggesting running with flts=off?
>>>>
>>>> That sounds like a simpler solution, so long as nobody complains
>>>> about this limitation :)
>>>
>>> I plan to apply this simpler solution except there is objection, because
>>> I don't want to bring complexity to VFIO just for an Errata. I remember
>>> ERRATA_772415 exists only on old SPR, @Liu, Yi L can correct me if I'm wrong.
>>
>> hmmm. I'm fine to pass some info to vfio hence let vfio skip RO mappings.
>> Is there other info that VFIO needs to get from vIOMMU? Hope start adding
>> such mechanism with normal requirement. :)
> 
> I can think of ERRATA_772415 and NESTED capability. NESTED used for creating
> VFIO default HWPT in stage2 mode.

yeah. NESTED should be a hard requirement. VFIO should allocate hwpt with 
nested_parent flag.

-- 
Regards,
Yi Liu

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Shameerali Kolothum Thodi via 5 months ago


> -----Original Message-----
> From: Yi Liu <yi.l.liu@intel.com>
> Sent: Thursday, June 12, 2025 1:54 PM
> To: Duan, Zhenzhong <zhenzhong.duan@intel.com>; Nicolin Chen
> <nicolinc@nvidia.com>
> Cc: Peter Xu <peterx@redhat.com>; qemu-devel@nongnu.org;
> alex.williamson@redhat.com; clg@redhat.com; eric.auger@redhat.com;
> mst@redhat.com; jasowang@redhat.com; ddutile@redhat.com;
> jgg@nvidia.com; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; joao.m.martins@oracle.com;
> clement.mathieu--drif@eviden.com; Tian, Kevin <kevin.tian@intel.com>;
> Peng, Chao P <chao.p.peng@intel.com>; Yi Sun <yi.y.sun@linux.intel.com>;
> Marcel Apfelbaum <marcel.apfelbaum@gmail.com>; Paolo Bonzini
> <pbonzini@redhat.com>; Richard Henderson
> <richard.henderson@linaro.org>; Eduardo Habkost
> <eduardo@habkost.net>
> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page
> table to host
 
> >> The "switch" in vSMMU model is only needed by KVM for MSI doorbell
> >> translation. By thinking it carefully, maybe it shouldn't switch AS
> >> because VFIO might be confused if it somehow does get_address_space
> >> again in the future..
> 
> @Nicolin, not quite get the detailed logic for the MSI stuff on SMMU. But I
> agree with the last sentence. get_address_space should return a consistent
> AS.

I think it is because, in ARM world the MSI doorbell address is translated by
an IOMMU. Hence, if the Guest device is behind IOMMU, it needs to return
the IOMMU AS in,

kvm_irqchip_add_msi_route()
 kvm_arch_fixup_msi_route()
   pci_device_iommu_address_space()  --> .get_address_space()  -->At this point we now return IOMMU AS.

If not the device will be configured with a  wrong MSI doorbell address.

Nicolin, you seems to suggest we could avoid this switching and always return
System AS. Does that mean we handle this KVM/MSI case separately?
Could you please detail out the idea?

Thanks,
Shameer

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months ago

On Thu, Jun 12, 2025 at 02:06:15PM +0000, Shameerali Kolothum Thodi wrote:
> > >> The "switch" in vSMMU model is only needed by KVM for MSI doorbell
> > >> translation. By thinking it carefully, maybe it shouldn't switch AS
> > >> because VFIO might be confused if it somehow does get_address_space
> > >> again in the future..
> > 
> > @Nicolin, not quite get the detailed logic for the MSI stuff on SMMU. But I
> > agree with the last sentence. get_address_space should return a consistent
> > AS.
> 
> I think it is because, in ARM world the MSI doorbell address is translated by
> an IOMMU. Hence, if the Guest device is behind IOMMU, it needs to return
> the IOMMU AS in,
> 
> kvm_irqchip_add_msi_route()
>  kvm_arch_fixup_msi_route()
>    pci_device_iommu_address_space()  --> .get_address_space()  -->At this point we now return IOMMU AS.
> 
> If not the device will be configured with a  wrong MSI doorbell address.

Yes. The KVM code on ARM needs to translate the MSI location from
gIOVA to gPA, because MSI on ARM is behind IOMMU.

> Nicolin, you seems to suggest we could avoid this switching and always return
> System AS. Does that mean we handle this KVM/MSI case separately?
> Could you please detail out the idea?

We could add one of following ops:
    get_msi_address_space
    get_msi_address/translate_msi_iova

Thanks
Nicolin

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months, 3 weeks ago


>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
...
>> yes based on the current design. when guest GPTT==PT, attach device
>> to S2 hwpt, when it goes to S1, then attach it to a S1 hwpt whose
>> parent is the aforementioned S2 hwpt. This S2 hwpt is always there
>> for use.
>
>ARM is doing the same thing. And the exact point "this S2 hwpt is
>always there for use" has been telling me that the device can just
>stay at the S2 address space (system), since the guest kernel will
>take care of the S1 address space (iommu).
>
>Overall, the questions here have been two-fold:
>
>1.Why does VT-d need an internal listener?
>
>  I can see the (only) reason is for the RO mappings.

It's not the only reason. Another reason is we want to support the
case that VFIO device and emulated device under same group,
e.g., they are under a PCIe-to-PCI bridge.

In fact, .get_address_space() returns the AS for the group rather
than device, see pci_device_get_iommu_bus_devfn().

>
>  Yet, Is there anything that we can do to the VFIO listener to
>  bypass these RO mappings?
>
>2.Why not return the system AS all the time when nesting is on?
>  Why switch to the iommu AS when device attaches to S1 HWPT?

The emulated device wants the iommu AS and VFIO device wants
System AS, just curious how you handle this case in your series?

Thanks
Zhenzhong

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months, 3 weeks ago


>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>Hey Nic,
>
>On 2025/5/22 06:49, Nicolin Chen wrote:
>> On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
>>> +static const MemoryListener iommufd_s2domain_memory_listener = {
>>> +    .name = "iommufd_s2domain",
>>> +    .priority = 1000,
>>> +    .region_add = iommufd_listener_region_add_s2domain,
>>> +    .region_del = iommufd_listener_region_del_s2domain,
>>> +};
>>
>> Would you mind elaborating When and how vtd does all S2 mappings?
>>
>> On ARM, the default vfio_memory_listener could capture the entire
>> guest RAM and add to the address space. So what we do is basically
>> reusing the vfio_memory_listener:
>> https://lore.kernel.org/qemu-devel/20250311141045.66620-13-
>shameerali.kolothum.thodi@huawei.com/
>
>in concept yes, all the guest ram. but due to an errata, we need
>to skip the RO mappings.
>
>> The thing is that when a VFIO device is attached to the container
>> upon a nesting configuration, the ->get_address_space op should
>> return the system address space as S1 nested HWPT isn't allocated
>> yet. Then all the iommu as routines in vfio_listener_region_add()
>> would be skipped, ending up with mapping the guest RAM in S2 HWPT
>> correctly. Not until the S1 nested HWPT is allocated by the guest
>> OS (after guest boots), can the ->get_address_space op return the
>> iommu address space.
>
>This seems a bit different between ARM and VT-d emulation. The VT-d
>emulation code returns the iommu address space regardless of what
>translation mode guest configured. But the MR of the address space
>has two overlapped subregions, one is nodmar, another one is iommu.
>As the naming shows, the nodmar is aliased to the system MR. And before
>the guest enables iommu and set PGTT to a non-PT mode (e.g. S1 or S2),
>the effective MR alias is the nodmar, hence the mapping this address
>space holds are the GPA mappings in the beginning. If guest set PGTT to S2,
>then the iommu MR is enabled, hence the mapping is gIOVA mappings
>accordingly. So in VT-d emulation, the address space switch is more the MR
>alias switching.
>
>In this series, we mainly want to support S1 translation type for guest.
>And it is based on nested translation, which needs a S2 domain that holds
>the GPA mappings. Besides S1 translation type, PT is also supported. Both
>the two types need a S2 domain which already holds GPA mappings. So we have
>this internal listener. Also, we want to skip RO mappings on S2, so that's
>another reason for it.  @Zhenzhong, perhaps, it can be described in the
>commit message why an internal listener is introduced.

Thanks Yi for accurate explanation, sure, will add comments for internal listener.

BRs,
Zhenzhong

>
>>
>> With this address space shift, S2 mappings can be simply captured
>> and done by vfio_memory_listener. Then, such an s2domain listener
>> would be largely redundant.
>
>hope above addressed your question.
>
>> So the second question is:
>> Does vtd have to own this iommufd_s2domain_memory_listener? IOW,
>
>yes based on the current design. when guest GPTT==PT, attach device
>to S2 hwpt, when it goes to S1, then attach it to a S1 hwpt whose
>parent is the aforementioned S2 hwpt. This S2 hwpt is always there
>for use.
>
>> does vtd_host_dma_iommu() have to return the iommu address space
>> all the time?
>
>yes, all the time.
>
>--
>Regards,
>Yi Liu

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months, 3 weeks ago


>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>host
>
>On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
>> +static const MemoryListener iommufd_s2domain_memory_listener = {
>> +    .name = "iommufd_s2domain",
>> +    .priority = 1000,
>> +    .region_add = iommufd_listener_region_add_s2domain,
>> +    .region_del = iommufd_listener_region_del_s2domain,
>> +};
>
>Would you mind elaborating When and how vtd does all S2 mappings?

When guest trigger pasid cache invalidation, vIOMMU will attach device
to stage2 page table if guest's PGTT=PT or nested page table if PGTT=Stage1.
All these page tables are dynamically created during attach. We don't use
VFIO's shadow page table. The S2 mappings are also created during attach.
See:

vtd_device_attach_iommufd()
{
...
    vtd_device_attach_container();
    container->listener = iommufd_s2domain_memory_listener;
    memory_listener_register(&container->listener, &address_space_memory);
...
}

>
>On ARM, the default vfio_memory_listener could capture the entire
>guest RAM and add to the address space. So what we do is basically
>reusing the vfio_memory_listener:
>https://lore.kernel.org/qemu-devel/20250311141045.66620-13-
>shameerali.kolothum.thodi@huawei.com/
>
>The thing is that when a VFIO device is attached to the container
>upon a nesting configuration, the ->get_address_space op should
>return the system address space as S1 nested HWPT isn't allocated
>yet. Then all the iommu as routines in vfio_listener_region_add()
>would be skipped, ending up with mapping the guest RAM in S2 HWPT
>correctly. Not until the S1 nested HWPT is allocated by the guest
>OS (after guest boots), can the ->get_address_space op return the
>iommu address space.

When S1 hwpt is allocated by guest, who will notify VFIO to call
->get_address_space op() again to get iommu address space?

>
>With this address space shift, S2 mappings can be simply captured
>and done by vfio_memory_listener. Then, such an s2domain listener
>would be largely redundant.

I didn't get how arm smmu supports switching address space, will VFIO call
->get_address_space() twice, once to get system address space and the other
for iommu address space?

>
>So the second question is:
>Does vtd have to own this iommufd_s2domain_memory_listener? IOW,
>does vtd_host_dma_iommu() have to return the iommu address space
>all the time?

Vtd only support to return a fixed address space, under the address space
there can be either system memory region or iommu memory region enabled.

>
>> +static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
>> +                              VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
>> +                              VTDPASIDEntry *pe, Error **errp)
>> +{
>> +    struct iommu_hwpt_vtd_s1 vtd;
>> +    uint32_t hwpt_id, s2_hwpt_id = s2_hwpt->hwpt_id;
>> +
>> +    vtd_init_s1_hwpt_data(&vtd, pe);
>> +
>> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>> +                                    s2_hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
>> +                                    sizeof(vtd), &vtd, &hwpt_id, errp)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    hwpt->hwpt_id = hwpt_id;
>> +
>> +    return 0;
>> +}
>> +
>> +static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
>VTDHwpt *hwpt)
>> +{
>> +    iommufd_backend_free_id(idev->iommufd, hwpt->hwpt_id);
>> +}
>
>I think you did some substantial work to isolate the get_hw_info
>part inside the iommufd backend code, which looks nice and clean
>as the vIOMMU code simply does iodc->get_cap().
>
>However, that then makes these direct raw backend function calls
>very awkward :-/
>
>In my view, the way to make sense is either:
>* We don't do any isolation, but just call raw backend functions
>  in vIOMMU code
>* We do every isolation, and never call raw backend functions in
>  vIOMMU code

Iommufd backend functions are general for all modules usage including
vIOMMU. I think we are not blocking vIOMMU calling raw backend functions.
We just provide a general interface for querying capabilities, not all capabilities
are from iommufd get_hw_info result, e.g., aw_bits.

Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Nicolin Chen 5 months, 3 weeks ago

On Thu, May 22, 2025 at 06:50:42AM +0000, Duan, Zhenzhong wrote:
> 
> 
> >-----Original Message-----
> >From: Nicolin Chen <nicolinc@nvidia.com>
> >Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
> >host
> >
> >On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
> >> +static const MemoryListener iommufd_s2domain_memory_listener = {
> >> +    .name = "iommufd_s2domain",
> >> +    .priority = 1000,
> >> +    .region_add = iommufd_listener_region_add_s2domain,
> >> +    .region_del = iommufd_listener_region_del_s2domain,
> >> +};
> >
> >Would you mind elaborating When and how vtd does all S2 mappings?
> 
> When guest trigger pasid cache invalidation, vIOMMU will attach device
> to stage2 page table if guest's PGTT=PT or nested page table if PGTT=Stage1.
> All these page tables are dynamically created during attach. We don't use
> VFIO's shadow page table. The S2 mappings are also created during attach.

OK. That I can understand.

Next question: what does VTD actually map onto the S2 page table?
The entire guest RAM? Or just a part of that?

On ARM, the VFIO listener would capture the entire RAM, and map it
on S2 page table. I wonder if VTD would do the same.

> >On ARM, the default vfio_memory_listener could capture the entire
> >guest RAM and add to the address space. So what we do is basically
> >reusing the vfio_memory_listener:
> >https://lore.kernel.org/qemu-devel/20250311141045.66620-13-
> >shameerali.kolothum.thodi@huawei.com/
> >
> >The thing is that when a VFIO device is attached to the container
> >upon a nesting configuration, the ->get_address_space op should
> >return the system address space as S1 nested HWPT isn't allocated
> >yet. Then all the iommu as routines in vfio_listener_region_add()
> >would be skipped, ending up with mapping the guest RAM in S2 HWPT
> >correctly. Not until the S1 nested HWPT is allocated by the guest
> >OS (after guest boots), can the ->get_address_space op return the
> >iommu address space.
> 
> When S1 hwpt is allocated by guest, who will notify VFIO to call
> ->get_address_space op() again to get iommu address space?

Hmm, would you please elaborate why VFIO needs to call that again?

I can see VFIO create the MAP/UNMAP notifiers for an iommu address
space. However, the device operating in the nested translation mode
should go through IOMMU HW for these two:
 - S1 page table (MAP) will be created by the guest OS
 - S1 invalidation (UNMAP) will be issued by the guest OS, and then
   trapped by QEMU to forward to the HWPT uAPI to the host kernel.

As you mentioned, there is no need of a shadow page table in this
mode. What else does VT-d need from an iommu address space?

On ARM, the only reason that we shift address space, is for KVM to
inject MSI, as it only has the gIOVA and requires the iommu address
space to translate that to gPA. Refer to kvm_arch_fixup_msi_route()
in target/arm/kvm.c where it calls pci_device_iommu_address_space.

> >With this address space shift, S2 mappings can be simply captured
> >and done by vfio_memory_listener. Then, such an s2domain listener
> >would be largely redundant.
> 
> I didn't get how arm smmu supports switching address space, will VFIO call
> ->get_address_space() twice, once to get system address space and the other
> for iommu address space?

The set_iommu_device() attaches the device to an stage2 page table
by default, indicating that the device works in the S1 passthrough
mode (for VTD, that's PGTT=PT) at VM creation. And this is where
the system address space should be returned by get_address_space().

If the guest kernel sets an S1 Translate mode for the device (for
VTD, that's PGTT=Stage1), QEMU would trap that and allocate an S1
HWPT for device to attach. Starting from here, get_address_space()
can return the iommu address space -- on ARM, we only need it for
KVM to translate MSI.

If the guest kernel sets an S1 Bypass mode for the device (for VTD,
that's PGTT=PT), the device would continue stay in the system AS,
i.e. get_address_space() wouldn't need to switch.

> >> +static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> >> +                              VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
> >> +                              VTDPASIDEntry *pe, Error **errp)
> >> +{
> >> +    struct iommu_hwpt_vtd_s1 vtd;
> >> +    uint32_t hwpt_id, s2_hwpt_id = s2_hwpt->hwpt_id;
> >> +
> >> +    vtd_init_s1_hwpt_data(&vtd, pe);
> >> +
> >> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> >> +                                    s2_hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
> >> +                                    sizeof(vtd), &vtd, &hwpt_id, errp)) {
> >> +        return -EINVAL;
> >> +    }
> >> +
> >> +    hwpt->hwpt_id = hwpt_id;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> >VTDHwpt *hwpt)
> >> +{
> >> +    iommufd_backend_free_id(idev->iommufd, hwpt->hwpt_id);
> >> +}
> >
> >I think you did some substantial work to isolate the get_hw_info
> >part inside the iommufd backend code, which looks nice and clean
> >as the vIOMMU code simply does iodc->get_cap().
> >
> >However, that then makes these direct raw backend function calls
> >very awkward :-/
> >
> >In my view, the way to make sense is either:
> >* We don't do any isolation, but just call raw backend functions
> >  in vIOMMU code
> >* We do every isolation, and never call raw backend functions in
> >  vIOMMU code
> 
> Iommufd backend functions are general for all modules usage including
> vIOMMU. I think we are not blocking vIOMMU calling raw backend functions.

Well, I am not against doing that. Calling the raw iommufd APIs is
easier :)

> We just provide a general interface for querying capabilities, not all capabilities
> are from iommufd get_hw_info result, e.g., aw_bits.

Question is why we need a general interface for get_hw_info(), yet
not for other iommufd APIs.

IIRC, Cedirc's suggested a general interface for get_hw_info so the
vIOMMU code can be unaware of the underlying source (doesn't matter
if it's from iommufd or from vfio).

I see aw_bits from QEMU command line. Mind elaborating?

Thanks
Nicolin

RE: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Duan, Zhenzhong 5 months, 3 weeks ago


>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
...
>> >I think you did some substantial work to isolate the get_hw_info
>> >part inside the iommufd backend code, which looks nice and clean
>> >as the vIOMMU code simply does iodc->get_cap().
>> >
>> >However, that then makes these direct raw backend function calls
>> >very awkward :-/
>> >
>> >In my view, the way to make sense is either:
>> >* We don't do any isolation, but just call raw backend functions
>> >  in vIOMMU code
>> >* We do every isolation, and never call raw backend functions in
>> >  vIOMMU code
>>
>> Iommufd backend functions are general for all modules usage including
>> vIOMMU. I think we are not blocking vIOMMU calling raw backend functions.
>
>Well, I am not against doing that. Calling the raw iommufd APIs is
>easier :)
>
>> We just provide a general interface for querying capabilities, not all capabilities
>> are from iommufd get_hw_info result, e.g., aw_bits.
>
>Question is why we need a general interface for get_hw_info(), yet
>not for other iommufd APIs.

Because other iommufd APIs are already abstract and hides vendor difference,
but get_hw_info() is not, it returns raw vendor data.
This makes consumer of this raw vendor data have to recognize the vendor data format.
For example, when virtio-iommu wants to know host iommu capability, it has to check
raw data from get_hw_info(), we hide this difference in the backend so virtio-iommu
could just call .get_cap(HOST_IOMMU_DEVICE_CAP_*), no matter what underlying
platform is, Vtd, smmuv3, etc..

>
>IIRC, Cedirc's suggested a general interface for get_hw_info so the
>vIOMMU code can be unaware of the underlying source (doesn't matter
>if it's from iommufd or from vfio).
>
>I see aw_bits from QEMU command line. Mind elaborating?

You mean aw_bits for virtual intel-iommu, that defines the IOVA address width of intel-iommu.
We need to check aw_bits from host intel-iommu with virtual intel-iommu's to determine
compatibility.

Thanks
Zhenzhong

Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to host

Posted by Yi Liu 5 months, 3 weeks ago

On 2025/5/23 03:29, Nicolin Chen wrote:
> On Thu, May 22, 2025 at 06:50:42AM +0000, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Nicolin Chen <nicolinc@nvidia.com>
>>> Subject: Re: [PATCH rfcv3 15/21] intel_iommu: Bind/unbind guest page table to
>>> host
>>>
>>> On Wed, May 21, 2025 at 07:14:45PM +0800, Zhenzhong Duan wrote:
>>>> +static const MemoryListener iommufd_s2domain_memory_listener = {
>>>> +    .name = "iommufd_s2domain",
>>>> +    .priority = 1000,
>>>> +    .region_add = iommufd_listener_region_add_s2domain,
>>>> +    .region_del = iommufd_listener_region_del_s2domain,
>>>> +};
>>>
>>> Would you mind elaborating When and how vtd does all S2 mappings?
>>
>> When guest trigger pasid cache invalidation, vIOMMU will attach device
>> to stage2 page table if guest's PGTT=PT or nested page table if PGTT=Stage1.
>> All these page tables are dynamically created during attach. We don't use
>> VFIO's shadow page table. The S2 mappings are also created during attach.
> 
> OK. That I can understand.
> 
> Next question: what does VTD actually map onto the S2 page table?
> The entire guest RAM? Or just a part of that?
> 
> On ARM, the VFIO listener would capture the entire RAM, and map it
> on S2 page table. I wonder if VTD would do the same.
> 
>>> On ARM, the default vfio_memory_listener could capture the entire
>>> guest RAM and add to the address space. So what we do is basically
>>> reusing the vfio_memory_listener:
>>> https://lore.kernel.org/qemu-devel/20250311141045.66620-13-
>>> shameerali.kolothum.thodi@huawei.com/
>>>
>>> The thing is that when a VFIO device is attached to the container
>>> upon a nesting configuration, the ->get_address_space op should
>>> return the system address space as S1 nested HWPT isn't allocated
>>> yet. Then all the iommu as routines in vfio_listener_region_add()
>>> would be skipped, ending up with mapping the guest RAM in S2 HWPT
>>> correctly. Not until the S1 nested HWPT is allocated by the guest
>>> OS (after guest boots), can the ->get_address_space op return the
>>> iommu address space.
>>
>> When S1 hwpt is allocated by guest, who will notify VFIO to call
>> ->get_address_space op() again to get iommu address space?
> 
> Hmm, would you please elaborate why VFIO needs to call that again?
> 
> I can see VFIO create the MAP/UNMAP notifiers for an iommu address
> space. However, the device operating in the nested translation mode
> should go through IOMMU HW for these two:
>   - S1 page table (MAP) will be created by the guest OS
>   - S1 invalidation (UNMAP) will be issued by the guest OS, and then
>     trapped by QEMU to forward to the HWPT uAPI to the host kernel.
> 
> As you mentioned, there is no need of a shadow page table in this
> mode. What else does VT-d need from an iommu address space?
> 
> On ARM, the only reason that we shift address space, is for KVM to
> inject MSI, as it only has the gIOVA and requires the iommu address
> space to translate that to gPA. Refer to kvm_arch_fixup_msi_route()
> in target/arm/kvm.c where it calls pci_device_iommu_address_space.
> 
>>> With this address space shift, S2 mappings can be simply captured
>>> and done by vfio_memory_listener. Then, such an s2domain listener
>>> would be largely redundant.
>>
>> I didn't get how arm smmu supports switching address space, will VFIO call
>> ->get_address_space() twice, once to get system address space and the other
>> for iommu address space?
> 
> The set_iommu_device() attaches the device to an stage2 page table

hmmm. I'm not sure if this is accurate. I think this set_iommu_device
just acts as setting some handle for this particular device to the vIOMMU
side. It has not idea about what address space nor page table.

> by default, indicating that the device works in the S1 passthrough
> mode (for VTD, that's PGTT=PT) at VM creation. And this is where
> the system address space should be returned by get_address_space().
> 
> If the guest kernel sets an S1 Translate mode for the device (for
> VTD, that's PGTT=Stage1), QEMU would trap that and allocate an S1
> HWPT for device to attach. Starting from here, get_address_space()
> can return the iommu address space -- on ARM, we only need it for
> KVM to translate MSI.
> 

refer to the last reply. This seems to be different between ARM and VT-d
emulation.

-- 
Regards,
Yi Liu