[PATCH rfcv1 10/23] intel_iommu: bind/unbind guest page table to host

Zhenzhong Duan posted 23 patches 10 months, 2 weeks ago
[PATCH rfcv1 10/23] intel_iommu: bind/unbind guest page table to host
Posted by Zhenzhong Duan 10 months, 2 weeks ago
This captures the guest PASID table entry modifications and
propagates the changes to host to attach a hwpt with type determined
per guest PGTT configuration.

When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
the hwpt on host side is a nested page table.

The guest page table is configured as stage-1 page table (gIOVA->GPA)
whose translation result would further go through host VT-d stage-2
page table(GPA->HPA) under nested translation mode. This is the key
to support gIOVA over stage-1 page table for Intel VT-d in
virtualization environment.

Stage-2 page table could be shared by different devices if there is
no conflict and devices link to same iommufd object, i.e. devices
under same host IOMMU can share same stage-2 page table. If there
is conflict, i.e. there is one device under non cache coherency
mode which is different from others, it requires a separate
stage-2 page table in non-CC mode.

See below example diagram:

      IntelIOMMUState
             |
             V
    .------------------.    .------------------.
    | VTDIOASContainer |--->| VTDIOASContainer |--->...
    |    (iommufd0)    |    |    (iommufd1)    |
    .------------------.    .------------------.
             |                       |
             |                       .-->...
             V
      .-------------------.    .-------------------.
      |   VTDS2Hwpt(CC)   |--->| VTDS2Hwpt(non-CC) |-->...
      .-------------------.    .-------------------.
          |            |               |
          |            |               |
    .-----------.  .-----------.  .------------.
    | IOMMUFD   |  | IOMMUFD   |  | IOMMUFD    |
    | Device(CC)|  | Device(CC)|  | Device     |
    | (iommufd0)|  | (iommufd0)|  | (non-CC)   |
    |           |  |           |  | (iommufd0) |
    .-----------.  .-----------.  .------------.

Co-Authored-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_internal.h |  16 +
 include/hw/i386/intel_iommu.h  |  30 ++
 hw/i386/intel_iommu.c          | 641 ++++++++++++++++++++++++++++++++-
 hw/i386/trace-events           |   8 +
 4 files changed, 677 insertions(+), 18 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 16dc712e94..e33c9f54b5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -199,6 +199,7 @@
 #define VTD_ECAP_SMTS               (1ULL << 43)
 #define VTD_ECAP_SLTS               (1ULL << 46)
 #define VTD_ECAP_FLTS               (1ULL << 47)
+#define VTD_ECAP_RPS                (1ULL << 49)
 
 #define VTD_ECAP_MASK               (VTD_ECAP_SRS | VTD_ECAP_EAFS)
 #define VTD_GET_PSS(val)            (((val) >> 35) & 0x1f)
@@ -518,6 +519,14 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
 
+enum VTDPASIDOp {
+    VTD_PASID_BIND,
+    VTD_PASID_UPDATE,
+    VTD_PASID_UNBIND,
+    VTD_OP_NUM
+};
+typedef enum VTDPASIDOp VTDPASIDOp;
+
 typedef enum VTDPCInvType {
     /* force reset all */
     VTD_PASID_CACHE_FORCE_RESET = 0,
@@ -533,6 +542,7 @@ struct VTDPASIDCacheInfo {
     uint32_t pasid;
     PCIBus *bus;
     uint16_t devfn;
+    bool error_happened;
 };
 typedef struct VTDPASIDCacheInfo VTDPASIDCacheInfo;
 
@@ -560,6 +570,12 @@ typedef struct VTDPASIDCacheInfo VTDPASIDCacheInfo;
 #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width */
 #define VTD_SM_PASID_ENTRY_DID(val)    ((val) & VTD_DOMAIN_ID_MASK)
 
+#define VTD_SM_PASID_ENTRY_FLPM          3ULL
+#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_SRE_BIT(val)  (!!((val) & 1ULL))
+#define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
+#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
+
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
 
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index c7b707a3d5..d3122cf699 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -65,6 +65,9 @@ typedef struct VTDPASIDEntry VTDPASIDEntry;
 typedef struct VTDIOMMUFDDevice VTDIOMMUFDDevice;
 typedef struct VTDPASIDCacheEntry VTDPASIDCacheEntry;
 typedef struct VTDPASIDAddressSpace VTDPASIDAddressSpace;
+typedef struct VTDHwpt VTDHwpt;
+typedef struct VTDIOASContainer VTDIOASContainer;
+typedef struct VTDS2Hwpt VTDS2Hwpt;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -102,14 +105,37 @@ struct pasid_key {
     uint16_t sid;
 };
 
+struct VTDIOASContainer {
+    IOMMUFDBackend *iommufd;
+    uint32_t ioas_id;
+    MemoryListener listener;
+    QLIST_HEAD(, VTDS2Hwpt) s2_hwpt_list;
+    QLIST_ENTRY(VTDIOASContainer) next;
+    Error *error;
+};
+
+struct VTDS2Hwpt {
+    uint32_t users;
+    uint32_t hwpt_id;
+    VTDIOASContainer *container;
+    QLIST_ENTRY(VTDS2Hwpt) next;
+};
+
+struct VTDHwpt {
+    uint32_t hwpt_id;
+    VTDS2Hwpt *s2_hwpt;
+};
+
 struct VTDPASIDCacheEntry {
     struct VTDPASIDEntry pasid_entry;
+    bool cache_filled;
 };
 
 struct VTDPASIDAddressSpace {
     PCIBus *bus;
     uint8_t devfn;
     uint32_t pasid;
+    VTDHwpt hwpt;
     IntelIOMMUState *iommu_state;
     VTDContextCacheEntry context_cache_entry;
     QLIST_ENTRY(VTDPASIDAddressSpace) next;
@@ -330,8 +356,12 @@ struct IntelIOMMUState {
     /* list of VTDIOMMUFDDevices */
     QLIST_HEAD(, VTDIOMMUFDDevice) vtd_idev_list;
 
+    QLIST_HEAD(, VTDIOASContainer) containers;
+
     GHashTable *vtd_iommufd_dev;             /* VTDIOMMUFDDevice */
 
+    VTDHwpt *s2_hwpt;
+
     /* interrupt remapping */
     bool intr_enabled;              /* Whether guest enabled IR */
     dma_addr_t intr_root;           /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a1a1f23246..df93fcacd8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -40,6 +40,7 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 #include "qemu/jhash.h"
+#include "sysemu/iommufd.h"
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -771,6 +772,24 @@ static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
     return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
 }
 
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+    return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
+}
+
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+    return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+}
+
+static inline void pasid_cache_info_set_error(VTDPASIDCacheInfo *pc_info)
+{
+    if (pc_info->error_happened) {
+        return;
+    }
+    pc_info->error_happened = true;
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
     return pdire->val & 1;
@@ -1631,6 +1650,17 @@ static int vtd_address_space_sync(VTDAddressSpace *vtd_as)
     return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
 }
 
+static bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
+}
+
+/* check if pgtt is first stage translation */
+static bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
+}
+
 /*
  * Check if specific device is configured to bypass address
  * translation for DMA requests. In Scalable Mode, bypass
@@ -1652,7 +1682,7 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
              */
             return false;
         }
-        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+        return vtd_pe_pgtt_is_pt(&pe);
     }
 
     return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
@@ -2091,6 +2121,543 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
     vtd_iommu_replay_all(s);
 }
 
+static bool iommufd_listener_skipped_section(MemoryRegionSection *section)
+{
+    return !memory_region_is_ram(section->mr) ||
+           memory_region_is_protected(section->mr) ||
+           /*
+            * Sizing an enabled 64-bit BAR can cause spurious mappings to
+            * addresses in the upper part of the 64-bit address space.  These
+            * are never accessed by the CPU and beyond the address width of
+            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
+            */
+           section->offset_within_address_space & (1ULL << 63);
+}
+
+static void iommufd_listener_region_add_s2domain(MemoryListener *listener,
+                                                 MemoryRegionSection *section)
+{
+    VTDIOASContainer *container = container_of(listener,
+                                               VTDIOASContainer, listener);
+    IOMMUFDBackend *iommufd = container->iommufd;
+    uint32_t ioas_id = container->ioas_id;
+    hwaddr iova;
+    Int128 llend, llsize;
+    void *vaddr;
+    Error *err = NULL;
+    int ret;
+
+    if (iommufd_listener_skipped_section(section)) {
+        return;
+    }
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+    llsize = int128_sub(llend, int128_make64(iova));
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    memory_region_ref(section->mr);
+
+    ret = iommufd_backend_map_dma(iommufd, ioas_id, iova, int128_get64(llsize),
+                                  vaddr, section->readonly);
+    if (!ret) {
+        return;
+    }
+
+    error_setg(&err,
+               "iommufd_listener_region_add_s2domain(%p, 0x%"HWADDR_PRIx", "
+               "0x%"HWADDR_PRIx", %p) = %d (%s)",
+               container, iova, int128_get64(llsize), vaddr, ret,
+               strerror(-ret));
+
+    if (memory_region_is_ram_device(section->mr)) {
+        /* Allow unexpected mappings not to be fatal for RAM devices */
+        error_report_err(err);
+        return;
+    }
+
+    if (!container->error) {
+        error_propagate_prepend(&container->error, err, "Region %s: ",
+                                memory_region_name(section->mr));
+    } else {
+        error_free(err);
+    }
+}
+
+static void iommufd_listener_region_del_s2domain(MemoryListener *listener,
+                                                 MemoryRegionSection *section)
+{
+    VTDIOASContainer *container = container_of(listener,
+                                               VTDIOASContainer, listener);
+    IOMMUFDBackend *iommufd = container->iommufd;
+    uint32_t ioas_id = container->ioas_id;
+    hwaddr iova;
+    Int128 llend, llsize;
+    int ret;
+
+    if (iommufd_listener_skipped_section(section)) {
+        return;
+    }
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+    llsize = int128_sub(llend, int128_make64(iova));
+
+    ret = iommufd_backend_unmap_dma(iommufd, ioas_id,
+                                    iova, int128_get64(llsize));
+    if (ret) {
+        error_report("iommufd_listener_region_del_s2domain(%p, "
+                     "0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx") = %d (%s)",
+                     container, iova, int128_get64(llsize), ret,
+                     strerror(-ret));
+    }
+
+    memory_region_unref(section->mr);
+}
+
+static const MemoryListener iommufd_s2domain_memory_listener = {
+    .name = "iommufd_s2domain",
+    .priority = 1000,
+    .region_add = iommufd_listener_region_add_s2domain,
+    .region_del = iommufd_listener_region_del_s2domain,
+};
+
+static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
+                                  VTDPASIDEntry *pe)
+{
+    memset(vtd, 0, sizeof(*vtd));
+
+    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
+                                        IOMMU_VTD_S1_SRE : 0) |
+                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe->val[2]) ?
+                                        IOMMU_VTD_S1_WPE : 0) |
+                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
+                                        IOMMU_VTD_S1_EAFE : 0);
+    vtd->addr_width = vtd_pe_get_fl_aw(pe);
+    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
+}
+
+static int vtd_create_s1_hwpt(IOMMUFDDevice *idev,
+                              VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
+                              VTDPASIDEntry *pe, Error **errp)
+{
+    struct iommu_hwpt_vtd_s1 vtd;
+    uint32_t hwpt_id, s2_hwpt_id = s2_hwpt->hwpt_id;
+    int ret;
+
+    vtd_init_s1_hwpt_data(&vtd, pe);
+
+    ret = iommufd_backend_alloc_hwpt(idev->iommufd, idev->dev_id,
+                                     s2_hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
+                                     sizeof(vtd), &vtd, &hwpt_id);
+    if (ret) {
+        error_setg(errp, "Failed to allocate stage-1 page table, dev_id %d",
+                   idev->dev_id);
+        return ret;
+    }
+
+    hwpt->hwpt_id = hwpt_id;
+
+    return 0;
+}
+
+static void vtd_destroy_s1_hwpt(IOMMUFDDevice *idev, VTDHwpt *hwpt)
+{
+    iommufd_backend_free_id(idev->iommufd, hwpt->hwpt_id);
+}
+
+static VTDS2Hwpt *vtd_ioas_container_get_s2_hwpt(VTDIOASContainer *container,
+                                                 uint32_t hwpt_id)
+{
+    VTDS2Hwpt *s2_hwpt;
+
+    QLIST_FOREACH(s2_hwpt, &container->s2_hwpt_list, next) {
+        if (s2_hwpt->hwpt_id == hwpt_id) {
+            return s2_hwpt;
+        }
+    }
+
+    s2_hwpt = g_malloc0(sizeof(*s2_hwpt));
+
+    s2_hwpt->hwpt_id = hwpt_id;
+    s2_hwpt->container = container;
+    QLIST_INSERT_HEAD(&container->s2_hwpt_list, s2_hwpt, next);
+
+    return s2_hwpt;
+}
+
+static void vtd_ioas_container_put_s2_hwpt(VTDS2Hwpt *s2_hwpt)
+{
+    VTDIOASContainer *container = s2_hwpt->container;
+
+    if (s2_hwpt->users) {
+        return;
+    }
+
+    QLIST_REMOVE(s2_hwpt, next);
+    iommufd_backend_free_id(container->iommufd, s2_hwpt->hwpt_id);
+    g_free(s2_hwpt);
+}
+
+static void vtd_ioas_container_destroy(VTDIOASContainer *container)
+{
+    if (!QLIST_EMPTY(&container->s2_hwpt_list)) {
+        return;
+    }
+
+    QLIST_REMOVE(container, next);
+    memory_listener_unregister(&container->listener);
+    iommufd_backend_free_id(container->iommufd, container->ioas_id);
+    g_free(container);
+}
+
+static int vtd_device_attach_hwpt(VTDIOMMUFDDevice *vtd_idev,
+                                  uint32_t rid_pasid, VTDPASIDEntry *pe,
+                                  VTDS2Hwpt *s2_hwpt, VTDHwpt *hwpt,
+                                  Error **errp)
+{
+    IOMMUFDDevice *idev = vtd_idev->idev;
+    int ret;
+
+    if (vtd_pe_pgtt_is_flt(pe)) {
+        ret = vtd_create_s1_hwpt(vtd_idev->idev, s2_hwpt,
+                                 hwpt, pe, errp);
+        if (ret) {
+            return ret;
+        }
+    } else {
+        hwpt->hwpt_id = s2_hwpt->hwpt_id;
+    }
+
+    ret = iommufd_device_attach_hwpt(idev, hwpt->hwpt_id);
+    trace_vtd_device_attach_hwpt(idev->dev_id, rid_pasid, hwpt->hwpt_id, ret);
+    if (ret) {
+        if (vtd_pe_pgtt_is_flt(pe)) {
+            vtd_destroy_s1_hwpt(idev, hwpt);
+        }
+        hwpt->hwpt_id = 0;
+        error_setg(errp, "dev_id %d pasid %d failed to attach hwpt %d",
+                   idev->dev_id, rid_pasid, hwpt->hwpt_id);
+        return ret;
+    }
+
+    s2_hwpt->users++;
+    hwpt->s2_hwpt = s2_hwpt;
+
+    return 0;
+}
+
+static void vtd_device_detach_hwpt(VTDIOMMUFDDevice *vtd_idev,
+                                   uint32_t rid_pasid, VTDPASIDEntry *pe,
+                                   VTDHwpt *hwpt, Error **errp)
+{
+    IOMMUFDDevice *idev = vtd_idev->idev;
+    int ret;
+
+    if (vtd_idev->iommu_state->dmar_enabled) {
+        ret = iommufd_device_detach_hwpt(idev);
+        trace_vtd_device_detach_hwpt(idev->dev_id, rid_pasid, ret);
+    } else {
+        ret = iommufd_device_attach_hwpt(idev, idev->ioas_id);
+        trace_vtd_device_reattach_def_ioas(idev->dev_id, rid_pasid,
+                                           idev->ioas_id, ret);
+    }
+
+    if (ret) {
+        error_setg(errp, "dev_id %d pasid %d failed to attach hwpt %d",
+                   idev->dev_id, rid_pasid, hwpt->hwpt_id);
+    }
+
+    if (vtd_pe_pgtt_is_flt(pe)) {
+        vtd_destroy_s1_hwpt(idev, hwpt);
+    }
+
+    hwpt->s2_hwpt->users--;
+    hwpt->s2_hwpt = NULL;
+    hwpt->hwpt_id = 0;
+}
+
+static int vtd_device_attach_container(VTDIOMMUFDDevice *vtd_idev,
+                                       VTDIOASContainer *container,
+                                       uint32_t rid_pasid,
+                                       VTDPASIDEntry *pe,
+                                       VTDHwpt *hwpt,
+                                       Error **errp)
+{
+    IOMMUFDDevice *idev = vtd_idev->idev;
+    IOMMUFDBackend *iommufd = idev->iommufd;
+    VTDS2Hwpt *s2_hwpt;
+    uint32_t s2_hwpt_id;
+    Error *err = NULL;
+    int ret;
+
+    /* try to attach to an existing hwpt in this container */
+    QLIST_FOREACH(s2_hwpt, &container->s2_hwpt_list, next) {
+        ret = vtd_device_attach_hwpt(vtd_idev, rid_pasid, pe,
+                                     s2_hwpt, hwpt, &err);
+        if (ret) {
+            const char *msg = error_get_pretty(err);
+
+            trace_vtd_device_fail_attach_existing_hwpt(msg);
+            error_free(err);
+            err = NULL;
+        } else {
+            goto found_hwpt;
+        }
+    }
+
+    ret = iommufd_backend_alloc_hwpt(iommufd, idev->dev_id,
+                                     container->ioas_id,
+                                     IOMMU_HWPT_ALLOC_NEST_PARENT,
+                                     IOMMU_HWPT_DATA_NONE,
+                                     0, NULL, &s2_hwpt_id);
+    if (ret) {
+        error_setg_errno(errp, errno, "error alloc parent hwpt");
+        return ret;
+    }
+
+    s2_hwpt = vtd_ioas_container_get_s2_hwpt(container, s2_hwpt_id);
+
+    /* Attach vtd device to a new allocated hwpt within iommufd */
+    ret = vtd_device_attach_hwpt(vtd_idev, rid_pasid, pe, s2_hwpt, hwpt, &err);
+    if (ret) {
+        goto err_attach_hwpt;
+    }
+
+found_hwpt:
+    trace_vtd_device_attach_container(iommufd->fd, idev->dev_id, rid_pasid,
+                                      container->ioas_id, hwpt->hwpt_id);
+    return 0;
+
+err_attach_hwpt:
+    vtd_ioas_container_put_s2_hwpt(s2_hwpt);
+    return ret;
+}
+
+static void vtd_device_detach_container(VTDIOMMUFDDevice *vtd_idev,
+                                        uint32_t rid_pasid,
+                                        VTDPASIDEntry *pe,
+                                        VTDHwpt *hwpt,
+                                        Error **errp)
+{
+    IOMMUFDDevice *idev = vtd_idev->idev;
+    IOMMUFDBackend *iommufd = idev->iommufd;
+    VTDS2Hwpt *s2_hwpt = hwpt->s2_hwpt;
+
+    trace_vtd_device_detach_container(iommufd->fd, idev->dev_id, rid_pasid);
+    vtd_device_detach_hwpt(vtd_idev, rid_pasid, pe, hwpt, errp);
+    vtd_ioas_container_put_s2_hwpt(s2_hwpt);
+}
+
+static int vtd_device_attach_iommufd(VTDIOMMUFDDevice *vtd_idev,
+                                     uint32_t rid_pasid,
+                                     VTDPASIDEntry *pe,
+                                     VTDHwpt *hwpt,
+                                     Error **errp)
+{
+    IntelIOMMUState *s = vtd_idev->iommu_state;
+    VTDIOASContainer *container;
+    IOMMUFDBackend *iommufd = vtd_idev->idev->iommufd;
+    Error *err = NULL;
+    uint32_t ioas_id;
+    int ret;
+
+    /* try to attach to an existing container in this space */
+    QLIST_FOREACH(container, &s->containers, next) {
+        if (container->iommufd != iommufd) {
+            continue;
+        }
+
+        if (vtd_device_attach_container(vtd_idev, container,
+                                        rid_pasid, pe, hwpt, &err)) {
+            const char *msg = error_get_pretty(err);
+
+            trace_vtd_device_fail_attach_existing_container(msg);
+            error_free(err);
+            err = NULL;
+        } else {
+            return 0;
+        }
+    }
+
+    /* Need to allocate a new dedicated container */
+    ret = iommufd_backend_alloc_ioas(iommufd, &ioas_id, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    trace_vtd_device_alloc_ioas(iommufd->fd, ioas_id);
+
+    container = g_malloc0(sizeof(*container));
+    container->iommufd = iommufd;
+    container->ioas_id = ioas_id;
+    QLIST_INIT(&container->s2_hwpt_list);
+
+    if (vtd_device_attach_container(vtd_idev, container,
+                                    rid_pasid, pe, hwpt, errp)) {
+        goto err_attach_container;
+    }
+
+    container->listener = iommufd_s2domain_memory_listener;
+    memory_listener_register(&container->listener, &address_space_memory);
+
+    if (container->error) {
+        ret = -1;
+        error_propagate_prepend(errp, container->error,
+                                "memory listener initialization failed: ");
+        goto err_listener_register;
+    }
+
+    QLIST_INSERT_HEAD(&s->containers, container, next);
+
+    return 0;
+
+err_listener_register:
+    vtd_device_detach_container(vtd_idev, rid_pasid, pe, hwpt, errp);
+err_attach_container:
+    iommufd_backend_free_id(iommufd, container->ioas_id);
+    g_free(container);
+    return ret;
+}
+
+static void vtd_device_detach_iommufd(VTDIOMMUFDDevice *vtd_idev,
+                                      uint32_t rid_pasid,
+                                      VTDPASIDEntry *pe,
+                                      VTDHwpt *hwpt,
+                                      Error **errp)
+{
+    VTDIOASContainer *container = hwpt->s2_hwpt->container;
+
+    vtd_device_detach_container(vtd_idev, rid_pasid, pe, hwpt, errp);
+    vtd_ioas_container_destroy(container);
+}
+
+static int vtd_device_attach_pgtbl(VTDIOMMUFDDevice *vtd_idev,
+                                   VTDPASIDEntry *pe,
+                                   VTDPASIDAddressSpace *vtd_pasid_as,
+                                   uint32_t rid_pasid)
+{
+    /*
+     * If pe->gptt != FLT, should be go ahead to do bind as host only
+     * accepts guest FLT under nesting. If pe->pgtt==PT, should setup
+     * the pasid with GPA page table. Otherwise should return failure.
+     */
+    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
+        return -EINVAL;
+    }
+
+    /* Should fail if the FLPT base is 0 */
+    if (vtd_pe_pgtt_is_flt(pe) && !vtd_pe_get_flpt_base(pe)) {
+        return -EINVAL;
+    }
+
+    return vtd_device_attach_iommufd(vtd_idev, rid_pasid, pe,
+                                     &vtd_pasid_as->hwpt, &error_abort);
+}
+
+static int vtd_device_detach_pgtbl(VTDIOMMUFDDevice *vtd_idev,
+                                  VTDPASIDAddressSpace *vtd_pasid_as,
+                                  uint32_t rid_pasid)
+{
+    VTDPASIDEntry *cached_pe = vtd_pasid_as->pasid_cache_entry.cache_filled ?
+                       &vtd_pasid_as->pasid_cache_entry.pasid_entry : NULL;
+
+    if (!cached_pe ||
+        (!vtd_pe_pgtt_is_flt(cached_pe) && !vtd_pe_pgtt_is_pt(cached_pe))) {
+        return 0;
+    }
+
+    vtd_device_detach_iommufd(vtd_idev, rid_pasid, cached_pe,
+                              &vtd_pasid_as->hwpt, &error_abort);
+
+    return 0;
+}
+
+static int vtd_dev_get_rid2pasid(IntelIOMMUState *s, uint8_t bus_num,
+                                 uint8_t devfn, uint32_t *rid_pasid)
+{
+    VTDContextEntry ce;
+    int ret;
+
+    /*
+     * Currently, ECAP.RPS bit is likely to be reported as "Clear".
+     * And per VT-d 3.1 spec, it will use PASID #0 as RID2PASID when
+     * RPS bit is reported as "Clear".
+     */
+    if (likely(!(s->ecap & VTD_ECAP_RPS))) {
+        *rid_pasid = 0;
+        return 0;
+    }
+
+    /*
+     * In future, to improve performance, could try to fetch context
+     * entry from cache firstly.
+     */
+    ret = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (!ret) {
+        *rid_pasid = VTD_CE_GET_RID2PASID(&ce);
+    }
+
+    return ret;
+}
+
+/**
+ * Caller should hold iommu_lock.
+ */
+static int vtd_bind_guest_pasid(VTDPASIDAddressSpace *vtd_pasid_as,
+                                VTDPASIDEntry *pe, VTDPASIDOp op)
+{
+    IntelIOMMUState *s = vtd_pasid_as->iommu_state;
+    VTDIOMMUFDDevice *vtd_idev;
+    uint32_t rid_pasid;
+    int devfn = vtd_pasid_as->devfn;
+    int ret = -EINVAL;
+    struct vtd_as_key key = {
+        .bus = vtd_pasid_as->bus,
+        .devfn = devfn,
+    };
+
+    vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key);
+    if (!vtd_idev || !vtd_idev->idev) {
+        /* means no need to go further, e.g. for emulated devices */
+        return 0;
+    }
+
+    if (vtd_dev_get_rid2pasid(s, pci_bus_num(vtd_pasid_as->bus),
+                              devfn, &rid_pasid)) {
+        error_report("Unable to get rid_pasid for devfn: %d!", devfn);
+        return ret;
+    }
+
+    if (vtd_pasid_as->pasid != rid_pasid) {
+        error_report("Non-rid_pasid %d not supported yet", vtd_pasid_as->pasid);
+        return ret;
+    }
+
+    switch (op) {
+    case VTD_PASID_UPDATE:
+    case VTD_PASID_BIND:
+    {
+        ret = vtd_device_attach_pgtbl(vtd_idev, pe, vtd_pasid_as, rid_pasid);
+        break;
+    }
+    case VTD_PASID_UNBIND:
+    {
+        ret = vtd_device_detach_pgtbl(vtd_idev, vtd_pasid_as, rid_pasid);
+        break;
+    }
+    default:
+        error_report_once("Unknown VTDPASIDOp!!!\n");
+        break;
+    }
+
+    return ret;
+}
+
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field after shifting
  */
@@ -2717,22 +3284,30 @@ static bool vtd_pasid_entry_compare(VTDPASIDEntry *p1, VTDPASIDEntry *p2)
  * This function fills in the pasid entry in &vtd_pasid_as. Caller
  * of this function should hold iommu_lock.
  */
-static void vtd_fill_pe_in_cache(IntelIOMMUState *s,
-                                 VTDPASIDAddressSpace *vtd_pasid_as,
-                                 VTDPASIDEntry *pe)
+static int vtd_fill_pe_in_cache(IntelIOMMUState *s,
+                                VTDPASIDAddressSpace *vtd_pasid_as,
+                                VTDPASIDEntry *pe)
 {
     VTDPASIDCacheEntry *pc_entry = &vtd_pasid_as->pasid_cache_entry;
+    int ret;
 
-    if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
-        /* No need to go further as cached pasid entry is latest */
-        return;
+    if (pc_entry->cache_filled) {
+        if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
+            /* No need to go further as cached pasid entry is latest */
+            return 0;
+        }
+        ret = vtd_bind_guest_pasid(vtd_pasid_as,
+                                   pe, VTD_PASID_UPDATE);
+    } else {
+        ret = vtd_bind_guest_pasid(vtd_pasid_as,
+                                   pe, VTD_PASID_BIND);
     }
 
-    pc_entry->pasid_entry = *pe;
-    /*
-     * TODO:
-     * - send pasid bind to host for passthru devices
-     */
+    if (!ret) {
+        pc_entry->pasid_entry = *pe;
+        pc_entry->cache_filled = true;
+    }
+    return ret;
 }
 
 /*
@@ -2795,7 +3370,11 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer value,
         goto remove;
     }
 
-    vtd_fill_pe_in_cache(s, vtd_pasid_as, &pe);
+    if (vtd_fill_pe_in_cache(s, vtd_pasid_as, &pe)) {
+        pasid_cache_info_set_error(pc_info);
+        return true;
+    }
+
     /*
      * TODO:
      * - when pasid-base-iotlb(piotlb) infrastructure is ready,
@@ -2805,10 +3384,14 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer value,
 remove:
     /*
      * TODO:
-     * - send pasid bind to host for passthru devices
      * - when pasid-base-iotlb(piotlb) infrastructure is ready,
      *   should invalidate QEMU piotlb togehter with this change.
      */
+    if (vtd_bind_guest_pasid(vtd_pasid_as,
+                             NULL, VTD_PASID_UNBIND)) {
+        pasid_cache_info_set_error(pc_info);
+    }
+
     return true;
 }
 
@@ -2854,6 +3437,22 @@ static VTDPASIDAddressSpace *vtd_add_find_pasid_as(IntelIOMMUState *s,
     return vtd_pasid_as;
 }
 
+/* Caller of this function should hold iommu_lock. */
+static void vtd_remove_pasid_as(VTDPASIDAddressSpace *vtd_pasid_as)
+{
+    IntelIOMMUState *s = vtd_pasid_as->iommu_state;
+    PCIBus *bus = vtd_pasid_as->bus;
+    struct pasid_key key;
+    int devfn = vtd_pasid_as->devfn;
+    uint32_t pasid = vtd_pasid_as->pasid;
+    uint16_t sid;
+
+    sid = PCI_BUILD_BDF(pci_bus_num(bus), devfn);
+    vtd_init_pasid_key(pasid, sid, &key);
+
+    g_hash_table_remove(s->vtd_pasid_as, &key);
+}
+
 /* Caller of this function should hold iommu_lock. */
 static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
                                         dma_addr_t pt_base,
@@ -2884,7 +3483,10 @@ static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
                 pasid = pasid_next;
                 continue;
             }
-            vtd_fill_pe_in_cache(s, vtd_pasid_as, &pe);
+            if (vtd_fill_pe_in_cache(s, vtd_pasid_as, &pe)) {
+                vtd_remove_pasid_as(vtd_pasid_as);
+                pasid_cache_info_set_error(info);
+            }
         }
         pasid = pasid_next;
     }
@@ -2991,6 +3593,9 @@ static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
         walk_info.devfn = vtd_idev->devfn;
         vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
     }
+    if (walk_info.error_happened) {
+        pasid_cache_info_set_error(pc_info);
+    }
 }
 
 /*
@@ -3060,7 +3665,7 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
 /* Caller of this function should hold iommu_lock */
 static void vtd_pasid_cache_reset(IntelIOMMUState *s)
 {
-    VTDPASIDCacheInfo pc_info;
+    VTDPASIDCacheInfo pc_info = { .error_happened = false, };
 
     trace_vtd_pasid_cache_reset();
 
@@ -3082,9 +3687,9 @@ static void vtd_pasid_cache_reset(IntelIOMMUState *s)
 static bool vtd_process_pasid_desc(IntelIOMMUState *s,
                                    VTDInvDesc *inv_desc)
 {
+    VTDPASIDCacheInfo pc_info = { .error_happened = false, };
     uint16_t domain_id;
     uint32_t pasid;
-    VTDPASIDCacheInfo pc_info;
 
     if ((inv_desc->val[0] & VTD_INV_DESC_PASIDC_RSVD_VAL0) ||
         (inv_desc->val[1] & VTD_INV_DESC_PASIDC_RSVD_VAL1) ||
@@ -3125,7 +3730,7 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
     }
 
     vtd_pasid_cache_sync(s, &pc_info);
-    return true;
+    return !pc_info.error_happened ? true : false;
 }
 
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 91d6c400b4..17e7191696 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -72,6 +72,14 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low
 vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
 vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
 vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
+vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
+vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d pasid %d ret: %d"
+vtd_device_reattach_def_ioas(uint32_t dev_id, uint32_t pasid, uint32_t ioas_id, int ret) "dev_id %d pasid %d ioas_id %d, ret: %d"
+vtd_device_fail_attach_existing_hwpt(const char *msg) " %s"
+vtd_device_attach_container(int fd, uint32_t dev_id, uint32_t pasid, uint32_t ioas_id, uint32_t hwpt_id) "iommufd %d dev_id %d pasid %d ioas_id %d hwpt_id %d"
+vtd_device_detach_container(int fd, uint32_t dev_id, uint32_t pasid) "iommufd %d dev_id %d pasid %d"
+vtd_device_fail_attach_existing_container(const char *msg) " %s"
+vtd_device_alloc_ioas(int fd, uint32_t ioas_id) "iommufd %d ioas_id %d"
 
 # amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
-- 
2.34.1