This captures the guest PASID table entry modifications and propagates
the changes to host to attach a hwpt with type determined per guest IOMMU
PGTT configuration.
When PGTT=PT, attach PASID_0 to a second stage HWPT(GPA->HPA).
When PGTT=FST, attach PASID_0 to nested HWPT with nesting parent HWPT
coming from VFIO.
Co-Authored-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
hw/i386/intel_iommu_accel.h | 6 ++
include/hw/i386/intel_iommu.h | 1 +
hw/i386/intel_iommu.c | 22 ++++++-
hw/i386/intel_iommu_accel.c | 114 ++++++++++++++++++++++++++++++++++
hw/i386/trace-events | 3 +
5 files changed, 143 insertions(+), 3 deletions(-)
diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h
index dbe6ee6982..1a396c50a0 100644
--- a/hw/i386/intel_iommu_accel.h
+++ b/hw/i386/intel_iommu_accel.h
@@ -16,6 +16,7 @@
bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod,
Error **errp);
VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as);
+bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp);
#else
static inline bool vtd_check_hiod_accel(IntelIOMMUState *s,
VTDHostIOMMUDevice *vtd_hiod,
@@ -30,5 +31,10 @@ static inline VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
{
return NULL;
}
+
+static inline bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
+{
+ return true;
+}
#endif
#endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 401322665a..8ce8fe1b75 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -104,6 +104,7 @@ struct VTDAddressSpace {
PCIBus *bus;
uint8_t devfn;
uint32_t pasid;
+ uint32_t fs_hwpt;
AddressSpace as;
IOMMUMemoryRegion iommu;
MemoryRegion root; /* The root container of the device */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 513b2c85d4..36449bf161 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -88,7 +88,11 @@ static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s)
g_hash_table_iter_init(&as_it, s->vtd_address_spaces);
while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) {
VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
- pc_entry->valid = false;
+ if (pc_entry->valid) {
+ pc_entry->valid = false;
+ /* It's fatal to get failure during reset */
+ vtd_bind_guest_pasid(vtd_as, &error_fatal);
+ }
}
}
@@ -3074,6 +3078,8 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
VTDPASIDEntry pe;
IOMMUNotifier *n;
uint16_t did;
+ const char *err_prefix = "Attaching to HWPT failed: ";
+ Error *local_err = NULL;
if (vtd_dev_get_pe_from_pasid(vtd_as, &pe)) {
if (!pc_entry->valid) {
@@ -3094,7 +3100,9 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
vtd_address_space_unmap(vtd_as, n);
}
vtd_switch_address_space(vtd_as);
- return;
+
+ err_prefix = "Detaching from HWPT failed: ";
+ goto do_bind_unbind;
}
/*
@@ -3122,12 +3130,20 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
if (!pc_entry->valid) {
pc_entry->pasid_entry = pe;
pc_entry->valid = true;
- } else if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
+ } else if (vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
+ err_prefix = "Replacing HWPT attachment failed: ";
+ } else {
return;
}
vtd_switch_address_space(vtd_as);
vtd_address_space_sync(vtd_as);
+
+do_bind_unbind:
+ /* TODO: Fault event injection into guest, report error to QEMU for now */
+ if (!vtd_bind_guest_pasid(vtd_as, &local_err)) {
+ error_reportf_err(local_err, "%s", err_prefix);
+ }
}
static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c
index ebfc503d64..66570ea919 100644
--- a/hw/i386/intel_iommu_accel.c
+++ b/hw/i386/intel_iommu_accel.c
@@ -13,6 +13,7 @@
#include "intel_iommu_internal.h"
#include "intel_iommu_accel.h"
#include "hw/pci/pci_bus.h"
+#include "trace.h"
bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod,
Error **errp)
@@ -68,3 +69,116 @@ VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
}
return NULL;
}
+
+static bool vtd_create_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+ VTDPASIDEntry *pe, uint32_t *fs_hwpt,
+ Error **errp)
+{
+ struct iommu_hwpt_vtd_s1 vtd = {};
+
+ vtd.flags = (VTD_SM_PASID_ENTRY_SRE(pe) ? IOMMU_VTD_S1_SRE : 0) |
+ (VTD_SM_PASID_ENTRY_WPE(pe) ? IOMMU_VTD_S1_WPE : 0) |
+ (VTD_SM_PASID_ENTRY_EAFE(pe) ? IOMMU_VTD_S1_EAFE : 0);
+ vtd.addr_width = vtd_pe_get_fs_aw(pe);
+ vtd.pgtbl_addr = (uint64_t)vtd_pe_get_fspt_base(pe);
+
+ return iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->hwpt_id,
+ 0, IOMMU_HWPT_DATA_VTD_S1, sizeof(vtd),
+ &vtd, fs_hwpt, errp);
+}
+
+static void vtd_destroy_old_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+ VTDAddressSpace *vtd_as)
+{
+ if (!vtd_as->fs_hwpt) {
+ return;
+ }
+ iommufd_backend_free_id(idev->iommufd, vtd_as->fs_hwpt);
+ vtd_as->fs_hwpt = 0;
+}
+
+static bool vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+ VTDAddressSpace *vtd_as, Error **errp)
+{
+ HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+ VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
+ uint32_t hwpt_id;
+ bool ret;
+
+ /*
+ * We can get here only if flts=on, the supported PGTT is FST or PT.
+ * Catch invalid PGTT when processing invalidation request to avoid
+ * attaching to wrong hwpt.
+ */
+ if (!vtd_pe_pgtt_is_fst(pe) && !vtd_pe_pgtt_is_pt(pe)) {
+ error_setg(errp, "Invalid PGTT type %d",
+ (uint8_t)VTD_SM_PASID_ENTRY_PGTT(pe));
+ return false;
+ }
+
+ if (vtd_pe_pgtt_is_pt(pe)) {
+ hwpt_id = idev->hwpt_id;
+ } else if (!vtd_create_fs_hwpt(idev, pe, &hwpt_id, errp)) {
+ return false;
+ }
+
+ ret = host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
+ trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id, ret);
+ if (ret) {
+ /* Destroy old fs_hwpt if it's a replacement */
+ vtd_destroy_old_fs_hwpt(idev, vtd_as);
+ if (vtd_pe_pgtt_is_fst(pe)) {
+ vtd_as->fs_hwpt = hwpt_id;
+ }
+ } else if (vtd_pe_pgtt_is_fst(pe)) {
+ iommufd_backend_free_id(idev->iommufd, hwpt_id);
+ }
+
+ return ret;
+}
+
+static bool vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+ VTDAddressSpace *vtd_as, Error **errp)
+{
+ HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ uint32_t pasid = vtd_as->pasid;
+ bool ret;
+
+ if (s->dmar_enabled && s->root_scalable) {
+ ret = host_iommu_device_iommufd_detach_hwpt(idev, errp);
+ trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
+ } else {
+ /*
+ * If DMAR remapping is disabled or guest switches to legacy mode,
+ * we fallback to the default HWPT which contains shadow page table.
+ * So guest DMA could still work.
+ */
+ ret = host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, errp);
+ trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
+ ret);
+ }
+
+ if (ret) {
+ vtd_destroy_old_fs_hwpt(idev, vtd_as);
+ }
+
+ return ret;
+}
+
+bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
+{
+ VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
+ VTDHostIOMMUDevice *vtd_hiod = vtd_find_hiod_iommufd(vtd_as);
+
+ /* Ignore emulated device or legacy VFIO backed device */
+ if (!vtd_as->iommu_state->fsts || !vtd_hiod) {
+ return true;
+ }
+
+ if (pc_entry->valid) {
+ return vtd_device_attach_iommufd(vtd_hiod, vtd_as, errp);
+ }
+
+ return vtd_device_detach_iommufd(vtd_hiod, vtd_as, errp);
+}
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index b704f4f90c..5a3ee1cf64 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
vtd_reset_exit(void) ""
+vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
+vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d pasid %d ret: %d"
+vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
# amd_iommu.c
amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32
--
2.47.1
Hi Zhenzhong,
On 11/17/25 10:37 AM, Zhenzhong Duan wrote:
> This captures the guest PASID table entry modifications and propagates
> the changes to host to attach a hwpt with type determined per guest IOMMU
> PGTT configuration.
>
> When PGTT=PT, attach PASID_0 to a second stage HWPT(GPA->HPA).
> When PGTT=FST, attach PASID_0 to nested HWPT with nesting parent HWPT
> coming from VFIO.
>
> Co-Authored-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
> hw/i386/intel_iommu_accel.h | 6 ++
> include/hw/i386/intel_iommu.h | 1 +
> hw/i386/intel_iommu.c | 22 ++++++-
> hw/i386/intel_iommu_accel.c | 114 ++++++++++++++++++++++++++++++++++
> hw/i386/trace-events | 3 +
> 5 files changed, 143 insertions(+), 3 deletions(-)
>
> diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h
> index dbe6ee6982..1a396c50a0 100644
> --- a/hw/i386/intel_iommu_accel.h
> +++ b/hw/i386/intel_iommu_accel.h
> @@ -16,6 +16,7 @@
> bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod,
> Error **errp);
> VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as);
> +bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp);
> #else
> static inline bool vtd_check_hiod_accel(IntelIOMMUState *s,
> VTDHostIOMMUDevice *vtd_hiod,
> @@ -30,5 +31,10 @@ static inline VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
> {
> return NULL;
> }
> +
> +static inline bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
> +{
> + return true;
> +}
> #endif
> #endif
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 401322665a..8ce8fe1b75 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -104,6 +104,7 @@ struct VTDAddressSpace {
> PCIBus *bus;
> uint8_t devfn;
> uint32_t pasid;
> + uint32_t fs_hwpt;
nit use fs_hwpt_id
> AddressSpace as;
> IOMMUMemoryRegion iommu;
> MemoryRegion root; /* The root container of the device */
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 513b2c85d4..36449bf161 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -88,7 +88,11 @@ static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s)
> g_hash_table_iter_init(&as_it, s->vtd_address_spaces);
> while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) {
> VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
> - pc_entry->valid = false;
> + if (pc_entry->valid) {
> + pc_entry->valid = false;
> + /* It's fatal to get failure during reset */
could you elaborate on this. Why is it more fatal on reset that on other
circumstances which do not abort?
> + vtd_bind_guest_pasid(vtd_as, &error_fatal);
> + }
> }
> }
>
> @@ -3074,6 +3078,8 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
> VTDPASIDEntry pe;
> IOMMUNotifier *n;
> uint16_t did;
> + const char *err_prefix = "Attaching to HWPT failed: ";
> + Error *local_err = NULL;
>
> if (vtd_dev_get_pe_from_pasid(vtd_as, &pe)) {
> if (!pc_entry->valid) {
> @@ -3094,7 +3100,9 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
> vtd_address_space_unmap(vtd_as, n);
> }
> vtd_switch_address_space(vtd_as);
> - return;
> +
> + err_prefix = "Detaching from HWPT failed: ";
> + goto do_bind_unbind;
> }
>
> /*
> @@ -3122,12 +3130,20 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
> if (!pc_entry->valid) {
> pc_entry->pasid_entry = pe;
> pc_entry->valid = true;
> - } else if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
> + } else if (vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
> + err_prefix = "Replacing HWPT attachment failed: ";
> + } else {
> return;
> }
>
> vtd_switch_address_space(vtd_as);
> vtd_address_space_sync(vtd_as);
> +
> +do_bind_unbind:
> + /* TODO: Fault event injection into guest, report error to QEMU for now */
> + if (!vtd_bind_guest_pasid(vtd_as, &local_err)) {
> + error_reportf_err(local_err, "%s", err_prefix);
> + }
> }
>
> static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
> diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c
> index ebfc503d64..66570ea919 100644
> --- a/hw/i386/intel_iommu_accel.c
> +++ b/hw/i386/intel_iommu_accel.c
> @@ -13,6 +13,7 @@
> #include "intel_iommu_internal.h"
> #include "intel_iommu_accel.h"
> #include "hw/pci/pci_bus.h"
> +#include "trace.h"
>
> bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod,
> Error **errp)
> @@ -68,3 +69,116 @@ VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
> }
> return NULL;
> }
> +
> +static bool vtd_create_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> + VTDPASIDEntry *pe, uint32_t *fs_hwpt,
> + Error **errp)
> +{
> + struct iommu_hwpt_vtd_s1 vtd = {};
> +
> + vtd.flags = (VTD_SM_PASID_ENTRY_SRE(pe) ? IOMMU_VTD_S1_SRE : 0) |
> + (VTD_SM_PASID_ENTRY_WPE(pe) ? IOMMU_VTD_S1_WPE : 0) |
> + (VTD_SM_PASID_ENTRY_EAFE(pe) ? IOMMU_VTD_S1_EAFE : 0);
> + vtd.addr_width = vtd_pe_get_fs_aw(pe);
> + vtd.pgtbl_addr = (uint64_t)vtd_pe_get_fspt_base(pe);
> +
> + return iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->hwpt_id,
> + 0, IOMMU_HWPT_DATA_VTD_S1, sizeof(vtd),
> + &vtd, fs_hwpt, errp);
> +}
> +
> +static void vtd_destroy_old_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> + VTDAddressSpace *vtd_as)
> +{
> + if (!vtd_as->fs_hwpt) {
> + return;
> + }
> + iommufd_backend_free_id(idev->iommufd, vtd_as->fs_hwpt);
> + vtd_as->fs_hwpt = 0;
is it a valid assumption a valid ID cannot be null? Is it documented
somewhere?
> +}
> +
> +static bool vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
> + VTDAddressSpace *vtd_as, Error **errp)
> +{
> + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> + VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
> + uint32_t hwpt_id;
nit init to idev->hwpt_id
> + bool ret;
> +
> + /*
> + * We can get here only if flts=on, the supported PGTT is FST or PT.
> + * Catch invalid PGTT when processing invalidation request to avoid
> + * attaching to wrong hwpt.
> + */
> + if (!vtd_pe_pgtt_is_fst(pe) && !vtd_pe_pgtt_is_pt(pe)) {
> + error_setg(errp, "Invalid PGTT type %d",
> + (uint8_t)VTD_SM_PASID_ENTRY_PGTT(pe));
> + return false;
> + }
> +
> + if (vtd_pe_pgtt_is_pt(pe)) {
> + hwpt_id = idev->hwpt_id;
> + } else if (!vtd_create_fs_hwpt(idev, pe, &hwpt_id, errp)) {
> + return false;
> + }
and
if (vtd_pe_pgtt_is_fst(pe)) {
if ((!vtd_create_fs_hwpt(idev, pe, &hwpt_id, errp) {
return false;
}
}
this will match free block below
> +
> + ret = host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
> + trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id, ret);
> + if (ret) {
> + /* Destroy old fs_hwpt if it's a replacement */
> + vtd_destroy_old_fs_hwpt(idev, vtd_as);
> + if (vtd_pe_pgtt_is_fst(pe)) {
> + vtd_as->fs_hwpt = hwpt_id;
> + }
> + } else if (vtd_pe_pgtt_is_fst(pe)) {
> + iommufd_backend_free_id(idev->iommufd, hwpt_id);
> + }
> +
> + return ret;
> +}
> +
> +static bool vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
> + VTDAddressSpace *vtd_as, Error **errp)
> +{
> + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> + IntelIOMMUState *s = vtd_as->iommu_state;
> + uint32_t pasid = vtd_as->pasid;
> + bool ret;
> +
> + if (s->dmar_enabled && s->root_scalable) {
> + ret = host_iommu_device_iommufd_detach_hwpt(idev, errp);
> + trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
> + } else {
> + /*
> + * If DMAR remapping is disabled or guest switches to legacy mode,
> + * we fallback to the default HWPT which contains shadow page table.
> + * So guest DMA could still work.
> + */
> + ret = host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, errp);
> + trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
> + ret);
> + }
> +
> + if (ret) {
> + vtd_destroy_old_fs_hwpt(idev, vtd_as);
> + }
> +
> + return ret;
> +}
> +
> +bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
I find it a little bit confusing to have bind doing the unbind too.
at least I would rename into vtd_propagate_guest_pasid or something alike.
> +{
> + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
> + VTDHostIOMMUDevice *vtd_hiod = vtd_find_hiod_iommufd(vtd_as);
> +
> + /* Ignore emulated device or legacy VFIO backed device */
> + if (!vtd_as->iommu_state->fsts || !vtd_hiod) {
> + return true;
> + }
> +
> + if (pc_entry->valid) {
> + return vtd_device_attach_iommufd(vtd_hiod, vtd_as, errp);
> + }
> +
> + return vtd_device_detach_iommufd(vtd_hiod, vtd_as, errp);
> +}
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index b704f4f90c..5a3ee1cf64 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
> vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
> vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
> vtd_reset_exit(void) ""
> +vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
> +vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d pasid %d ret: %d"
> +vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
>
> # amd_iommu.c
> amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32
Thanks
Eric
Hi Eric,
>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Subject: Re: [PATCH v8 13/23] intel_iommu_accel: Bind/unbind guest page
>table to host
>
>Hi Zhenzhong,
>
>On 11/17/25 10:37 AM, Zhenzhong Duan wrote:
>> This captures the guest PASID table entry modifications and propagates
>> the changes to host to attach a hwpt with type determined per guest
>IOMMU
>> PGTT configuration.
>>
>> When PGTT=PT, attach PASID_0 to a second stage HWPT(GPA->HPA).
>> When PGTT=FST, attach PASID_0 to nested HWPT with nesting parent HWPT
>> coming from VFIO.
>>
>> Co-Authored-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> hw/i386/intel_iommu_accel.h | 6 ++
>> include/hw/i386/intel_iommu.h | 1 +
>> hw/i386/intel_iommu.c | 22 ++++++-
>> hw/i386/intel_iommu_accel.c | 114
>++++++++++++++++++++++++++++++++++
>> hw/i386/trace-events | 3 +
>> 5 files changed, 143 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h
>> index dbe6ee6982..1a396c50a0 100644
>> --- a/hw/i386/intel_iommu_accel.h
>> +++ b/hw/i386/intel_iommu_accel.h
>> @@ -16,6 +16,7 @@
>> bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice
>*vtd_hiod,
>> Error **errp);
>> VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as);
>> +bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp);
>> #else
>> static inline bool vtd_check_hiod_accel(IntelIOMMUState *s,
>> VTDHostIOMMUDevice
>*vtd_hiod,
>> @@ -30,5 +31,10 @@ static inline VTDHostIOMMUDevice
>*vtd_find_hiod_iommufd(VTDAddressSpace *as)
>> {
>> return NULL;
>> }
>> +
>> +static inline bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error
>**errp)
>> +{
>> + return true;
>> +}
>> #endif
>> #endif
>> diff --git a/include/hw/i386/intel_iommu.h
>b/include/hw/i386/intel_iommu.h
>> index 401322665a..8ce8fe1b75 100644
>> --- a/include/hw/i386/intel_iommu.h
>> +++ b/include/hw/i386/intel_iommu.h
>> @@ -104,6 +104,7 @@ struct VTDAddressSpace {
>> PCIBus *bus;
>> uint8_t devfn;
>> uint32_t pasid;
>> + uint32_t fs_hwpt;
>nit use fs_hwpt_id
Will do.
>> AddressSpace as;
>> IOMMUMemoryRegion iommu;
>> MemoryRegion root; /* The root container of the device
>*/
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index 513b2c85d4..36449bf161 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -88,7 +88,11 @@ static void
>vtd_pasid_cache_reset_locked(IntelIOMMUState *s)
>> g_hash_table_iter_init(&as_it, s->vtd_address_spaces);
>> while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) {
>> VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
>> - pc_entry->valid = false;
>> + if (pc_entry->valid) {
>> + pc_entry->valid = false;
>> + /* It's fatal to get failure during reset */
>could you elaborate on this. Why is it more fatal on reset that on other
>circumstances which do not abort?
During reset, s->dmar_enabled and pc_entry->valid are both false,
so vtd_bind_guest_pasid() is actually a call to
host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, errp)
which does attachment to default HWPT containing shadow page table.
This should never fail or else we have caught it in vfio_pci_realize().
For vtd_bind_guest_pasid() during pasid entry invalidation, the error can
come from wrong first stage config by guest, we should inject fault event
to guest instead of abort qemu, this is currently not implemented yet and
marked as TODO.
>> + vtd_bind_guest_pasid(vtd_as, &error_fatal);
>> + }
>> }
>> }
>>
>> @@ -3074,6 +3078,8 @@ static void
>vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
>> VTDPASIDEntry pe;
>> IOMMUNotifier *n;
>> uint16_t did;
>> + const char *err_prefix = "Attaching to HWPT failed: ";
>> + Error *local_err = NULL;
>>
>> if (vtd_dev_get_pe_from_pasid(vtd_as, &pe)) {
>> if (!pc_entry->valid) {
>> @@ -3094,7 +3100,9 @@ static void
>vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
>> vtd_address_space_unmap(vtd_as, n);
>> }
>> vtd_switch_address_space(vtd_as);
>> - return;
>> +
>> + err_prefix = "Detaching from HWPT failed: ";
>> + goto do_bind_unbind;
>> }
>>
>> /*
>> @@ -3122,12 +3130,20 @@ static void
>vtd_pasid_cache_sync_locked(gpointer key, gpointer value,
>> if (!pc_entry->valid) {
>> pc_entry->pasid_entry = pe;
>> pc_entry->valid = true;
>> - } else if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
>> + } else if (vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
>> + err_prefix = "Replacing HWPT attachment failed: ";
>> + } else {
>> return;
>> }
>>
>> vtd_switch_address_space(vtd_as);
>> vtd_address_space_sync(vtd_as);
>> +
>> +do_bind_unbind:
>> + /* TODO: Fault event injection into guest, report error to QEMU for
>now */
>> + if (!vtd_bind_guest_pasid(vtd_as, &local_err)) {
>> + error_reportf_err(local_err, "%s", err_prefix);
>> + }
>> }
>>
>> static void vtd_pasid_cache_sync(IntelIOMMUState *s,
>VTDPASIDCacheInfo *pc_info)
>> diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c
>> index ebfc503d64..66570ea919 100644
>> --- a/hw/i386/intel_iommu_accel.c
>> +++ b/hw/i386/intel_iommu_accel.c
>> @@ -13,6 +13,7 @@
>> #include "intel_iommu_internal.h"
>> #include "intel_iommu_accel.h"
>> #include "hw/pci/pci_bus.h"
>> +#include "trace.h"
>>
>> bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice
>*vtd_hiod,
>> Error **errp)
>> @@ -68,3 +69,116 @@ VTDHostIOMMUDevice
>*vtd_find_hiod_iommufd(VTDAddressSpace *as)
>> }
>> return NULL;
>> }
>> +
>> +static bool vtd_create_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
>> + VTDPASIDEntry *pe, uint32_t
>*fs_hwpt,
>> + Error **errp)
>> +{
>> + struct iommu_hwpt_vtd_s1 vtd = {};
>> +
>> + vtd.flags = (VTD_SM_PASID_ENTRY_SRE(pe) ? IOMMU_VTD_S1_SRE :
>0) |
>> + (VTD_SM_PASID_ENTRY_WPE(pe) ?
>IOMMU_VTD_S1_WPE : 0) |
>> + (VTD_SM_PASID_ENTRY_EAFE(pe) ?
>IOMMU_VTD_S1_EAFE : 0);
>> + vtd.addr_width = vtd_pe_get_fs_aw(pe);
>> + vtd.pgtbl_addr = (uint64_t)vtd_pe_get_fspt_base(pe);
>> +
>> + return iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>idev->hwpt_id,
>> + 0,
>IOMMU_HWPT_DATA_VTD_S1, sizeof(vtd),
>> + &vtd, fs_hwpt, errp);
>> +}
>> +
>> +static void vtd_destroy_old_fs_hwpt(HostIOMMUDeviceIOMMUFD *idev,
>> + VTDAddressSpace *vtd_as)
>> +{
>> + if (!vtd_as->fs_hwpt) {
>> + return;
>> + }
>> + iommufd_backend_free_id(idev->iommufd, vtd_as->fs_hwpt);
>> + vtd_as->fs_hwpt = 0;
>is it a valid assumption a valid ID cannot be null? Is it documented
>somewhere?
I didn't find it in uAPI doc, but it's hard coded in kernel:
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
Hi @jasowang@redhat.com, @nicolinc@nvidia.com do you have a guideline on this?
Could I take zero id reserved?
>> +}
>> +
>> +static bool vtd_device_attach_iommufd(VTDHostIOMMUDevice
>*vtd_hiod,
>> + VTDAddressSpace *vtd_as,
>Error **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
>> + uint32_t hwpt_id;
>nit init to idev->hwpt_id
Will do.
>> + bool ret;
>> +
>> + /*
>> + * We can get here only if flts=on, the supported PGTT is FST or PT.
>> + * Catch invalid PGTT when processing invalidation request to avoid
>> + * attaching to wrong hwpt.
>> + */
>> + if (!vtd_pe_pgtt_is_fst(pe) && !vtd_pe_pgtt_is_pt(pe)) {
>> + error_setg(errp, "Invalid PGTT type %d",
>> + (uint8_t)VTD_SM_PASID_ENTRY_PGTT(pe));
>> + return false;
>> + }
>> +
>> + if (vtd_pe_pgtt_is_pt(pe)) {
>> + hwpt_id = idev->hwpt_id;
>> + } else if (!vtd_create_fs_hwpt(idev, pe, &hwpt_id, errp)) {
>> + return false;
>> + }
>and
>
>if (vtd_pe_pgtt_is_fst(pe)) {
> if ((!vtd_create_fs_hwpt(idev, pe, &hwpt_id, errp) {
> return false;
> }
>
>}
>this will match free block below
Will do
>> +
>> + ret = host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id,
>errp);
>> + trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id,
>ret);
>> + if (ret) {
>> + /* Destroy old fs_hwpt if it's a replacement */
>> + vtd_destroy_old_fs_hwpt(idev, vtd_as);
>> + if (vtd_pe_pgtt_is_fst(pe)) {
>> + vtd_as->fs_hwpt = hwpt_id;
>> + }
>> + } else if (vtd_pe_pgtt_is_fst(pe)) {
>> + iommufd_backend_free_id(idev->iommufd, hwpt_id);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static bool vtd_device_detach_iommufd(VTDHostIOMMUDevice
>*vtd_hiod,
>> + VTDAddressSpace *vtd_as,
>Error **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + IntelIOMMUState *s = vtd_as->iommu_state;
>> + uint32_t pasid = vtd_as->pasid;
>> + bool ret;
>> +
>> + if (s->dmar_enabled && s->root_scalable) {
>> + ret = host_iommu_device_iommufd_detach_hwpt(idev, errp);
>> + trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
>> + } else {
>> + /*
>> + * If DMAR remapping is disabled or guest switches to legacy
>mode,
>> + * we fallback to the default HWPT which contains shadow page
>table.
>> + * So guest DMA could still work.
>> + */
>> + ret = host_iommu_device_iommufd_attach_hwpt(idev,
>idev->hwpt_id, errp);
>> + trace_vtd_device_reattach_def_hwpt(idev->devid, pasid,
>idev->hwpt_id,
>> + ret);
>> + }
>> +
>> + if (ret) {
>> + vtd_destroy_old_fs_hwpt(idev, vtd_as);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +bool vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, Error **errp)
>I find it a little bit confusing to have bind doing the unbind too.
>at least I would rename into vtd_propagate_guest_pasid or something alike.
Indeed, I had ever thought to use vtd_bind_unbind_guest_pasid...
vtd_propagate_guest_pasid sounds better, will use it.
Thanks
Zhenzhong
>-----Original Message-----
>>> +static void vtd_destroy_old_fs_hwpt(HostIOMMUDeviceIOMMUFD
>*idev,
>>> + VTDAddressSpace *vtd_as)
>>> +{
>>> + if (!vtd_as->fs_hwpt) {
>>> + return;
>>> + }
>>> + iommufd_backend_free_id(idev->iommufd, vtd_as->fs_hwpt);
>>> + vtd_as->fs_hwpt = 0;
>>is it a valid assumption a valid ID cannot be null? Is it documented
>>somewhere?
>
>I didn't find it in uAPI doc, but it's hard coded in kernel:
>
>xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
>
>Hi @jasowang@redhat.com, @nicolinc@nvidia.com do you have a guideline
>on this?
>Could I take zero id reserved?
Sorry, typo error, I meant to @Jason Gunthorpe
Thanks
Zhenzhong
On Fri, Dec 12, 2025 at 02:12:09AM +0000, Duan, Zhenzhong wrote:
>
>
> >-----Original Message-----
> >>> +static void vtd_destroy_old_fs_hwpt(HostIOMMUDeviceIOMMUFD
> >*idev,
> >>> + VTDAddressSpace *vtd_as)
> >>> +{
> >>> + if (!vtd_as->fs_hwpt) {
> >>> + return;
> >>> + }
> >>> + iommufd_backend_free_id(idev->iommufd, vtd_as->fs_hwpt);
> >>> + vtd_as->fs_hwpt = 0;
> >>is it a valid assumption a valid ID cannot be null? Is it documented
> >>somewhere?
> >
> >I didn't find it in uAPI doc, but it's hard coded in kernel:
> >
> >xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
> >
> >Hi @jasowang@redhat.com, @nicolinc@nvidia.com do you have a guideline
> >on this?
> >Could I take zero id reserved?
>
> Sorry, typo error, I meant to @Jason Gunthorpe
It is not documented in uAPI but there is some implication that
ID=0 is restricted from being an actual object ID:
297 /**
298 * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
299 * ioctl(IOMMU_OPTION_HUGE_PAGES)
300 * @IOMMU_OPTION_RLIMIT_MODE:
301 * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
302 * to invoke this. Value 0 (default) is user based accounting, 1 uses process
303 * based accounting. Global option, object_id must be 0
So, I think that it'd be easier for user space to follow, if we
add a line of statement:
-----------------------------------------------------------------
diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst
index f1c4d21e5c5e..c250a9eafdd5 100644
--- a/Documentation/userspace-api/iommufd.rst
+++ b/Documentation/userspace-api/iommufd.rst
@@ -135,6 +135,8 @@ Following IOMMUFD objects are exposed to userspace:
HW QUEUE, the VMM must request a pair of mmap info (offset/length) and pass in
exactly to an mmap syscall via its offset and length arguments.
+Each object must be allocated via uAPI and returned with a non-zero ID.
+
All user-visible objects are destroyed via the IOMMU_DESTROY uAPI.
The diagrams below show relationships between user-visible objects and kernel
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ce775fbbae94..4410f0561034 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -311,6 +311,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
}
init_rwsem(&ictx->ioas_creation_lock);
+ /* ID=0 is reserved for global use in uAPI */
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
xa_init(&ictx->groups);
ictx->file = filp;
-----------------------------------------------------------------
Jason?
Thanks
Nicolin
© 2016 - 2025 Red Hat, Inc.