From: Yi Liu <yi.l.liu@intel.com>
This replays guest pasid bindings after context cache invalidation.
This is a behavior to ensure safety. Actually, programmer should issue
pasid cache invalidation with proper granularity after issuing a context
cache invalidation.
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
hw/i386/intel_iommu_internal.h | 2 ++
hw/i386/intel_iommu.c | 42 ++++++++++++++++++++++++++++++++++
hw/i386/trace-events | 1 +
3 files changed, 45 insertions(+)
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 61e35dbdc0..8af1004888 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -584,6 +584,8 @@ typedef enum VTDPCInvType {
/* Reset all PASID cache entries, used in system level reset */
VTD_PASID_CACHE_FORCE_RESET = 0x10,
+ /* Invalidate all PASID entries in a device */
+ VTD_PASID_CACHE_DEVSI,
} VTDPCInvType;
typedef struct VTDPASIDCacheInfo {
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a10ee8eb4f..6c0e502d1c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -91,6 +91,10 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s);
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s);
+static void vtd_pasid_cache_sync(IntelIOMMUState *s,
+ VTDPASIDCacheInfo *pc_info);
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+ PCIBus *bus, uint16_t devfn);
static void vtd_panic_require_caching_mode(void)
{
@@ -2442,6 +2446,8 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
static void vtd_context_global_invalidate(IntelIOMMUState *s)
{
+ VTDPASIDCacheInfo pc_info;
+
trace_vtd_inv_desc_cc_global();
/* Protects context cache */
vtd_iommu_lock(s);
@@ -2459,6 +2465,9 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
* VT-d emulation codes.
*/
vtd_iommu_replay_all(s);
+
+ pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
+ vtd_pasid_cache_sync(s, &pc_info);
}
#ifdef CONFIG_IOMMUFD
@@ -2691,6 +2700,15 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
* happened.
*/
vtd_address_space_sync(vtd_as);
+ /*
+ * Per spec, context flush should also be followed with PASID
+ * cache and iotlb flush. In order to work with a guest which
+ * doesn't follow spec and missed PASID cache flush, we have
+ * vtd_pasid_cache_devsi() to invalidate PASID caches of the
+ * passthrough device. Host iommu driver would flush piotlb
+ * when a pasid unbind is pass down to it.
+ */
+ vtd_pasid_cache_devsi(s, vtd_as->bus, devfn);
}
}
}
@@ -3422,6 +3440,11 @@ static gboolean vtd_flush_pasid_locked(gpointer key, gpointer value,
break;
case VTD_PASID_CACHE_FORCE_RESET:
goto remove;
+ case VTD_PASID_CACHE_DEVSI:
+ if (pc_info->bus != vtd_as->bus || pc_info->devfn != vtd_as->devfn) {
+ return false;
+ }
+ break;
default:
error_setg(&error_fatal, "invalid pc_info->type for flush");
}
@@ -3635,6 +3658,11 @@ static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
case VTD_PASID_CACHE_FORCE_RESET:
/* For force reset, no need to go further replay */
return;
+ case VTD_PASID_CACHE_DEVSI:
+ walk_info.bus = pc_info->bus;
+ walk_info.devfn = pc_info->devfn;
+ vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
+ return;
default:
error_setg(&error_fatal, "invalid pc_info->type for replay");
}
@@ -3683,6 +3711,20 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
vtd_replay_guest_pasid_bindings(s, pc_info);
}
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+ PCIBus *bus, uint16_t devfn)
+{
+ VTDPASIDCacheInfo pc_info;
+
+ trace_vtd_pasid_cache_devsi(devfn);
+
+ pc_info.type = VTD_PASID_CACHE_DEVSI;
+ pc_info.bus = bus;
+ pc_info.devfn = devfn;
+
+ vtd_pasid_cache_sync(s, &pc_info);
+}
+
static bool vtd_process_pasid_desc(IntelIOMMUState *s,
VTDInvDesc *inv_desc)
{
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 1c31b9a873..830b11f68b 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -28,6 +28,7 @@ vtd_pasid_cache_reset(void) ""
vtd_pasid_cache_gsi(void) ""
vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 0x%"PRIx16
vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32
+vtd_pasid_cache_devsi(uint16_t devfn) "Dev selective PC invalidation dev: 0x%"PRIx16
vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
--
2.47.1
On 8/22/25 8:40 AM, Zhenzhong Duan wrote:
> From: Yi Liu <yi.l.liu@intel.com>
>
> This replays guest pasid bindings after context cache invalidation.
> This is a behavior to ensure safety. Actually, programmer should issue
> pasid cache invalidation with proper granularity after issuing a context
> cache invalidation.
So is this mandated? If the spec mandates specific invalidations and the
guest does not comply with the expected invalidation sequence shall we
do that behind the curtain?
>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
> hw/i386/intel_iommu_internal.h | 2 ++
> hw/i386/intel_iommu.c | 42 ++++++++++++++++++++++++++++++++++
> hw/i386/trace-events | 1 +
> 3 files changed, 45 insertions(+)
>
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 61e35dbdc0..8af1004888 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -584,6 +584,8 @@ typedef enum VTDPCInvType {
>
> /* Reset all PASID cache entries, used in system level reset */
> VTD_PASID_CACHE_FORCE_RESET = 0x10,
> + /* Invalidate all PASID entries in a device */
> + VTD_PASID_CACHE_DEVSI,
invalidation type that is not defined in the spec. I would avoid and
find another solution if you really need to do such kind of invalidation.
> } VTDPCInvType;
>
> typedef struct VTDPASIDCacheInfo {
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index a10ee8eb4f..6c0e502d1c 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -91,6 +91,10 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s);
> static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
>
> static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s);
> +static void vtd_pasid_cache_sync(IntelIOMMUState *s,
> + VTDPASIDCacheInfo *pc_info);
> +static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
> + PCIBus *bus, uint16_t devfn);
>
> static void vtd_panic_require_caching_mode(void)
> {
> @@ -2442,6 +2446,8 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
>
> static void vtd_context_global_invalidate(IntelIOMMUState *s)
> {
> + VTDPASIDCacheInfo pc_info;
> +
> trace_vtd_inv_desc_cc_global();
> /* Protects context cache */
> vtd_iommu_lock(s);
> @@ -2459,6 +2465,9 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
> * VT-d emulation codes.
> */
> vtd_iommu_replay_all(s);
> +
> + pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
> + vtd_pasid_cache_sync(s, &pc_info);
I would put this addition in a separate patch because it does not need
the new
VTD_PASID_CACHE_DEVSI stuff
> }
>
> #ifdef CONFIG_IOMMUFD
> @@ -2691,6 +2700,15 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
> * happened.
> */
> vtd_address_space_sync(vtd_as);
> + /*
> + * Per spec, context flush should also be followed with PASID
> + * cache and iotlb flush. In order to work with a guest which
> + * doesn't follow spec and missed PASID cache flush, we have
> + * vtd_pasid_cache_devsi() to invalidate PASID caches of the
> + * passthrough device. Host iommu driver would flush piotlb
> + * when a pasid unbind is pass down to it.
> + */
> + vtd_pasid_cache_devsi(s, vtd_as->bus, devfn);
> }
> }
> }
> @@ -3422,6 +3440,11 @@ static gboolean vtd_flush_pasid_locked(gpointer key, gpointer value,
> break;
> case VTD_PASID_CACHE_FORCE_RESET:
> goto remove;
> + case VTD_PASID_CACHE_DEVSI:
> + if (pc_info->bus != vtd_as->bus || pc_info->devfn != vtd_as->devfn) {
> + return false;
> + }
> + break;
> default:
> error_setg(&error_fatal, "invalid pc_info->type for flush");
> }
> @@ -3635,6 +3658,11 @@ static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
> case VTD_PASID_CACHE_FORCE_RESET:
> /* For force reset, no need to go further replay */
> return;
> + case VTD_PASID_CACHE_DEVSI:
> + walk_info.bus = pc_info->bus;
> + walk_info.devfn = pc_info->devfn;
> + vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
> + return;
> default:
> error_setg(&error_fatal, "invalid pc_info->type for replay");
> }
> @@ -3683,6 +3711,20 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info)
> vtd_replay_guest_pasid_bindings(s, pc_info);
> }
>
> +static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
> + PCIBus *bus, uint16_t devfn)
> +{
> + VTDPASIDCacheInfo pc_info;
> +
> + trace_vtd_pasid_cache_devsi(devfn);
> +
> + pc_info.type = VTD_PASID_CACHE_DEVSI;
> + pc_info.bus = bus;
> + pc_info.devfn = devfn;
> +
> + vtd_pasid_cache_sync(s, &pc_info);
> +}
> +
> static bool vtd_process_pasid_desc(IntelIOMMUState *s,
> VTDInvDesc *inv_desc)
> {
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 1c31b9a873..830b11f68b 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -28,6 +28,7 @@ vtd_pasid_cache_reset(void) ""
> vtd_pasid_cache_gsi(void) ""
> vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 0x%"PRIx16
> vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32
> +vtd_pasid_cache_devsi(uint16_t devfn) "Dev selective PC invalidation dev: 0x%"PRIx16
> vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
> vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
> vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
Eric
On 2025/8/28 17:43, Eric Auger wrote:
>
>
> On 8/22/25 8:40 AM, Zhenzhong Duan wrote:
>> From: Yi Liu <yi.l.liu@intel.com>
>>
>> This replays guest pasid bindings after context cache invalidation.
>> This is a behavior to ensure safety. Actually, programmer should issue
>> pasid cache invalidation with proper granularity after issuing a context
>> cache invalidation.
> So is this mandated? If the spec mandates specific invalidations and the
> guest does not comply with the expected invalidation sequence shall we
> do that behind the curtain?
I think this is following the below decision. We can discuss if it's
really needed to replay the pasid bind.
d4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 2321)
/*
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2322) * From VT-d spec 6.5.2.1, a global context entry invalidation
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2323) * should be followed by a IOTLB global invalidation, so we should
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2324) * be safe even without this. Hoewever, let's replay the region as
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2325) * well to be safer, and go back here when we need finer tunes for
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2326) * VT-d emulation codes.
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2327) */
dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800
2328) vtd_iommu_replay_all(s);
Regards,
Yi Liu
>-----Original Message----- >From: Liu, Yi L <yi.l.liu@intel.com> >Subject: Re: [PATCH v5 16/21] intel_iommu: Replay pasid bindings after >context cache invalidation > >On 2025/8/28 17:43, Eric Auger wrote: >> >> >> On 8/22/25 8:40 AM, Zhenzhong Duan wrote: >>> From: Yi Liu <yi.l.liu@intel.com> >>> >>> This replays guest pasid bindings after context cache invalidation. >>> This is a behavior to ensure safety. Actually, programmer should issue >>> pasid cache invalidation with proper granularity after issuing a context >>> cache invalidation. >> So is this mandated? If the spec mandates specific invalidations and the >> guest does not comply with the expected invalidation sequence shall we >> do that behind the curtain? > >I think this is following the below decision. We can discuss if it's >really needed to replay the pasid bind. > >d4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2321) > /* >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2322) * From VT-d spec 6.5.2.1, a global context entry invalidation >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2323) * should be followed by a IOTLB global invalidation, so we >should >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2324) * be safe even without this. Hoewever, let's replay the region as >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2325) * well to be safer, and go back here when we need finer tunes >for >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2326) * VT-d emulation codes. >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2327) */ >dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >2328) vtd_iommu_replay_all(s); I have tested this series with this patch reverted, it works with guest linux kernel. Personally, I am inclined to stop adding workaround for guest kenrel bug, there will be more and more over time and it makes current code complex unnecessarily. @Eric, @Liu, Yi L your thought? Thanks Zhenzhong
On 2025/9/1 16:11, Duan, Zhenzhong wrote: > > >> -----Original Message----- >> From: Liu, Yi L <yi.l.liu@intel.com> >> Subject: Re: [PATCH v5 16/21] intel_iommu: Replay pasid bindings after >> context cache invalidation >> >> On 2025/8/28 17:43, Eric Auger wrote: >>> >>> >>> On 8/22/25 8:40 AM, Zhenzhong Duan wrote: >>>> From: Yi Liu <yi.l.liu@intel.com> >>>> >>>> This replays guest pasid bindings after context cache invalidation. >>>> This is a behavior to ensure safety. Actually, programmer should issue >>>> pasid cache invalidation with proper granularity after issuing a context >>>> cache invalidation. >>> So is this mandated? If the spec mandates specific invalidations and the >>> guest does not comply with the expected invalidation sequence shall we >>> do that behind the curtain? >> >> I think this is following the below decision. We can discuss if it's >> really needed to replay the pasid bind. >> >> d4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2321) >> /* >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2322) * From VT-d spec 6.5.2.1, a global context entry invalidation >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2323) * should be followed by a IOTLB global invalidation, so we >> should >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2324) * be safe even without this. Hoewever, let's replay the region as >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2325) * well to be safer, and go back here when we need finer tunes >> for >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2326) * VT-d emulation codes. >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2327) */ >> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 +0800 >> 2328) vtd_iommu_replay_all(s); > > I have tested this series with this patch reverted, it works with guest linux kernel. > > Personally, I am inclined to stop adding workaround for guest kenrel bug, there will be more and more over time and it makes current code complex unnecessarily. @Eric, @Liu, Yi L your thought? Let's go back to the original purpose of this. Peter has identified a case in which a context modification is not followed by IOTLB invalidation. [1] This is a valid behavior since the old domain is still in use, no need to invalidate IOTLB. Hence the shadow page of the changed device has not been updated. So the vIOMMU chose to enforce a synchronization on the shadow page per context entry modification. Let's see if similar requirement on PASID table. Let me ask one question: since PASID cache is also tagged with domain ID, if the DID has not changed, maybe iommu driver will skip the PASID cache flush? [1] https://lore.kernel.org/qemu-devel/20170117084604.2b1f5e50@t450s.home/ Regards, Yi Liu
>-----Original Message----- >From: Liu, Yi L <yi.l.liu@intel.com> >Subject: Re: [PATCH v5 16/21] intel_iommu: Replay pasid bindings after >context cache invalidation > >On 2025/9/1 16:11, Duan, Zhenzhong wrote: >> >> >>> -----Original Message----- >>> From: Liu, Yi L <yi.l.liu@intel.com> >>> Subject: Re: [PATCH v5 16/21] intel_iommu: Replay pasid bindings after >>> context cache invalidation >>> >>> On 2025/8/28 17:43, Eric Auger wrote: >>>> >>>> >>>> On 8/22/25 8:40 AM, Zhenzhong Duan wrote: >>>>> From: Yi Liu <yi.l.liu@intel.com> >>>>> >>>>> This replays guest pasid bindings after context cache invalidation. >>>>> This is a behavior to ensure safety. Actually, programmer should issue >>>>> pasid cache invalidation with proper granularity after issuing a context >>>>> cache invalidation. >>>> So is this mandated? If the spec mandates specific invalidations and the >>>> guest does not comply with the expected invalidation sequence shall we >>>> do that behind the curtain? >>> >>> I think this is following the below decision. We can discuss if it's >>> really needed to replay the pasid bind. >>> >>> d4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2321) >>> /* >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2322) * From VT-d spec 6.5.2.1, a global context entry invalidation >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2323) * should be followed by a IOTLB global invalidation, so we >>> should >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2324) * be safe even without this. Hoewever, let's replay the region >as >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2325) * well to be safer, and go back here when we need finer tunes >>> for >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2326) * VT-d emulation codes. >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2327) */ >>> dd4d607e40d (Peter Xu 2017-04-07 18:59:15 >+0800 >>> 2328) vtd_iommu_replay_all(s); >> >> I have tested this series with this patch reverted, it works with guest linux >kernel. >> >> Personally, I am inclined to stop adding workaround for guest kenrel bug, >there will be more and more over time and it makes current code complex >unnecessarily. @Eric, @Liu, Yi L your thought? > >Let's go back to the original purpose of this. Peter has identified a >case in which a context modification is not followed by IOTLB >invalidation. [1] This is a valid behavior since the old domain is still >in use, no need to invalidate IOTLB. Hence the shadow page of the >changed device has not been updated. So the vIOMMU chose to enforce a >synchronization on the shadow page per context entry modification. Let's >see if similar requirement on PASID table. Different devices can share one domain, but It's a rare case to see different devices sharing same PASID table except they are in same iommu group, but if they are in same iommu group, they should always use a common PASID table. I think no need to support such rare case? At least linux does not work this way. > >Let me ask one question: since PASID cache is also tagged with domain >ID, if the DID has not changed, maybe iommu driver will skip the PASID >cache flush? My understanding is no matter what's changed in PASID entry, there should be PASID cache invalidation, either domain scope, pasid scope or global invalidation. Thanks Zhenzhong > >[1] >https://lore.kernel.org/qemu-devel/20170117084604.2b1f5e50@t450s.home >/ > >Regards, >Yi Liu
© 2016 - 2025 Red Hat, Inc.