[PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()

Zhenzhong Duan posted 13 patches 1 month ago
Maintainers: Yi Liu <yi.l.liu@intel.com>, Eric Auger <eric.auger@redhat.com>, Zhenzhong Duan <zhenzhong.duan@intel.com>, Peter Maydell <peter.maydell@linaro.org>, "Michael S. Tsirkin" <mst@redhat.com>, Jason Wang <jasowang@redhat.com>, "Clément Mathieu--Drif" <clement.mathieu--drif@eviden.com>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <richard.henderson@linaro.org>, Eduardo Habkost <eduardo@habkost.net>, Alex Williamson <alex@shazbot.org>, "Cédric Le Goater" <clg@redhat.com>
[PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by Zhenzhong Duan 1 month ago
To support PASID, we switched from looping over vtd_as to looping over
vtd_hiod. vtd_hiod represents a host passthrough device and is never
dereferenced without the BQL held, so no extra iommu lock is needed to
protect it.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_accel.h | 14 +++++++-------
 hw/i386/intel_iommu.c       |  7 ++++---
 hw/i386/intel_iommu_accel.c |  6 +++---
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h
index 1ae46d9250..3f1b1002b8 100644
--- a/hw/i386/intel_iommu_accel.h
+++ b/hw/i386/intel_iommu_accel.h
@@ -24,9 +24,9 @@ typedef struct VTDACCELPASIDCacheEntry {
 bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod,
                           Error **errp);
 VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as);
-void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t domain_id,
-                                      uint32_t pasid, hwaddr addr,
-                                      uint64_t npages, bool ih);
+void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s, uint16_t domain_id,
+                                     uint32_t pasid, hwaddr addr,
+                                     uint64_t npages, bool ih);
 void vtd_pasid_cache_sync_accel(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info);
 void vtd_pasid_cache_reset_accel(IntelIOMMUState *s);
 void vtd_iommu_ops_update_accel(PCIIOMMUOps *ops);
@@ -51,10 +51,10 @@ static inline bool vtd_propagate_guest_pasid(VTDAddressSpace *vtd_as,
     return true;
 }
 
-static inline void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s,
-                                                    uint16_t domain_id,
-                                                    uint32_t pasid, hwaddr addr,
-                                                    uint64_t npages, bool ih)
+static inline void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s,
+                                                   uint16_t domain_id,
+                                                   uint32_t pasid, hwaddr addr,
+                                                   uint64_t npages, bool ih)
 {
 }
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index edd2b8f0cc..3ea5b92b34 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3011,11 +3011,11 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
     info.domain_id = domain_id;
     info.pasid = pasid;
 
+    vtd_flush_host_piotlb_all_accel(s, domain_id, pasid, 0, (uint64_t)-1,
+                                     false);
     vtd_iommu_lock(s);
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid,
                                 &info);
-    vtd_flush_host_piotlb_all_locked(s, domain_id, pasid, 0, (uint64_t)-1,
-                                     false);
     vtd_iommu_unlock(s);
 
     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
@@ -3045,10 +3045,11 @@ static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
 
+    vtd_flush_host_piotlb_all_accel(s, domain_id, pasid, addr, 1 << am, ih);
+
     vtd_iommu_lock(s);
     g_hash_table_foreach_remove(s->iotlb,
                                 vtd_hash_remove_by_page_piotlb, &info);
-    vtd_flush_host_piotlb_all_locked(s, domain_id, pasid, addr, 1 << am, ih);
     vtd_iommu_unlock(s);
 
     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, pasid);
diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c
index d7c1ff6b74..acb1b1e238 100644
--- a/hw/i386/intel_iommu_accel.c
+++ b/hw/i386/intel_iommu_accel.c
@@ -231,9 +231,9 @@ static void vtd_flush_host_piotlb(VTDACCELPASIDCacheEntry *vtd_pce,
     }
 }
 
-void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t domain_id,
-                                      uint32_t pasid, hwaddr addr,
-                                      uint64_t npages, bool ih)
+void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s, uint16_t domain_id,
+                                     uint32_t pasid, hwaddr addr,
+                                     uint64_t npages, bool ih)
 {
     struct iommu_hwpt_vtd_s1_invalidate cache_info = { 0 };
     VTDPIOTLBInvInfo piotlb_info;
-- 
2.47.3
Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by CLEMENT MATHIEU--DRIF 3 weeks, 2 days ago
vtd_piotlb_page_invalidate can be called from vtd_pri_perform_implicit_invalidation without any lock held, so vtd_flush_host_piotlb_all_accel can end up being called unlocked. Is there a risk in case of concurrent hot plugging of another device?

cmd

On Thu, 2026-03-05 at 22:44 -0500, Zhenzhong Duan wrote:
> To support PASID, we switched from looping over vtd_as to looping over
> vtd_hiod. vtd_hiod represents a host passthrough device and is never
> dereferenced without the BQL held, so no extra iommu lock is needed to
> protect it.
> 
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---  
>  hw/i386/intel_iommu_accel.h | 14 +++++++-------  
>  hw/i386/intel_iommu.c       |  7 ++++---  
>  hw/i386/intel_iommu_accel.c |  6 +++---  
>  3 files changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h  
> index 1ae46d9250..3f1b1002b8 100644  
> --- a/hw/i386/intel_iommu_accel.h  
> +++ b/hw/i386/intel_iommu_accel.h  
> @@ -24,9 +24,9 @@ typedef struct VTDACCELPASIDCacheEntry {  
>  bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod,  
>                            Error **errp);  
>  VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as);  
> -void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t domain_id,  
> -                                      uint32_t pasid, hwaddr addr,  
> -                                      uint64_t npages, bool ih);  
> +void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s, uint16_t domain_id,  
> +                                     uint32_t pasid, hwaddr addr,  
> +                                     uint64_t npages, bool ih);  
>  void vtd_pasid_cache_sync_accel(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info);  
>  void vtd_pasid_cache_reset_accel(IntelIOMMUState *s);  
>  void vtd_iommu_ops_update_accel(PCIIOMMUOps *ops);  
> @@ -51,10 +51,10 @@ static inline bool vtd_propagate_guest_pasid(VTDAddressSpace *vtd_as,  
>      return true;  
>  }  
>    
> -static inline void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s,  
> -                                                    uint16_t domain_id,  
> -                                                    uint32_t pasid, hwaddr addr,  
> -                                                    uint64_t npages, bool ih)  
> +static inline void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s,  
> +                                                   uint16_t domain_id,  
> +                                                   uint32_t pasid, hwaddr addr,  
> +                                                   uint64_t npages, bool ih)  
>  {  
>  }  
>    
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c  
> index edd2b8f0cc..3ea5b92b34 100644  
> --- a/hw/i386/intel_iommu.c  
> +++ b/hw/i386/intel_iommu.c  
> @@ -3011,11 +3011,11 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,  
>      info.domain_id = domain_id;  
>      info.pasid = pasid;  
>    
> +    vtd_flush_host_piotlb_all_accel(s, domain_id, pasid, 0, (uint64_t)-1,  
> +                                     false);  
>      vtd_iommu_lock(s);  
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid,  
>                                  &info);  
> -    vtd_flush_host_piotlb_all_locked(s, domain_id, pasid, 0, (uint64_t)-1,  
> -                                     false);  
>      vtd_iommu_unlock(s);  
>    
>      QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {  
> @@ -3045,10 +3045,11 @@ static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,  
>      info.addr = addr;  
>      info.mask = ~((1 << am) - 1);  
>    
> +    vtd_flush_host_piotlb_all_accel(s, domain_id, pasid, addr, 1 << am, ih);  
> +  
>      vtd_iommu_lock(s);  
>      g_hash_table_foreach_remove(s->iotlb,  
>                                  vtd_hash_remove_by_page_piotlb, &info);  
> -    vtd_flush_host_piotlb_all_locked(s, domain_id, pasid, addr, 1 << am, ih);  
>      vtd_iommu_unlock(s);  
>    
>      vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, pasid);  
> diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c  
> index d7c1ff6b74..acb1b1e238 100644  
> --- a/hw/i386/intel_iommu_accel.c  
> +++ b/hw/i386/intel_iommu_accel.c  
> @@ -231,9 +231,9 @@ static void vtd_flush_host_piotlb(VTDACCELPASIDCacheEntry *vtd_pce,  
>      }  
>  }  
>    
> -void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t domain_id,  
> -                                      uint32_t pasid, hwaddr addr,  
> -                                      uint64_t npages, bool ih)  
> +void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s, uint16_t domain_id,  
> +                                     uint32_t pasid, hwaddr addr,  
> +                                     uint64_t npages, bool ih)  
>  {  
>      struct iommu_hwpt_vtd_s1_invalidate cache_info = { 0 };  
>      VTDPIOTLBInvInfo piotlb_info;
RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by Duan, Zhenzhong 3 weeks, 2 days ago
Hi Clement,

>-----Original Message-----
>From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>vtd_flush_host_piotlb_all_locked()
>
>vtd_piotlb_page_invalidate can be called from
>vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
>vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of concurrent hot
>plugging of another device?

Good catch. Hotplug is protected by the BQL, but if an iothread can send a PRI request,
we suffer from the race.

I think we can bypass vtd_pri_perform_implicit_invalidation() for passthrough devices,
which don't cache iotlb entries in QEMU but in the host, like:

@@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
         return -ENOSPC;
     }

-    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
+    if (!vtd_find_hiod_iommufd(vtd_as) &&
+        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
         return -EINVAL;
     }

Thanks
Zhenzhong


>
>cmd
>
>On Thu, 2026-03-05 at 22:44 -0500, Zhenzhong Duan wrote:
>> To support PASID, we switched from looping over vtd_as to looping over
>> vtd_hiod. vtd_hiod represents a host passthrough device and is never
>> dereferenced without the BQL held, so no extra iommu lock is needed to
>> protect it.
>>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>  hw/i386/intel_iommu_accel.h | 14 +++++++-------
>>  hw/i386/intel_iommu.c       |  7 ++++---
>>  hw/i386/intel_iommu_accel.c |  6 +++---
>>  3 files changed, 14 insertions(+), 13 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h
>> index 1ae46d9250..3f1b1002b8 100644
>> --- a/hw/i386/intel_iommu_accel.h
>> +++ b/hw/i386/intel_iommu_accel.h
>> @@ -24,9 +24,9 @@ typedef struct VTDACCELPASIDCacheEntry {
>>  bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice
>*vtd_hiod,
>>                            Error **errp);
>>  VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as);
>> -void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t
>domain_id,
>> -                                      uint32_t pasid, hwaddr addr,
>> -                                      uint64_t npages, bool ih);
>> +void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s, uint16_t domain_id,
>> +                                     uint32_t pasid, hwaddr addr,
>> +                                     uint64_t npages, bool ih);
>>  void vtd_pasid_cache_sync_accel(IntelIOMMUState *s, VTDPASIDCacheInfo
>*pc_info);
>>  void vtd_pasid_cache_reset_accel(IntelIOMMUState *s);
>>  void vtd_iommu_ops_update_accel(PCIIOMMUOps *ops);
>> @@ -51,10 +51,10 @@ static inline bool
>vtd_propagate_guest_pasid(VTDAddressSpace *vtd_as,
>>      return true;
>>  }
>>
>> -static inline void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s,
>> -                                                    uint16_t domain_id,
>> -                                                    uint32_t pasid, hwaddr addr,
>> -                                                    uint64_t npages, bool ih)
>> +static inline void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s,
>> +                                                   uint16_t domain_id,
>> +                                                   uint32_t pasid, hwaddr addr,
>> +                                                   uint64_t npages, bool ih)
>>  {
>>  }
>>
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index edd2b8f0cc..3ea5b92b34 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -3011,11 +3011,11 @@ static void
>vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
>>      info.domain_id = domain_id;
>>      info.pasid = pasid;
>>
>> +    vtd_flush_host_piotlb_all_accel(s, domain_id, pasid, 0, (uint64_t)-1,
>> +                                     false);
>>      vtd_iommu_lock(s);
>>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid,
>>                                  &info);
>> -    vtd_flush_host_piotlb_all_locked(s, domain_id, pasid, 0, (uint64_t)-1,
>> -                                     false);
>>      vtd_iommu_unlock(s);
>>
>>      QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
>> @@ -3045,10 +3045,11 @@ static void
>vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>>      info.addr = addr;
>>      info.mask = ~((1 << am) - 1);
>>
>> +    vtd_flush_host_piotlb_all_accel(s, domain_id, pasid, addr, 1 << am, ih);
>> +
>>      vtd_iommu_lock(s);
>>      g_hash_table_foreach_remove(s->iotlb,
>>                                  vtd_hash_remove_by_page_piotlb, &info);
>> -    vtd_flush_host_piotlb_all_locked(s, domain_id, pasid, addr, 1 << am, ih);
>>      vtd_iommu_unlock(s);
>>
>>      vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, pasid);
>> diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c
>> index d7c1ff6b74..acb1b1e238 100644
>> --- a/hw/i386/intel_iommu_accel.c
>> +++ b/hw/i386/intel_iommu_accel.c
>> @@ -231,9 +231,9 @@ static void
>vtd_flush_host_piotlb(VTDACCELPASIDCacheEntry *vtd_pce,
>>      }
>>  }
>>
>> -void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t
>domain_id,
>> -                                      uint32_t pasid, hwaddr addr,
>> -                                      uint64_t npages, bool ih)
>> +void vtd_flush_host_piotlb_all_accel(IntelIOMMUState *s, uint16_t domain_id,
>> +                                     uint32_t pasid, hwaddr addr,
>> +                                     uint64_t npages, bool ih)
>>  {
>>      struct iommu_hwpt_vtd_s1_invalidate cache_info = { 0 };
>>      VTDPIOTLBInvInfo piotlb_info;
RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by Duan, Zhenzhong 3 weeks, 1 day ago
Hi Clement,

>-----Original Message-----
>From: Duan, Zhenzhong
>Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>vtd_flush_host_piotlb_all_locked()
>
>Hi Clement,
>
>>-----Original Message-----
>>From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>>Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>>vtd_flush_host_piotlb_all_locked()
>>
>>vtd_piotlb_page_invalidate can be called from
>>vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
>>vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of concurrent hot
>>plugging of another device?
>
>Good catch, hotplug is protected by BQL, but if an iothread could send PRI request,
>we suffer from the race.
>
>I think we can bypass vtd_pri_perform_implicit_invalidation() for passthrough
>device
>which doesn't cache iotlb entry in QEMU but in host, like:
>
>@@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void
>*opaque, int devfn,
>         return -ENOSPC;
>     }
>
>-    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>+    if (!vtd_find_hiod_iommufd(vtd_as) &&
>+        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>         return -EINVAL;
>     }

After further thinking, I found the above change doesn't resolve the race you mentioned.

But I think the iothread should take the BQL before calling ::pri_request_page(),
because it is a function that changes vtd device context, e.g., the PQT and PRS registers,
the page invalidation queue and iotlb entries. We should always take the BQL before
changing the context of any device. Let me know if you have a different opinion.

Thanks
Zhenzhong
Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by CLEMENT MATHIEU--DRIF 1 week, 4 days ago
On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
> Hi Clement,
> 
> 
> > -----Original Message-----
> > From: Duan, Zhenzhong
> > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > vtd_flush_host_piotlb_all_locked()
> > 
> > Hi Clement,
> > 
> > 
> > > -----Original Message-----
> > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > vtd_flush_host_piotlb_all_locked()
> > > 
> > > vtd_piotlb_page_invalidate can be called from
> > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
> > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of concurrent hot
> > > plugging of another device?
> > 
> > 
> > Good catch, hotplug is protected by BQL, but if an iothread could send PRI request,
> > we suffer from the race.
> > 
> > I think we can bypass vtd_pri_perform_implicit_invalidation() for passthrough
> > device
> > which doesn't cache iotlb entry in QEMU but in host, like:
> > 
> > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void
> > *opaque, int devfn,
> >         return -ENOSPC;
> >     }
> > 
> > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
> > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> >         return -EINVAL;
> >     }
> 
> 
> After further thinking, I found above change doesn’t resolve the race you mentioned.
> 
> But I think the iothread should take BQL before calling ::pri_request_page(),  
> because it is a function changing vtd device context, e.g., PQT and PRS register,  
> page invalidation queue and iotlb entries. We should always take BQL before changing  
> context of any device. Let me know if you have different opinions.

Do you mean that every emulated device that triggers a PRI request should hold the BQL?
I'm not sure this is a viable pattern, as the initiator probably already holds an internal lock
and thus cannot take the BQL without risking deadlocks with concurrent MMIO :/

Otherwise, it means that the device always takes the BQL before taking its own lock.

> 
> Thanks  
> Zhenzhong
RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by Duan, Zhenzhong 1 week, 4 days ago

>-----Original Message-----
>From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>vtd_flush_host_piotlb_all_locked()
>
>
>On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
>> Hi Clement,
>>
>>
>> > -----Original Message-----
>> > From: Duan, Zhenzhong
>> > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > vtd_flush_host_piotlb_all_locked()
>> >
>> > Hi Clement,
>> >
>> >
>> > > -----Original Message-----
>> > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>> > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > > vtd_flush_host_piotlb_all_locked()
>> > >
>> > > vtd_piotlb_page_invalidate can be called from
>> > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
>> > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of
>concurrent hot
>> > > plugging of another device?
>> >
>> >
>> > Good catch, hotplug is protected by BQL, but if an iothread could send PRI
>request,
>> > we suffer from the race.
>> >
>> > I think we can bypass vtd_pri_perform_implicit_invalidation() for passthrough
>> > device
>> > which doesn't cache iotlb entry in QEMU but in host, like:
>> >
>> > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void
>> > *opaque, int devfn,
>> >         return -ENOSPC;
>> >     }
>> >
>> > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>> > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
>> > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>> >         return -EINVAL;
>> >     }
>>
>>
>> After further thinking, I found above change doesn’t resolve the race you
>mentioned.
>>
>> But I think the iothread should take BQL before calling ::pri_request_page(),
>> because it is a function changing vtd device context, e.g., PQT and PRS register,
>> page invalidation queue and iotlb entries. We should always take BQL before
>changing
>> context of any device. Let me know if you have different opinions.
>
>Do you mean that every emulated device that triggers a PRI request should hold
>the BQL?
>I'm not sure this is a viable pattern as the initiator probably already holds an
>internal lock
>and thus cannot take the BQL without risking deadlocks with concurrent MMIO :/

IIUC, QEMU doesn't support concurrent MMIO emulation yet; it is serialized by the BQL. If we don't take the BQL at the call site, then we need it in the callee, like:

@@ -5351,6 +5351,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
     IntelIOMMUState *s = opaque;
     VTDAddressSpace *vtd_as;

+    BQL_LOCK_GUARD();
+
     vtd_as = vtd_find_add_as(s, bus, devfn, pasid);

     uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);

>
>Otherwise, it means that the device always takes the BQL before taking  it's own
>lock.

I think that's true, because we always take the BQL before emulation starts.

Thanks
Zhenzhong
Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by CLEMENT MATHIEU--DRIF 1 week, 3 days ago
On Tue, 2026-03-31 at 07:18 +0000, Duan, Zhenzhong wrote:
> 
> 
> 
> > -----Original Message-----
> > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > vtd_flush_host_piotlb_all_locked()
> > 
> > 
> > On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
> > 
> > > Hi Clement,
> > > 
> > > 
> > > 
> > > > -----Original Message-----
> > > > From: Duan, Zhenzhong
> > > > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > vtd_flush_host_piotlb_all_locked()
> > > > 
> > > > Hi Clement,
> > > > 
> > > > 
> > > > 
> > > > > -----Original Message-----
> > > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > 
> > > 
> > > > 
> > > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > > vtd_flush_host_piotlb_all_locked()
> > > > > 
> > > > > vtd_piotlb_page_invalidate can be called from
> > > > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
> > > > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of
> > > > 
> > > 
> > 
> > concurrent hot
> > 
> > > 
> > > > 
> > > > > plugging of another device?
> > > > 
> > > > 
> > > > 
> > > > Good catch, hotplug is protected by BQL, but if an iothread could send PRI
> > > 
> > 
> > request,
> > 
> > > 
> > > > we suffer from the race.
> > > > 
> > > > I think we can bypass vtd_pri_perform_implicit_invalidation() for passthrough
> > > > device
> > > > which doesn't cache iotlb entry in QEMU but in host, like:
> > > > 
> > > > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void
> > > > *opaque, int devfn,
> > > >         return -ENOSPC;
> > > >     }
> > > > 
> > > > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > > > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
> > > > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > > >         return -EINVAL;
> > > >     }
> > > 
> > > 
> > > 
> > > After further thinking, I found above change doesn’t resolve the race you
> > 
> > mentioned.
> > 
> > > 
> > > But I think the iothread should take BQL before calling ::pri_request_page(),
> > > because it is a function changing vtd device context, e.g., PQT and PRS register,
> > > page invalidation queue and iotlb entries. We should always take BQL before
> > 
> > changing
> > 
> > > context of any device. Let me know if you have different opinions.
> > 
> > 
> > Do you mean that every emulated device that triggers a PRI request should hold
> > the BQL?
> > I'm not sure this is a viable pattern as the initiator probably already holds an
> > internal lock
> > and thus cannot take the BQL without risking deadlocks with concurrent MMIO :/
> 
> 
> IIUC, qemu doesn't support concurrent MMIO emulations yet, they are serialized by BQL. If we don't take BQL in call site, then we need it in callee, like:
> 
> @@ -5351,6 +5351,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,  
>      IntelIOMMUState *s = opaque;  
>      VTDAddressSpace *vtd_as;
> 
> +    BQL_LOCK_GUARD();  
> +  
>      vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
> 
>      uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
> 
> 
> 
> > Otherwise, it means that the device always takes the BQL before taking  it's own
> > lock.
> 
> 
> I think it's true because we always take BQL before emulation start.

This is pretty dangerous IMHO.

The emulated device that ends up calling this function probably already has a lock on its side; let's call it dev_lock.

If the device does:
	- lock(&dev_lock)
	- start a PRI request
	- lock(&bql)

and the host triggers an MMIO operation on the device:
	- lock(&bql)
	- lock(&dev_lock)

we can take dev_lock and bql in reverse order, leading to potential deadlocks o.O

> 
> Thanks  
> Zhenzhong
RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by Duan, Zhenzhong 1 week, 3 days ago
Hi Clement,

>-----Original Message-----
>From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>vtd_flush_host_piotlb_all_locked()
>
>
>On Tue, 2026-03-31 at 07:18 +0000, Duan, Zhenzhong wrote:
>>
>>
>>
>> > -----Original Message-----
>> > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>> > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > vtd_flush_host_piotlb_all_locked()
>> >
>> >
>> > On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
>> >
>> > > Hi Clement,
>> > >
>> > >
>> > >
>> > > > -----Original Message-----
>> > > > From: Duan, Zhenzhong
>> > > > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > > > vtd_flush_host_piotlb_all_locked()
>> > > >
>> > > > Hi Clement,
>> > > >
>> > > >
>> > > >
>> > > > > -----Original Message-----
>> > > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>> >
>> > >
>> > > >
>> > > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > > > > vtd_flush_host_piotlb_all_locked()
>> > > > >
>> > > > > vtd_piotlb_page_invalidate can be called from
>> > > > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
>> > > > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of
>> > > >
>> > >
>> >
>> > concurrent hot
>> >
>> > >
>> > > >
>> > > > > plugging of another device?
>> > > >
>> > > >
>> > > >
>> > > > Good catch, hotplug is protected by BQL, but if an iothread could send PRI
>> > >
>> >
>> > request,
>> >
>> > >
>> > > > we suffer from the race.
>> > > >
>> > > > I think we can bypass vtd_pri_perform_implicit_invalidation() for
>passthrough
>> > > > device
>> > > > which doesn't cache iotlb entry in QEMU but in host, like:
>> > > >
>> > > > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus,
>void
>> > > > *opaque, int devfn,
>> > > >         return -ENOSPC;
>> > > >     }
>> > > >
>> > > > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>> > > > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
>> > > > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>> > > >         return -EINVAL;
>> > > >     }
>> > >
>> > >
>> > >
>> > > After further thinking, I found above change doesn’t resolve the race you
>> >
>> > mentioned.
>> >
>> > >
>> > > But I think the iothread should take BQL before calling ::pri_request_page(),
>> > > because it is a function changing vtd device context, e.g., PQT and PRS
>register,
>> > > page invalidation queue and iotlb entries. We should always take BQL before
>> >
>> > changing
>> >
>> > > context of any device. Let me know if you have different opinions.
>> >
>> >
>> > Do you mean that every emulated device that triggers a PRI request should
>hold
>> > the BQL?
>> > I'm not sure this is a viable pattern as the initiator probably already holds an
>> > internal lock
>> > and thus cannot take the BQL without risking deadlocks with concurrent
>MMIO :/
>>
>>
>> IIUC, qemu doesn't support concurrent MMIO emulations yet, they are serialized
>by BQL. If we don't take BQL in call site, then we need it in callee, like:
>>
>> @@ -5351,6 +5351,8 @@ static int vtd_pri_request_page(PCIBus *bus, void
>*opaque, int devfn,
>>      IntelIOMMUState *s = opaque;
>>      VTDAddressSpace *vtd_as;
>>
>> +    BQL_LOCK_GUARD();
>> +
>>      vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
>>
>>      uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
>>
>>
>>
>> > Otherwise, it means that the device always takes the BQL before taking  it's
>own
>> > lock.
>>
>>
>> I think it's true because we always take BQL before emulation start.
>
>This is pretty dangerous IMHO.
>
>The emulated device that ends up calling this function probably has a lock on its
>side already, let's call it dev_lock.
>
>I the device does:
>	- lock(&dev_lock)
>	- start a PRI request
>	- lock(&bql)

I have two questions:

1. which thread may have the above sequence?
2. do we have to protect the PRI request with dev_lock?

PRI state is protected by BQL, so I think it doesn't need to be guarded by dev_lock. Maybe unlock dev_lock during PRI?

Thanks
Zhenzhong

>
>and if the host triggers an MMIO operation on the device:
>	- lock(&bql)
>	- lock(&dev_lock)
>
>We can take dev_lock and bql in reverse order, leading to potential deadlocks o.O
>
>>
>> Thanks
>> Zhenzhong
Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by CLEMENT MATHIEU--DRIF 1 week, 2 days ago
On Wed, 2026-04-01 at 09:23 +0000, Duan, Zhenzhong wrote:
> Hi Clement,
>
> > -----Original Message-----
> > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > vtd_flush_host_piotlb_all_locked()
> >
> > On Tue, 2026-03-31 at 07:18 +0000, Duan, Zhenzhong wrote:
> > > > -----Original Message-----
> > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > vtd_flush_host_piotlb_all_locked()
> > > >
> > > > On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
> > > > > Hi Clement,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Duan, Zhenzhong
> > > > > > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > > > vtd_flush_host_piotlb_all_locked()
> > > > > >
> > > > > > Hi Clement,
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > > > > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > > > > vtd_flush_host_piotlb_all_locked()
> > > > > > >
> > > > > > > vtd_piotlb_page_invalidate can be called from
> > > > > > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
> > > > > > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of
> > > > > > > concurrent hot plugging of another device?
> > > > > >
> > > > > > Good catch, hotplug is protected by BQL, but if an iothread could send a PRI
> > > > > > request, we suffer from the race.
> > > > > >
> > > > > > I think we can bypass vtd_pri_perform_implicit_invalidation() for a passthrough
> > > > > > device, which doesn't cache iotlb entries in QEMU but in the host, like:
> > > > > >
> > > > > > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
> > > > > >         return -ENOSPC;
> > > > > >     }
> > > > > >
> > > > > > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > > > > > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
> > > > > > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > > > > >         return -EINVAL;
> > > > > >     }
> > > > >
> > > > > After further thinking, I found the above change doesn't resolve the race you
> > > > > mentioned.
> > > > >
> > > > > But I think the iothread should take BQL before calling ::pri_request_page(),
> > > > > because it is a function changing vtd device context, e.g., PQT and PRS
> > > > > registers, page invalidation queue and iotlb entries. We should always take
> > > > > BQL before changing the context of any device. Let me know if you have
> > > > > different opinions.
> > > >
> > > > Do you mean that every emulated device that triggers a PRI request should
> > > > hold the BQL?
> > > > I'm not sure this is a viable pattern as the initiator probably already holds
> > > > an internal lock and thus cannot take the BQL without risking deadlocks with
> > > > concurrent MMIO :/
> > >
> > > IIUC, qemu doesn't support concurrent MMIO emulations yet, they are serialized
> > > by BQL. If we don't take BQL in the call site, then we need it in the callee, like:
> > >
> > > @@ -5351,6 +5351,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
> > >      IntelIOMMUState *s = opaque;
> > >      VTDAddressSpace *vtd_as;
> > >
> > > +    BQL_LOCK_GUARD();
> > > +
> > >      vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
> > >
> > >      uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
> > >
> > > > Otherwise, it means that the device always takes the BQL before taking its
> > > > own lock.
> > >
> > > I think it's true because we always take BQL before emulation starts.
> >
> > This is pretty dangerous IMHO.
> >
> > The emulated device that ends up calling this function probably has a lock on its
> > side already, let's call it dev_lock.
> >
> > If the device does:
> > 	- lock(&dev_lock)
> > 	- start a PRI request
> > 	- lock(&bql)
>
> I have two questions:
>
> 1. which thread may have the above sequence?
> 2. do we have to protect the PRI request with dev_lock?
>
> PRI state is protected by BQL, so I think it doesn't need to be guarded by dev_lock.
> Maybe unlock dev_lock during PRI?

Devices that perform svm operations instead of regular dma end up performing this kind of sequence.  
The issue is that device state is guarded by the device lock. Thus, taking the bql in the pri callback  
of the iommu requires the device to unlock its own lock before starting the PRI operation or to rely on  
the bql all the time instead of its own lock.

I'm just trying to make sure svm devices will not become impossible to implement just because they  
have to break their flow and unlock every time they trigger a memory operation.

> 
> Thanks  
> Zhenzhong
> 
> 
> 
> > and if the host triggers an MMIO operation on the device:
> > 	- lock(&bql)
> > 	- lock(&dev_lock)
> > 
> > We can take dev_lock and bql in reverse order, leading to potential deadlocks o.O
> > 
> > 
> > > 
> > > Thanks
> > > Zhenzhong
> >
> 
RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by Duan, Zhenzhong 1 week, 2 days ago

>-----Original Message-----
>From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>vtd_flush_host_piotlb_all_locked()
>
>On Wed, 2026-04-01 at 09:23 +0000, Duan, Zhenzhong wrote:
>> Hi Clement,
>>
>> > -----Original Message-----
>> > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>> > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > vtd_flush_host_piotlb_all_locked()
>> >
>> > On Tue, 2026-03-31 at 07:18 +0000, Duan, Zhenzhong wrote:
>> > > > -----Original Message-----
>> > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>> > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > > > vtd_flush_host_piotlb_all_locked()
>> > > >
>> > > > On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
>> > > > > Hi Clement,
>> > > > >
>> > > > > > -----Original Message-----
>> > > > > > From: Duan, Zhenzhong
>> > > > > > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > > > > > vtd_flush_host_piotlb_all_locked()
>> > > > > >
>> > > > > > Hi Clement,
>> > > > > >
>> > > > > > > -----Original Message-----
>> > > > > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
>> > > > > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
>> > > > > > > vtd_flush_host_piotlb_all_locked()
>> > > > > > >
>> > > > > > > vtd_piotlb_page_invalidate can be called from
>> > > > > > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
>> > > > > > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of
>> > > > > > > concurrent hot plugging of another device?
>> > > > > >
>> > > > > > Good catch, hotplug is protected by BQL, but if an iothread could send a PRI
>> > > > > > request, we suffer from the race.
>> > > > > >
>> > > > > > I think we can bypass vtd_pri_perform_implicit_invalidation() for a passthrough
>> > > > > > device, which doesn't cache iotlb entries in QEMU but in the host, like:
>> > > > > >
>> > > > > > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
>> > > > > >         return -ENOSPC;
>> > > > > >     }
>> > > > > >
>> > > > > > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>> > > > > > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
>> > > > > > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
>> > > > > >         return -EINVAL;
>> > > > > >     }
>> > > > >
>> > > > > After further thinking, I found the above change doesn't resolve the race you
>> > > > > mentioned.
>> > > > >
>> > > > > But I think the iothread should take BQL before calling ::pri_request_page(),
>> > > > > because it is a function changing vtd device context, e.g., PQT and PRS
>> > > > > registers, page invalidation queue and iotlb entries. We should always take
>> > > > > BQL before changing the context of any device. Let me know if you have
>> > > > > different opinions.
>> > > >
>> > > > Do you mean that every emulated device that triggers a PRI request should
>> > > > hold the BQL?
>> > > > I'm not sure this is a viable pattern as the initiator probably already holds
>> > > > an internal lock and thus cannot take the BQL without risking deadlocks with
>> > > > concurrent MMIO :/
>> > >
>> > > IIUC, qemu doesn't support concurrent MMIO emulations yet, they are serialized
>> > > by BQL. If we don't take BQL in the call site, then we need it in the callee, like:
>> > >
>> > > @@ -5351,6 +5351,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
>> > >      IntelIOMMUState *s = opaque;
>> > >      VTDAddressSpace *vtd_as;
>> > >
>> > > +    BQL_LOCK_GUARD();
>> > > +
>> > >      vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
>> > >
>> > >      uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
>> > >
>> > > > Otherwise, it means that the device always takes the BQL before taking its
>> > > > own lock.
>> > >
>> > > I think it's true because we always take BQL before emulation starts.
>> >
>> > This is pretty dangerous IMHO.
>> >
>> > The emulated device that ends up calling this function probably has a lock on its
>> > side already, let's call it dev_lock.
>> >
>> > If the device does:
>> > 	- lock(&dev_lock)
>> > 	- start a PRI request
>> > 	- lock(&bql)
>>
>> I have two questions:
>>
>> 1. which thread may have the above sequence?
>> 2. do we have to protect the PRI request with dev_lock?
>>
>> PRI state is protected by BQL, so I think it doesn't need to be guarded by dev_lock.
>> Maybe unlock dev_lock during PRI?
>
>Devices that perform svm operations instead of regular dma end up performing this
>kind of sequence.
>The issue is that device state is guarded by the device lock. Thus, taking the bql in
>the pri callback of the iommu requires the device to unlock its own lock before
>starting the PRI operation or to rely on the bql all the time instead of its own lock.

Yes, this is a bit tricky.

>
>I'm just trying to make sure svm devices will not become impossible to implement
>just because they
>have to break their flow and unlock every time they trigger a memory operation.

OK, if we don't use BQL, we need another lock to protect PRQ-related device state,
or maybe handle the PRI sending in a bottom half so we don't need the lock.

Anyway, it's not an issue until we have an emulated device supporting PRI.

For the race with device hotplug, I can drop this patch, then the loop on s->vtd_host_iommu_dev
through the PRI path is still protected by vtd_iommu_lock(). Does that make sense to you?

Thanks
Zhenzhong
Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in vtd_flush_host_piotlb_all_locked()
Posted by CLEMENT MATHIEU--DRIF 1 week, 2 days ago
On Thu, 2026-04-02 at 09:30 +0000, Duan, Zhenzhong wrote:
> > -----Original Message-----
> > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > vtd_flush_host_piotlb_all_locked()
> >
> > On Wed, 2026-04-01 at 09:23 +0000, Duan, Zhenzhong wrote:
> > > Hi Clement,
> > >
> > > > -----Original Message-----
> > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > vtd_flush_host_piotlb_all_locked()
> > > >
> > > > On Tue, 2026-03-31 at 07:18 +0000, Duan, Zhenzhong wrote:
> > > > > > -----Original Message-----
> > > > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > > > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > > > vtd_flush_host_piotlb_all_locked()
> > > > > >
> > > > > > On Fri, 2026-03-20 at 04:04 +0000, Duan, Zhenzhong wrote:
> > > > > > > Hi Clement,
> > > > > > >
> > > > > > > > -----Original Message-----
> > > > > > > > From: Duan, Zhenzhong
> > > > > > > > Subject: RE: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > > > > > vtd_flush_host_piotlb_all_locked()
> > > > > > > >
> > > > > > > > Hi Clement,
> > > > > > > >
> > > > > > > > > -----Original Message-----
> > > > > > > > > From: CLEMENT MATHIEU--DRIF <clement.mathieu--drif@bull.com>
> > > > > > > > > Subject: Re: [PATCH v1 11/13] intel_iommu_accel: drop _lock suffix in
> > > > > > > > > vtd_flush_host_piotlb_all_locked()
> > > > > > > > >
> > > > > > > > > vtd_piotlb_page_invalidate can be called from
> > > > > > > > > vtd_pri_perform_implicit_invalidation without any lock. Thus we can call
> > > > > > > > > vtd_flush_host_piotlb_all_accel unlocked. Is there a risk in case of
> > > > > > > > > concurrent hot plugging of another device?
> > > > > > > >
> > > > > > > > Good catch, hotplug is protected by BQL, but if an iothread could send a PRI
> > > > > > > > request, we suffer from the race.
> > > > > > > >
> > > > > > > > I think we can bypass vtd_pri_perform_implicit_invalidation() for a passthrough
> > > > > > > > device, which doesn't cache iotlb entries in QEMU but in the host, like:
> > > > > > > >
> > > > > > > > @@ -5401,7 +5401,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
> > > > > > > >         return -ENOSPC;
> > > > > > > >     }
> > > > > > > >
> > > > > > > > -    if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > > > > > > > +    if (!vtd_find_hiod_iommufd(vtd_as) &&
> > > > > > > > +        vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
> > > > > > > >         return -EINVAL;
> > > > > > > >     }
> > > > > > >
> > > > > > > After further thinking, I found the above change doesn't resolve the race you
> > > > > > > mentioned.
> > > > > > >
> > > > > > > But I think the iothread should take BQL before calling ::pri_request_page(),
> > > > > > > because it is a function changing vtd device context, e.g., PQT and PRS
> > > > > > > registers, page invalidation queue and iotlb entries. We should always take
> > > > > > > BQL before changing the context of any device. Let me know if you have
> > > > > > > different opinions.
> > > > > >
> > > > > > Do you mean that every emulated device that triggers a PRI request should
> > > > > > hold the BQL?
> > > > > > I'm not sure this is a viable pattern as the initiator probably already holds
> > > > > > an internal lock and thus cannot take the BQL without risking deadlocks with
> > > > > > concurrent MMIO :/
> > > > >
> > > > > IIUC, qemu doesn't support concurrent MMIO emulations yet, they are serialized
> > > > > by BQL. If we don't take BQL in the call site, then we need it in the callee, like:
> > > > >
> > > > > @@ -5351,6 +5351,8 @@ static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
> > > > >      IntelIOMMUState *s = opaque;
> > > > >      VTDAddressSpace *vtd_as;
> > > > >
> > > > > +    BQL_LOCK_GUARD();
> > > > > +
> > > > >      vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
> > > > >
> > > > >      uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
> > > > >
> > > > > > Otherwise, it means that the device always takes the BQL before taking its
> > > > > > own lock.
> > > > >
> > > > > I think it's true because we always take BQL before emulation starts.
> > > >
> > > > This is pretty dangerous IMHO.
> > > >
> > > > The emulated device that ends up calling this function probably has a lock on its
> > > > side already, let's call it dev_lock.
> > > >
> > > > If the device does:
> > > > 	- lock(&dev_lock)
> > > > 	- start a PRI request
> > > > 	- lock(&bql)
> > >
> > > I have two questions:
> > >
> > > 1. which thread may have the above sequence?
> > > 2. do we have to protect the PRI request with dev_lock?
> > >
> > > PRI state is protected by BQL, so I think it doesn't need to be guarded by dev_lock.
> > > Maybe unlock dev_lock during PRI?
> >
> > Devices that perform svm operations instead of regular dma end up performing this
> > kind of sequence.
> > The issue is that device state is guarded by the device lock. Thus, taking the bql in
> > the pri callback of the iommu requires the device to unlock its own lock before
> > starting the PRI operation or to rely on the bql all the time instead of its own lock.
>
> Yes, this is a bit tricky.
>
> > I'm just trying to make sure svm devices will not become impossible to implement
> > just because they have to break their flow and unlock every time they trigger a
> > memory operation.
>
> OK, if we don't use BQL, we need another lock to protect PRQ-related device state,
> or maybe handle the PRI sending in a bottom half so we don't need the lock.
>
> Anyway, it's not an issue until we have an emulated device supporting PRI.
>
> For the race with device hotplug, I can drop this patch, then the loop on
> s->vtd_host_iommu_dev through the PRI path is still protected by vtd_iommu_lock().
> Does that make sense to you?

Yep, lgtm, thanks ;)

> 
> Thanks  
> Zhenzhong