[PATCH v6 09/22] intel_iommu: Stick to system MR for IOMMUFD backed host device when x-fls=on

Zhenzhong Duan posted 22 patches 1 month, 3 weeks ago
Maintainers: Yi Liu <yi.l.liu@intel.com>, Eric Auger <eric.auger@redhat.com>, Zhenzhong Duan <zhenzhong.duan@intel.com>, "Michael S. Tsirkin" <mst@redhat.com>, Jason Wang <jasowang@redhat.com>, "Clément Mathieu--Drif" <clement.mathieu--drif@eviden.com>, Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <richard.henderson@linaro.org>, Eduardo Habkost <eduardo@habkost.net>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, Fabiano Rosas <farosas@suse.de>, Laurent Vivier <lvivier@redhat.com>
There is a newer version of this series
[PATCH v6 09/22] intel_iommu: Stick to system MR for IOMMUFD backed host device when x-fls=on
Posted by Zhenzhong Duan 1 month, 3 weeks ago
When the guest enables scalable mode and sets up a first stage page table,
we don't want to use the IOMMU MR but rather continue using the system MR
for an IOMMUFD backed host device.

Then the default HWPT in VFIO contains GPA->HPA mappings, which can be
reused as the nesting parent HWPT to construct a nested HWPT in the vIOMMU.

Suggested-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ba40649c85..bd80de1670 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -40,6 +40,7 @@
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
 #include "trace.h"
+#include "system/iommufd.h"
 
 /* context entry operations */
 #define RID_PASID    0
@@ -1702,6 +1703,24 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
 
 }
 
+static VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
+{
+    IntelIOMMUState *s = as->iommu_state;
+    struct vtd_as_key key = {
+        .bus = as->bus,
+        .devfn = as->devfn,
+    };
+    VTDHostIOMMUDevice *vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev,
+                                                       &key);
+
+    if (vtd_hiod && vtd_hiod->hiod &&
+        object_dynamic_cast(OBJECT(vtd_hiod->hiod),
+                            TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
+        return vtd_hiod;
+    }
+    return NULL;
+}
+
 static bool vtd_as_pt_enabled(VTDAddressSpace *as)
 {
     IntelIOMMUState *s;
@@ -1710,6 +1729,7 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
     assert(as);
 
     s = as->iommu_state;
+
     if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn,
                                  &ce)) {
         /*
@@ -1727,12 +1747,25 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
 /* Return whether the device is using IOMMU translation. */
 static bool vtd_switch_address_space(VTDAddressSpace *as)
 {
+    IntelIOMMUState *s;
     bool use_iommu, pt;
 
     assert(as);
 
-    use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as);
-    pt = as->iommu_state->dmar_enabled && vtd_as_pt_enabled(as);
+    s = as->iommu_state;
+    use_iommu = s->dmar_enabled && !vtd_as_pt_enabled(as);
+    pt = s->dmar_enabled && vtd_as_pt_enabled(as);
+
+    /*
+     * When guest enables scalable mode and sets up first stage page table,
+     * we stick to system MR for IOMMUFD backed host device. Then its
+     * default hwpt contains GPA->HPA mappings which are used directly
+     * if PGTT=PT and used as nesting parent if PGTT=FST. Otherwise
+     * fall back to original processing.
+     */
+    if (s->root_scalable && s->fsts && vtd_find_hiod_iommufd(as)) {
+        use_iommu = false;
+    }
 
     trace_vtd_switch_address_space(pci_bus_num(as->bus),
                                    VTD_PCI_SLOT(as->devfn),
-- 
2.47.1
Re: [PATCH v6 09/22] intel_iommu: Stick to system MR for IOMMUFD backed host device when x-fls=on
Posted by Yi Liu 1 month ago
On 2025/9/18 16:57, Zhenzhong Duan wrote:
> When guest enables scalable mode and setup first stage page table, we don't
> want to use IOMMU MR but rather continue using the system MR for IOMMUFD
> backed host device.
> 
> Then default HWPT in VFIO contains GPA->HPA mappings which could be reused
> as nesting parent HWPT to construct nested HWPT in vIOMMU.
> 
> Suggested-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>   hw/i386/intel_iommu.c | 37 +++++++++++++++++++++++++++++++++++--
>   1 file changed, 35 insertions(+), 2 deletions(-)

Reviewed-by: Yi Liu <yi.l.liu@intel.com>

> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index ba40649c85..bd80de1670 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -40,6 +40,7 @@
>   #include "kvm/kvm_i386.h"
>   #include "migration/vmstate.h"
>   #include "trace.h"
> +#include "system/iommufd.h"
>   
>   /* context entry operations */
>   #define RID_PASID    0
> @@ -1702,6 +1703,24 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
>   
>   }
>   
> +static VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
> +{
> +    IntelIOMMUState *s = as->iommu_state;
> +    struct vtd_as_key key = {
> +        .bus = as->bus,
> +        .devfn = as->devfn,
> +    };
> +    VTDHostIOMMUDevice *vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev,
> +                                                       &key);
> +
> +    if (vtd_hiod && vtd_hiod->hiod &&
> +        object_dynamic_cast(OBJECT(vtd_hiod->hiod),
> +                            TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
> +        return vtd_hiod;
> +    }
> +    return NULL;
> +}
> +
>   static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>   {
>       IntelIOMMUState *s;
> @@ -1710,6 +1729,7 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>       assert(as);
>   
>       s = as->iommu_state;
> +
>       if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn,
>                                    &ce)) {
>           /*
> @@ -1727,12 +1747,25 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>   /* Return whether the device is using IOMMU translation. */
>   static bool vtd_switch_address_space(VTDAddressSpace *as)
>   {
> +    IntelIOMMUState *s;
>       bool use_iommu, pt;
>   
>       assert(as);
>   
> -    use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as);
> -    pt = as->iommu_state->dmar_enabled && vtd_as_pt_enabled(as);
> +    s = as->iommu_state;
> +    use_iommu = s->dmar_enabled && !vtd_as_pt_enabled(as);
> +    pt = s->dmar_enabled && vtd_as_pt_enabled(as);
> +
> +    /*
> +     * When guest enables scalable mode and setup first stage page table,
> +     * we stick to system MR for IOMMUFD backed host device. Then its
> +     * default hwpt contains GPA->HPA mappings which is used directly
> +     * if PGTT=PT and used as nesting parent if PGTT=FST. Otherwise
> +     * fallback to original processing.
> +     */
> +    if (s->root_scalable && s->fsts && vtd_find_hiod_iommufd(as)) {
> +        use_iommu = false;
> +    }
>   
>       trace_vtd_switch_address_space(pci_bus_num(as->bus),
>                                      VTD_PCI_SLOT(as->devfn),
Re: [PATCH v6 09/22] intel_iommu: Stick to system MR for IOMMUFD backed host device when x-fls=on
Posted by Eric Auger 1 month, 2 weeks ago
Hi Zhenzhong,

On 9/18/25 10:57 AM, Zhenzhong Duan wrote:
> When guest enables scalable mode and setup first stage page table, we don't
> want to use IOMMU MR but rather continue using the system MR for IOMMUFD
> backed host device.
>
> Then default HWPT in VFIO contains GPA->HPA mappings which could be reused
> as nesting parent HWPT to construct nested HWPT in vIOMMU.
>
> Suggested-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>  hw/i386/intel_iommu.c | 37 +++++++++++++++++++++++++++++++++++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index ba40649c85..bd80de1670 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -40,6 +40,7 @@
>  #include "kvm/kvm_i386.h"
>  #include "migration/vmstate.h"
>  #include "trace.h"
> +#include "system/iommufd.h"
>  
>  /* context entry operations */
>  #define RID_PASID    0
> @@ -1702,6 +1703,24 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
>  
>  }
>  
> +static VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace *as)
> +{
> +    IntelIOMMUState *s = as->iommu_state;
> +    struct vtd_as_key key = {
> +        .bus = as->bus,
> +        .devfn = as->devfn,
> +    };
> +    VTDHostIOMMUDevice *vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev,
> +                                                       &key);
> +
> +    if (vtd_hiod && vtd_hiod->hiod &&
> +        object_dynamic_cast(OBJECT(vtd_hiod->hiod),
> +                            TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
> +        return vtd_hiod;
> +    }
> +    return NULL;
> +}
> +
>  static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>  {
>      IntelIOMMUState *s;
> @@ -1710,6 +1729,7 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>      assert(as);
>  
>      s = as->iommu_state;
> +
not needed
>      if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn,
>                                   &ce)) {
>          /*
> @@ -1727,12 +1747,25 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>  /* Return whether the device is using IOMMU translation. */
>  static bool vtd_switch_address_space(VTDAddressSpace *as)
>  {
> +    IntelIOMMUState *s;
>      bool use_iommu, pt;
>  
>      assert(as);
>  
> -    use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as);
> -    pt = as->iommu_state->dmar_enabled && vtd_as_pt_enabled(as);
> +    s = as->iommu_state;
nit: init could be done at declaration
> +    use_iommu = s->dmar_enabled && !vtd_as_pt_enabled(as);
> +    pt = s->dmar_enabled && vtd_as_pt_enabled(as);
> +
> +    /*
> +     * When guest enables scalable mode and setup first stage page table,
sets up?
> +     * we stick to system MR for IOMMUFD backed host device. Then its
> +     * default hwpt contains GPA->HPA mappings which is used directly
> +     * if PGTT=PT and used as nesting parent if PGTT=FST. Otherwise
> +     * fallback to original processing.
fall back?
> +     */
> +    if (s->root_scalable && s->fsts && vtd_find_hiod_iommufd(as)) {
> +        use_iommu = false;
> +    }
>  
>      trace_vtd_switch_address_space(pci_bus_num(as->bus),
>                                     VTD_PCI_SLOT(as->devfn),
Besides
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Eric
RE: [PATCH v6 09/22] intel_iommu: Stick to system MR for IOMMUFD backed host device when x-fls=on
Posted by Duan, Zhenzhong 1 month ago
Hi Eric,

>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Subject: Re: [PATCH v6 09/22] intel_iommu: Stick to system MR for
>IOMMUFD backed host device when x-fls=on
>
>Hi Zhenzhong,
>
>On 9/18/25 10:57 AM, Zhenzhong Duan wrote:
>> When guest enables scalable mode and setup first stage page table, we
>don't
>> want to use IOMMU MR but rather continue using the system MR for
>IOMMUFD
>> backed host device.
>>
>> Then default HWPT in VFIO contains GPA->HPA mappings which could be
>reused
>> as nesting parent HWPT to construct nested HWPT in vIOMMU.
>>
>> Suggested-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>  hw/i386/intel_iommu.c | 37 +++++++++++++++++++++++++++++++++++--
>>  1 file changed, 35 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index ba40649c85..bd80de1670 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -40,6 +40,7 @@
>>  #include "kvm/kvm_i386.h"
>>  #include "migration/vmstate.h"
>>  #include "trace.h"
>> +#include "system/iommufd.h"
>>
>>  /* context entry operations */
>>  #define RID_PASID    0
>> @@ -1702,6 +1703,24 @@ static bool
>vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
>>
>>  }
>>
>> +static VTDHostIOMMUDevice *vtd_find_hiod_iommufd(VTDAddressSpace
>*as)
>> +{
>> +    IntelIOMMUState *s = as->iommu_state;
>> +    struct vtd_as_key key = {
>> +        .bus = as->bus,
>> +        .devfn = as->devfn,
>> +    };
>> +    VTDHostIOMMUDevice *vtd_hiod =
>g_hash_table_lookup(s->vtd_host_iommu_dev,
>> +                                                       &key);
>> +
>> +    if (vtd_hiod && vtd_hiod->hiod &&
>> +        object_dynamic_cast(OBJECT(vtd_hiod->hiod),
>> +
>TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
>> +        return vtd_hiod;
>> +    }
>> +    return NULL;
>> +}
>> +
>>  static bool vtd_as_pt_enabled(VTDAddressSpace *as)
>>  {
>>      IntelIOMMUState *s;
>> @@ -1710,6 +1729,7 @@ static bool vtd_as_pt_enabled(VTDAddressSpace
>*as)
>>      assert(as);
>>
>>      s = as->iommu_state;
>> +
>not needed

Will do.

>>      if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn,
>>                                   &ce)) {
>>          /*
>> @@ -1727,12 +1747,25 @@ static bool
>vtd_as_pt_enabled(VTDAddressSpace *as)
>>  /* Return whether the device is using IOMMU translation. */
>>  static bool vtd_switch_address_space(VTDAddressSpace *as)
>>  {
>> +    IntelIOMMUState *s;
>>      bool use_iommu, pt;
>>
>>      assert(as);
>>
>> -    use_iommu = as->iommu_state->dmar_enabled
>&& !vtd_as_pt_enabled(as);
>> -    pt = as->iommu_state->dmar_enabled && vtd_as_pt_enabled(as);
>> +    s = as->iommu_state;
>nit: init could be done at declaration

Not exactly, it must be after "assert(as);"

>> +    use_iommu = s->dmar_enabled && !vtd_as_pt_enabled(as);
>> +    pt = s->dmar_enabled && vtd_as_pt_enabled(as);
>> +
>> +    /*
>> +     * When guest enables scalable mode and setup first stage page
>table,
>sets up?

Sure

>> +     * we stick to system MR for IOMMUFD backed host device. Then its
>> +     * default hwpt contains GPA->HPA mappings which is used directly
>> +     * if PGTT=PT and used as nesting parent if PGTT=FST. Otherwise
>> +     * fallback to original processing.
>fall back?

Sure

Thanks
Zhenzhong

>> +     */
>> +    if (s->root_scalable && s->fsts && vtd_find_hiod_iommufd(as)) {
>> +        use_iommu = false;
>> +    }
>>
>>      trace_vtd_switch_address_space(pci_bus_num(as->bus),
>>                                     VTD_PCI_SLOT(as->devfn),
>Besides
>Reviewed-by: Eric Auger <eric.auger@redhat.com>
>Eric
>