[RFC PATCH v2 12/20] hw/arm/smmuv3-accel: Return sysmem if stage-1 is bypassed

Shameer Kolothum via posted 20 patches 3 weeks, 4 days ago
[RFC PATCH v2 12/20] hw/arm/smmuv3-accel: Return sysmem if stage-1 is bypassed
Posted by Shameer Kolothum via 3 weeks, 4 days ago
From: Nicolin Chen <nicolinc@nvidia.com>

When nested translation is enabled, there are 2-stage translation
occuring to two different address spaces: stage-1 in the iommu as,
while stage-2 in the system as.

If a device attached to the vSMMU doesn't enable stage-1 translation,
e.g. vSTE sets to Config=Bypass, the system as should be returned,
so QEMU can set up system memory mappings onto the stage-2 page table.
This is crucial for an iommufd enabled VFIO device as the VFIO core
code would register an iommu notifier and replay the address space
which should be bypassed for this nested translation case.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 hw/arm/smmuv3-accel.c         | 22 +++++++++++++++++++++-
 include/hw/arm/smmuv3-accel.h |  3 +++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 056bd23b2e..76134d106a 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -18,6 +18,7 @@
 static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *s, SMMUPciBus *sbus,
                                                 PCIBus *bus, int devfn)
 {
+    SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(s);
     SMMUDevice *sdev = sbus->pbdev[devfn];
     SMMUv3AccelDevice *accel_dev;
 
@@ -29,6 +30,8 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *s, SMMUPciBus *sbus,
 
         sbus->pbdev[devfn] = sdev;
         smmu_init_sdev(s, sdev, bus, devfn);
+        address_space_init(&accel_dev->as_sysmem, &s_accel->root,
+                           "smmuv3-accel-sysmem");
     }
 
     return accel_dev;
@@ -351,12 +354,23 @@ static AddressSpace *smmuv3_accel_find_add_as(PCIBus *bus, void *opaque,
     SMMUPciBus *sbus;
     SMMUv3AccelDevice *accel_dev;
     SMMUDevice *sdev;
+    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+    bool has_iommufd = false;
+
+    if (pdev) {
+        has_iommufd = object_property_find(OBJECT(pdev), "iommufd");
+    }
 
     sbus = smmu_get_sbus(s, bus);
     accel_dev = smmuv3_accel_get_dev(s, sbus, bus, devfn);
     sdev = &accel_dev->sdev;
 
-    return &sdev->as;
+    /* Return the system as if the device uses stage-2 only */
+    if (has_iommufd && !accel_dev->s1_hwpt) {
+        return &accel_dev->as_sysmem;
+    } else {
+        return &sdev->as;
+    }
 }
 
 static int smmuv3_accel_pxb_pcie_bus(Object *obj, void *opaque)
@@ -390,6 +404,12 @@ static void smmu_accel_realize(DeviceState *d, Error **errp)
         error_propagate(errp, local_err);
         return;
     }
+
+    memory_region_init(&s_accel->root, OBJECT(s_accel), "root", UINT64_MAX);
+    memory_region_init_alias(&s_accel->sysmem, OBJECT(s_accel),
+                             "smmuv3-accel-sysmem", get_system_memory(), 0,
+                             memory_region_size(get_system_memory()));
+    memory_region_add_subregion(&s_accel->root, 0, &s_accel->sysmem);
     bs->get_address_space = smmuv3_accel_find_add_as;
     bs->set_iommu_device = smmuv3_accel_set_iommu_device;
     bs->unset_iommu_device = smmuv3_accel_unset_iommu_device;
diff --git a/include/hw/arm/smmuv3-accel.h b/include/hw/arm/smmuv3-accel.h
index 54b217ab4f..58e68534c0 100644
--- a/include/hw/arm/smmuv3-accel.h
+++ b/include/hw/arm/smmuv3-accel.h
@@ -51,12 +51,15 @@ typedef struct SMMUv3AccelDevice {
     SMMUS1Hwpt  *s1_hwpt;
     SMMUViommu *viommu;
     SMMUVdev   *vdev;
+    AddressSpace as_sysmem;
     QLIST_ENTRY(SMMUv3AccelDevice) next;
 } SMMUv3AccelDevice;
 
 struct SMMUv3AccelState {
     SMMUv3State smmuv3_state;
     SMMUViommu *viommu;
+    MemoryRegion root;
+    MemoryRegion sysmem;
 };
 
 struct SMMUv3AccelClass {
-- 
2.34.1
Re: [RFC PATCH v2 12/20] hw/arm/smmuv3-accel: Return sysmem if stage-1 is bypassed
Posted by Eric Auger 1 week, 4 days ago

On 3/11/25 3:10 PM, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> When nested translation is enabled, there are 2-stage translation
> occuring to two different address spaces: stage-1 in the iommu as,
> while stage-2 in the system as.
>
> If a device attached to the vSMMU doesn't enable stage-1 translation,
> e.g. vSTE sets to Config=Bypass, the system as should be returned,
> so QEMU can set up system memory mappings onto the stage-2 page table.
> This is crucial for an iommufd enabled VFIO device as the VFIO core
> code would register an iommu notifier and replay the address space
> which should be bypassed for this nested translation case.

I would suggest to get inspired of 90519b90539 ("virtio-iommu: Add
bypass mode support to assigned device") or similar patch on vtd
(558e0024a428 intel_iommu: allow dynamic switch of IOMMU region +
4b519ef1de9a  intel-iommu: optimize nodmar memory regions), ie. use the
same terminology and techniques/objects (switch_address_space) .
To me this is not related to HW acceleration but rather to VFIO in general.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  hw/arm/smmuv3-accel.c         | 22 +++++++++++++++++++++-
>  include/hw/arm/smmuv3-accel.h |  3 +++
>  2 files changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index 056bd23b2e..76134d106a 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -18,6 +18,7 @@
>  static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *s, SMMUPciBus *sbus,
>                                                  PCIBus *bus, int devfn)
>  {
> +    SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(s);
>      SMMUDevice *sdev = sbus->pbdev[devfn];
>      SMMUv3AccelDevice *accel_dev;
>  
> @@ -29,6 +30,8 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *s, SMMUPciBus *sbus,
>  
>          sbus->pbdev[devfn] = sdev;
>          smmu_init_sdev(s, sdev, bus, devfn);
> +        address_space_init(&accel_dev->as_sysmem, &s_accel->root,
> +                           "smmuv3-accel-sysmem");
>      }
>  
>      return accel_dev;
> @@ -351,12 +354,23 @@ static AddressSpace *smmuv3_accel_find_add_as(PCIBus *bus, void *opaque,
>      SMMUPciBus *sbus;
>      SMMUv3AccelDevice *accel_dev;
>      SMMUDevice *sdev;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    bool has_iommufd = false;
> +
> +    if (pdev) {
> +        has_iommufd = object_property_find(OBJECT(pdev), "iommufd");
> +    }
Refering to the discussion we had on how to set MSI RESV regions at
virt-acpi-build level depending on whether the SMMU was accelerated I
think we can use exactly the above method (checking accel property)
>  
>      sbus = smmu_get_sbus(s, bus);
>      accel_dev = smmuv3_accel_get_dev(s, sbus, bus, devfn);
>      sdev = &accel_dev->sdev;
>  
> -    return &sdev->as;
> +    /* Return the system as if the device uses stage-2 only */
> +    if (has_iommufd && !accel_dev->s1_hwpt) {
> +        return &accel_dev->as_sysmem;
> +    } else {
> +        return &sdev->as;
> +    }
>  }
>  
>  static int smmuv3_accel_pxb_pcie_bus(Object *obj, void *opaque)
> @@ -390,6 +404,12 @@ static void smmu_accel_realize(DeviceState *d, Error **errp)
>          error_propagate(errp, local_err);
>          return;
>      }
> +
> +    memory_region_init(&s_accel->root, OBJECT(s_accel), "root", UINT64_MAX);
> +    memory_region_init_alias(&s_accel->sysmem, OBJECT(s_accel),
> +                             "smmuv3-accel-sysmem", get_system_memory(), 0,
> +                             memory_region_size(get_system_memory()));
> +    memory_region_add_subregion(&s_accel->root, 0, &s_accel->sysmem);
>      bs->get_address_space = smmuv3_accel_find_add_as;
>      bs->set_iommu_device = smmuv3_accel_set_iommu_device;
>      bs->unset_iommu_device = smmuv3_accel_unset_iommu_device;
> diff --git a/include/hw/arm/smmuv3-accel.h b/include/hw/arm/smmuv3-accel.h
> index 54b217ab4f..58e68534c0 100644
> --- a/include/hw/arm/smmuv3-accel.h
> +++ b/include/hw/arm/smmuv3-accel.h
> @@ -51,12 +51,15 @@ typedef struct SMMUv3AccelDevice {
>      SMMUS1Hwpt  *s1_hwpt;
>      SMMUViommu *viommu;
>      SMMUVdev   *vdev;
> +    AddressSpace as_sysmem;
>      QLIST_ENTRY(SMMUv3AccelDevice) next;
>  } SMMUv3AccelDevice;
>  
>  struct SMMUv3AccelState {
>      SMMUv3State smmuv3_state;
>      SMMUViommu *viommu;
> +    MemoryRegion root;
> +    MemoryRegion sysmem;
>  };
>  
>  struct SMMUv3AccelClass {