[PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback

Shameer Kolothum posted 27 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Shameer Kolothum 1 month, 2 weeks ago
From: Nicolin Chen <nicolinc@nvidia.com>

Implement a set_iommu_device callback:
 -If found an existing viommu reuse that.
 -Else,
    Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
    Though, iommufd’s vIOMMU model supports nested translation by
    encapsulating a S2 nesting parent HWPT, devices cannot attach to this
    parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
    allocated to handle device attachments.
 -And add the dev to viommu device list

Also add an unset_iommu_device to unwind/cleanup above.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
 hw/arm/smmuv3-accel.c   | 150 ++++++++++++++++++++++++++++++++++++++++
 hw/arm/smmuv3-accel.h   |  17 +++++
 hw/arm/trace-events     |   4 ++
 include/hw/arm/smmuv3.h |   1 +
 4 files changed, 172 insertions(+)

diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index 6b0e512d86..81fa738f6f 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -8,6 +8,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
+#include "trace.h"
 
 #include "hw/arm/smmuv3.h"
 #include "hw/iommu.h"
@@ -17,6 +18,9 @@
 
 #include "smmuv3-accel.h"
 
+#define SMMU_STE_VALID      (1ULL << 0)
+#define SMMU_STE_CFG_BYPASS (1ULL << 3)
+
 static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
                                                PCIBus *bus, int devfn)
 {
@@ -35,6 +39,149 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
     return accel_dev;
 }
 
+static bool
+smmuv3_accel_dev_alloc_viommu(SMMUv3AccelDevice *accel_dev,
+                              HostIOMMUDeviceIOMMUFD *idev, Error **errp)
+{
+    struct iommu_hwpt_arm_smmuv3 bypass_data = {
+        .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
+    };
+    struct iommu_hwpt_arm_smmuv3 abort_data = {
+        .ste = { SMMU_STE_VALID, 0x0ULL },
+    };
+    SMMUDevice *sdev = &accel_dev->sdev;
+    SMMUState *bs = sdev->smmu;
+    SMMUv3State *s = ARM_SMMUV3(bs);
+    SMMUv3AccelState *s_accel = s->s_accel;
+    uint32_t s2_hwpt_id = idev->hwpt_id;
+    SMMUViommu *viommu;
+    uint32_t viommu_id;
+
+    if (s_accel->viommu) {
+        accel_dev->viommu = s_accel->viommu;
+        return true;
+    }
+
+    if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
+                                      IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
+                                      s2_hwpt_id, &viommu_id, errp)) {
+        return false;
+    }
+
+    viommu = g_new0(SMMUViommu, 1);
+    viommu->core.viommu_id = viommu_id;
+    viommu->core.s2_hwpt_id = s2_hwpt_id;
+    viommu->core.iommufd = idev->iommufd;
+
+    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
+                                    viommu->core.viommu_id, 0,
+                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
+                                    sizeof(abort_data), &abort_data,
+                                    &viommu->abort_hwpt_id, errp)) {
+        goto free_viommu;
+    }
+
+    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
+                                    viommu->core.viommu_id, 0,
+                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
+                                    sizeof(bypass_data), &bypass_data,
+                                    &viommu->bypass_hwpt_id, errp)) {
+        goto free_abort_hwpt;
+    }
+
+    viommu->iommufd = idev->iommufd;
+
+    s_accel->viommu = viommu;
+    accel_dev->viommu = viommu;
+    return true;
+
+free_abort_hwpt:
+    iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id);
+free_viommu:
+    iommufd_backend_free_id(idev->iommufd, viommu->core.viommu_id);
+    g_free(viommu);
+    return false;
+}
+
+static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+                                          HostIOMMUDevice *hiod, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
+    SMMUState *bs = opaque;
+    SMMUv3State *s = ARM_SMMUV3(bs);
+    SMMUv3AccelState *s_accel = s->s_accel;
+    SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
+    SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
+    SMMUDevice *sdev = &accel_dev->sdev;
+    uint16_t sid = smmu_get_sid(sdev);
+
+    if (!idev) {
+        return true;
+    }
+
+    if (accel_dev->idev) {
+        if (accel_dev->idev != idev) {
+            error_setg(errp, "Device 0x%x already has an associated IOMMU dev",
+                       sid);
+            return false;
+        }
+        return true;
+    }
+
+    if (!smmuv3_accel_dev_alloc_viommu(accel_dev, idev, errp)) {
+        error_setg(errp, "Device 0x%x: Unable to alloc viommu", sid);
+        return false;
+    }
+
+    accel_dev->idev = idev;
+    QLIST_INSERT_HEAD(&s_accel->viommu->device_list, accel_dev, next);
+    trace_smmuv3_accel_set_iommu_device(devfn, sid);
+    return true;
+}
+
+static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
+                                            int devfn)
+{
+    SMMUState *bs = opaque;
+    SMMUv3State *s = ARM_SMMUV3(bs);
+    SMMUPciBus *sbus = g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus);
+    SMMUv3AccelDevice *accel_dev;
+    SMMUViommu *viommu;
+    SMMUDevice *sdev;
+    uint16_t sid;
+
+    if (!sbus) {
+        return;
+    }
+
+    sdev = sbus->pbdev[devfn];
+    if (!sdev) {
+        return;
+    }
+
+    sid = smmu_get_sid(sdev);
+    accel_dev = container_of(sdev, SMMUv3AccelDevice, sdev);
+    if (!host_iommu_device_iommufd_attach_hwpt(accel_dev->idev,
+                                               accel_dev->idev->hwpt_id,
+                                               NULL)) {
+        error_report("Unable to attach dev 0x%x to the default HW pagetable",
+                     sid);
+    }
+
+    accel_dev->idev = NULL;
+    QLIST_REMOVE(accel_dev, next);
+    trace_smmuv3_accel_unset_iommu_device(devfn, sid);
+
+    viommu = s->s_accel->viommu;
+    if (QLIST_EMPTY(&viommu->device_list)) {
+        iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id);
+        iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id);
+        iommufd_backend_free_id(viommu->iommufd, viommu->core.viommu_id);
+        g_free(viommu);
+        s->s_accel->viommu = NULL;
+    }
+}
+
 static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci)
 {
 
@@ -121,6 +268,8 @@ static uint64_t smmuv3_accel_get_viommu_flags(void *opaque)
 static const PCIIOMMUOps smmuv3_accel_ops = {
     .get_address_space = smmuv3_accel_find_add_as,
     .get_viommu_flags = smmuv3_accel_get_viommu_flags,
+    .set_iommu_device = smmuv3_accel_set_iommu_device,
+    .unset_iommu_device = smmuv3_accel_unset_iommu_device,
 };
 
 void smmuv3_accel_init(SMMUv3State *s)
@@ -128,4 +277,5 @@ void smmuv3_accel_init(SMMUv3State *s)
     SMMUState *bs = ARM_SMMU(s);
 
     bs->iommu_ops = &smmuv3_accel_ops;
+    s->s_accel = g_new0(SMMUv3AccelState, 1);
 }
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index 70da16960f..3c8506d1e6 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -10,12 +10,29 @@
 #define HW_ARM_SMMUV3_ACCEL_H
 
 #include "hw/arm/smmu-common.h"
+#include "system/iommufd.h"
+#include <linux/iommufd.h>
 #include CONFIG_DEVICES
 
+typedef struct SMMUViommu {
+    IOMMUFDBackend *iommufd;
+    IOMMUFDViommu core;
+    uint32_t bypass_hwpt_id;
+    uint32_t abort_hwpt_id;
+    QLIST_HEAD(, SMMUv3AccelDevice) device_list;
+} SMMUViommu;
+
 typedef struct SMMUv3AccelDevice {
     SMMUDevice  sdev;
+    HostIOMMUDeviceIOMMUFD *idev;
+    SMMUViommu *viommu;
+    QLIST_ENTRY(SMMUv3AccelDevice) next;
 } SMMUv3AccelDevice;
 
+typedef struct SMMUv3AccelState {
+    SMMUViommu *viommu;
+} SMMUv3AccelState;
+
 #ifdef CONFIG_ARM_SMMUV3_ACCEL
 void smmuv3_accel_init(SMMUv3State *s);
 #else
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index f3386bd7ae..86370d448a 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -66,6 +66,10 @@ smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s
 smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
 smmu_reset_exit(void) ""
 
+#smmuv3-accel.c
+smmuv3_accel_set_iommu_device(int devfn, uint32_t sid) "devfn=0x%x (sid=0x%x)"
+smmuv3_accel_unset_iommu_device(int devfn, uint32_t sid) "devfn=0x%x (sid=0x%x)"
+
 # strongarm.c
 strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
 strongarm_ssp_read_underrun(void) "SSP rx underrun"
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index bb7076286b..5f3e9089a7 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -66,6 +66,7 @@ struct SMMUv3State {
 
     /* SMMU has HW accelerator support for nested S1 + s2 */
     bool accel;
+    struct SMMUv3AccelState  *s_accel;
 };
 
 typedef enum {
-- 
2.43.0


Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Eric Auger 4 weeks ago

On 9/29/25 3:36 PM, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> Implement a set_iommu_device callback:
>  -If found an existing viommu reuse that.
>  -Else,
>     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
>     Though, iommufd’s vIOMMU model supports nested translation by
>     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
>     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
>     allocated to handle device attachments.
>  -And add the dev to viommu device list
>
> Also add an unset_iommu_device to unwind/cleanup above.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
>  hw/arm/smmuv3-accel.c   | 150 ++++++++++++++++++++++++++++++++++++++++
>  hw/arm/smmuv3-accel.h   |  17 +++++
>  hw/arm/trace-events     |   4 ++
>  include/hw/arm/smmuv3.h |   1 +
>  4 files changed, 172 insertions(+)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index 6b0e512d86..81fa738f6f 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -8,6 +8,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
> +#include "trace.h"
>  
>  #include "hw/arm/smmuv3.h"
>  #include "hw/iommu.h"
> @@ -17,6 +18,9 @@
>  
>  #include "smmuv3-accel.h"
>  
> +#define SMMU_STE_VALID      (1ULL << 0)
> +#define SMMU_STE_CFG_BYPASS (1ULL << 3)
> +
>  static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
>                                                 PCIBus *bus, int devfn)
>  {
> @@ -35,6 +39,149 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
>      return accel_dev;
>  }
>  
> +static bool
> +smmuv3_accel_dev_alloc_viommu(SMMUv3AccelDevice *accel_dev,
> +                              HostIOMMUDeviceIOMMUFD *idev, Error **errp)
> +{
> +    struct iommu_hwpt_arm_smmuv3 bypass_data = {
> +        .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
> +    };
> +    struct iommu_hwpt_arm_smmuv3 abort_data = {
> +        .ste = { SMMU_STE_VALID, 0x0ULL },
> +    };
> +    SMMUDevice *sdev = &accel_dev->sdev;
> +    SMMUState *bs = sdev->smmu;
> +    SMMUv3State *s = ARM_SMMUV3(bs);
> +    SMMUv3AccelState *s_accel = s->s_accel;
> +    uint32_t s2_hwpt_id = idev->hwpt_id;
> +    SMMUViommu *viommu;
> +    uint32_t viommu_id;
> +
> +    if (s_accel->viommu) {
> +        accel_dev->viommu = s_accel->viommu;
> +        return true;
> +    }
> +
> +    if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
> +                                      IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
> +                                      s2_hwpt_id, &viommu_id, errp)) {
> +        return false;
> +    }
> +
> +    viommu = g_new0(SMMUViommu, 1);
> +    viommu->core.viommu_id = viommu_id;
> +    viommu->core.s2_hwpt_id = s2_hwpt_id;
> +    viommu->core.iommufd = idev->iommufd;
> +
> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                    viommu->core.viommu_id, 0,
> +                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
> +                                    sizeof(abort_data), &abort_data,
> +                                    &viommu->abort_hwpt_id, errp)) {
> +        goto free_viommu;
> +    }
> +
> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                    viommu->core.viommu_id, 0,
> +                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
> +                                    sizeof(bypass_data), &bypass_data,
> +                                    &viommu->bypass_hwpt_id, errp)) {
> +        goto free_abort_hwpt;
> +    }
> +
> +    viommu->iommufd = idev->iommufd;
> +
> +    s_accel->viommu = viommu;
> +    accel_dev->viommu = viommu;
> +    return true;
> +
> +free_abort_hwpt:
> +    iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id);
> +free_viommu:
> +    iommufd_backend_free_id(idev->iommufd, viommu->core.viommu_id);
> +    g_free(viommu);
> +    return false;
> +}
> +
> +static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
> +                                          HostIOMMUDevice *hiod, Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
> +    SMMUState *bs = opaque;
> +    SMMUv3State *s = ARM_SMMUV3(bs);
> +    SMMUv3AccelState *s_accel = s->s_accel;
> +    SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
> +    SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
> +    SMMUDevice *sdev = &accel_dev->sdev;
> +    uint16_t sid = smmu_get_sid(sdev);
> +
> +    if (!idev) {
> +        return true;
> +    }
> +
> +    if (accel_dev->idev) {
> +        if (accel_dev->idev != idev) {
> +            error_setg(errp, "Device 0x%x already has an associated IOMMU dev",
> +                       sid);
> +            return false;
> +        }
> +        return true;
> +    }
> +
> +    if (!smmuv3_accel_dev_alloc_viommu(accel_dev, idev, errp)) {
> +        error_setg(errp, "Device 0x%x: Unable to alloc viommu", sid);
> +        return false;
> +    }
> +
> +    accel_dev->idev = idev;
> +    QLIST_INSERT_HEAD(&s_accel->viommu->device_list, accel_dev, next);
> +    trace_smmuv3_accel_set_iommu_device(devfn, sid);
> +    return true;
> +}
> +
> +static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
> +                                            int devfn)
> +{
> +    SMMUState *bs = opaque;
> +    SMMUv3State *s = ARM_SMMUV3(bs);
> +    SMMUPciBus *sbus = g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus);
> +    SMMUv3AccelDevice *accel_dev;
> +    SMMUViommu *viommu;
> +    SMMUDevice *sdev;
> +    uint16_t sid;
> +
> +    if (!sbus) {
> +        return;
> +    }
> +
> +    sdev = sbus->pbdev[devfn];
> +    if (!sdev) {
> +        return;
> +    }
> +
> +    sid = smmu_get_sid(sdev);
> +    accel_dev = container_of(sdev, SMMUv3AccelDevice, sdev);
> +    if (!host_iommu_device_iommufd_attach_hwpt(accel_dev->idev,
> +                                               accel_dev->idev->hwpt_id,
> +                                               NULL)) {
> +        error_report("Unable to attach dev 0x%x to the default HW pagetable",
> +                     sid);
> +    }
> +
> +    accel_dev->idev = NULL;
> +    QLIST_REMOVE(accel_dev, next);
> +    trace_smmuv3_accel_unset_iommu_device(devfn, sid);
> +
> +    viommu = s->s_accel->viommu;
> +    if (QLIST_EMPTY(&viommu->device_list)) {
> +        iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id);
> +        iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id);
> +        iommufd_backend_free_id(viommu->iommufd, viommu->core.viommu_id);
> +        g_free(viommu);
> +        s->s_accel->viommu = NULL;
> +    }
> +}
> +
>  static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci)
>  {
>  
> @@ -121,6 +268,8 @@ static uint64_t smmuv3_accel_get_viommu_flags(void *opaque)
>  static const PCIIOMMUOps smmuv3_accel_ops = {
>      .get_address_space = smmuv3_accel_find_add_as,
>      .get_viommu_flags = smmuv3_accel_get_viommu_flags,
> +    .set_iommu_device = smmuv3_accel_set_iommu_device,
> +    .unset_iommu_device = smmuv3_accel_unset_iommu_device,
>  };
>  
>  void smmuv3_accel_init(SMMUv3State *s)
> @@ -128,4 +277,5 @@ void smmuv3_accel_init(SMMUv3State *s)
>      SMMUState *bs = ARM_SMMU(s);
>  
>      bs->iommu_ops = &smmuv3_accel_ops;
> +    s->s_accel = g_new0(SMMUv3AccelState, 1);
>  }
> diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
> index 70da16960f..3c8506d1e6 100644
> --- a/hw/arm/smmuv3-accel.h
> +++ b/hw/arm/smmuv3-accel.h
> @@ -10,12 +10,29 @@
>  #define HW_ARM_SMMUV3_ACCEL_H
>  
>  #include "hw/arm/smmu-common.h"
> +#include "system/iommufd.h"
> +#include <linux/iommufd.h>
>  #include CONFIG_DEVICES
>  
> +typedef struct SMMUViommu {
> +    IOMMUFDBackend *iommufd;
> +    IOMMUFDViommu core;
could we avoid using too generic field names like "core". In the rest of
the code it is then difficult to understand what the field corresponds to.

viommu?
> +    uint32_t bypass_hwpt_id;
> +    uint32_t abort_hwpt_id;
> +    QLIST_HEAD(, SMMUv3AccelDevice) device_list;
> +} SMMUViommu;
> +
>  typedef struct SMMUv3AccelDevice {
>      SMMUDevice  sdev;
> +    HostIOMMUDeviceIOMMUFD *idev;
same here. hdev at least would refer to host dev at least. Or does it
correspond to some kernel terminology?

Eric
> +    SMMUViommu *viommu;
> +    QLIST_ENTRY(SMMUv3AccelDevice) next;
>  } SMMUv3AccelDevice;
>  
> +typedef struct SMMUv3AccelState {
> +    SMMUViommu *viommu;
> +} SMMUv3AccelState;
> +
>  #ifdef CONFIG_ARM_SMMUV3_ACCEL
>  void smmuv3_accel_init(SMMUv3State *s);
>  #else
> diff --git a/hw/arm/trace-events b/hw/arm/trace-events
> index f3386bd7ae..86370d448a 100644
> --- a/hw/arm/trace-events
> +++ b/hw/arm/trace-events
> @@ -66,6 +66,10 @@ smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s
>  smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
>  smmu_reset_exit(void) ""
>  
> +#smmuv3-accel.c
> +smmuv3_accel_set_iommu_device(int devfn, uint32_t sid) "devfn=0x%x (sid=0x%x)"
> +smmuv3_accel_unset_iommu_device(int devfn, uint32_t sid) "devfn=0x%x (sid=0x%x)"
> +
>  # strongarm.c
>  strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
>  strongarm_ssp_read_underrun(void) "SSP rx underrun"
> diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
> index bb7076286b..5f3e9089a7 100644
> --- a/include/hw/arm/smmuv3.h
> +++ b/include/hw/arm/smmuv3.h
> @@ -66,6 +66,7 @@ struct SMMUv3State {
>  
>      /* SMMU has HW accelerator support for nested S1 + s2 */
>      bool accel;
> +    struct SMMUv3AccelState  *s_accel;
>  };
>  
>  typedef enum {


Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Eric Auger 1 month, 1 week ago
Hi Shameer,

On 9/29/25 3:36 PM, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
the original intent of this callback is

     * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
     *
     * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
     * retrieve host information from the associated HostIOMMUDevice.
     *

The implementation below goes way behond the simple "attachment" of the
HostIOMMUDevice to the vIOMMU.
allocation of a vIOMMU; allocation of 2 HWPTs, creation of a new
SMMUv3AccelState
>
> Implement a set_iommu_device callback:
>  -If found an existing viommu reuse that.
I think you need to document why you need a vIOMMU object.
>  -Else,
>     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
>     Though, iommufd’s vIOMMU model supports nested translation by
>     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
>     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
>     allocated to handle device attachments.

"devices cannot attach to this parent HWPT directly".  Why? It is not clear to me what those hwpt are used for compared to the original one. Why are they mandated? To me this deserves some additional explanations. If they are s2 ones, I would use an s2 prefix too.

>  -And add the dev to viommu device list
this is the initial objective of the callback
>
> Also add an unset_iommu_device to unwind/cleanup above.

I think you shall document the introduction of SMMUv3AccelState.It
currently contains a single member, do you plan to add others.
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
>  hw/arm/smmuv3-accel.c   | 150 ++++++++++++++++++++++++++++++++++++++++
>  hw/arm/smmuv3-accel.h   |  17 +++++
>  hw/arm/trace-events     |   4 ++
>  include/hw/arm/smmuv3.h |   1 +
>  4 files changed, 172 insertions(+)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index 6b0e512d86..81fa738f6f 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -8,6 +8,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
> +#include "trace.h"
>  
>  #include "hw/arm/smmuv3.h"
>  #include "hw/iommu.h"
> @@ -17,6 +18,9 @@
>  
>  #include "smmuv3-accel.h"
>  
> +#define SMMU_STE_VALID      (1ULL << 0)
> +#define SMMU_STE_CFG_BYPASS (1ULL << 3)
I would rather put that in smmuv3-internal.h where you have other STE
macros. Look for "/* STE fields */
> +
>  static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
>                                                 PCIBus *bus, int devfn)
>  {
> @@ -35,6 +39,149 @@ static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
>      return accel_dev;
>  }
>  
> +static bool
> +smmuv3_accel_dev_alloc_viommu(SMMUv3AccelDevice *accel_dev,
> +                              HostIOMMUDeviceIOMMUFD *idev, Error **errp)
> +{
> +    struct iommu_hwpt_arm_smmuv3 bypass_data = {
> +        .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
> +    };
> +    struct iommu_hwpt_arm_smmuv3 abort_data = {
> +        .ste = { SMMU_STE_VALID, 0x0ULL },
> +    };
> +    SMMUDevice *sdev = &accel_dev->sdev;
> +    SMMUState *bs = sdev->smmu;
> +    SMMUv3State *s = ARM_SMMUV3(bs);
> +    SMMUv3AccelState *s_accel = s->s_accel;
> +    uint32_t s2_hwpt_id = idev->hwpt_id;
> +    SMMUViommu *viommu;
> +    uint32_t viommu_id;
> +
> +    if (s_accel->viommu) {
> +        accel_dev->viommu = s_accel->viommu;
> +        return true;
> +    }
> +
> +    if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
> +                                      IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
> +                                      s2_hwpt_id, &viommu_id, errp)) {
> +        return false;
> +    }
> +
> +    viommu = g_new0(SMMUViommu, 1);
> +    viommu->core.viommu_id = viommu_id;
> +    viommu->core.s2_hwpt_id = s2_hwpt_id;
> +    viommu->core.iommufd = idev->iommufd;
> +
> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                    viommu->core.viommu_id, 0,
> +                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
> +                                    sizeof(abort_data), &abort_data,
> +                                    &viommu->abort_hwpt_id, errp)) {
> +        goto free_viommu;
> +    }
> +
> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                    viommu->core.viommu_id, 0,
> +                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
> +                                    sizeof(bypass_data), &bypass_data,
> +                                    &viommu->bypass_hwpt_id, errp)) {
> +        goto free_abort_hwpt;
> +    }
> +
> +    viommu->iommufd = idev->iommufd;
> +
> +    s_accel->viommu = viommu;
> +    accel_dev->viommu = viommu;
> +    return true;
> +
> +free_abort_hwpt:
> +    iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id);
> +free_viommu:
> +    iommufd_backend_free_id(idev->iommufd, viommu->core.viommu_id);
> +    g_free(viommu);
> +    return false;
> +}
> +
> +static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
> +                                          HostIOMMUDevice *hiod, Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
> +    SMMUState *bs = opaque;
> +    SMMUv3State *s = ARM_SMMUV3(bs);
> +    SMMUv3AccelState *s_accel = s->s_accel;
> +    SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
> +    SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
you are using smmuv3_accel_get_dev but logically the function was
already called once before in smmuv3_accel_find_add_as()? Meaning the
add/allocate part of the function is not something that should happen
here. Shouldn't we add a new helper that does SMMUPciBus *sbus =
g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus); if (!sbus) {
return; } sdev = sbus->pbdev[devfn]; if (!sdev) { return; } that would
be used both set and unset()?
> +    SMMUDevice *sdev = &accel_dev->sdev;
> +    uint16_t sid = smmu_get_sid(sdev);
> +
> +    if (!idev) {
> +        return true;
> +    }
> +
> +    if (accel_dev->idev) {
> +        if (accel_dev->idev != idev) {
> +            error_setg(errp, "Device 0x%x already has an associated IOMMU dev",
> +                       sid);
> +            return false;
> +        }
> +        return true;
> +    }
> +
> +    if (!smmuv3_accel_dev_alloc_viommu(accel_dev, idev, errp)) {
> +        error_setg(errp, "Device 0x%x: Unable to alloc viommu", sid);
> +        return false;
> +    }
> +
> +    accel_dev->idev = idev;
> +    QLIST_INSERT_HEAD(&s_accel->viommu->device_list, accel_dev, next);
> +    trace_smmuv3_accel_set_iommu_device(devfn, sid);
> +    return true;
> +}
> +
> +static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
> +                                            int devfn)
> +{
> +    SMMUState *bs = opaque;
> +    SMMUv3State *s = ARM_SMMUV3(bs);
> +    SMMUPciBus *sbus = g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus);
> +    SMMUv3AccelDevice *accel_dev;
> +    SMMUViommu *viommu;
> +    SMMUDevice *sdev;
> +    uint16_t sid;
> +
> +    if (!sbus) {
> +        return;
> +    }
> +
> +    sdev = sbus->pbdev[devfn];
> +    if (!sdev) {
> +        return;
> +    }
> +
> +    sid = smmu_get_sid(sdev);
> +    accel_dev = container_of(sdev, SMMUv3AccelDevice, sdev);
> +    if (!host_iommu_device_iommufd_attach_hwpt(accel_dev->idev,
> +                                               accel_dev->idev->hwpt_id,
> +                                               NULL)) {
> +        error_report("Unable to attach dev 0x%x to the default HW pagetable",
> +                     sid);
> +    }
> +
> +    accel_dev->idev = NULL;
> +    QLIST_REMOVE(accel_dev, next);
> +    trace_smmuv3_accel_unset_iommu_device(devfn, sid);
> +
> +    viommu = s->s_accel->viommu;
> +    if (QLIST_EMPTY(&viommu->device_list)) {
> +        iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id);
> +        iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id);
> +        iommufd_backend_free_id(viommu->iommufd, viommu->core.viommu_id);
> +        g_free(viommu);
> +        s->s_accel->viommu = NULL;
> +    }
> +}
> +
>  static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci)
>  {
>  
> @@ -121,6 +268,8 @@ static uint64_t smmuv3_accel_get_viommu_flags(void *opaque)
>  static const PCIIOMMUOps smmuv3_accel_ops = {
>      .get_address_space = smmuv3_accel_find_add_as,
>      .get_viommu_flags = smmuv3_accel_get_viommu_flags,
> +    .set_iommu_device = smmuv3_accel_set_iommu_device,
> +    .unset_iommu_device = smmuv3_accel_unset_iommu_device,
>  };
>  
>  void smmuv3_accel_init(SMMUv3State *s)
> @@ -128,4 +277,5 @@ void smmuv3_accel_init(SMMUv3State *s)
>      SMMUState *bs = ARM_SMMU(s);
>  
>      bs->iommu_ops = &smmuv3_accel_ops;
> +    s->s_accel = g_new0(SMMUv3AccelState, 1);
>  }
> diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
> index 70da16960f..3c8506d1e6 100644
> --- a/hw/arm/smmuv3-accel.h
> +++ b/hw/arm/smmuv3-accel.h
> @@ -10,12 +10,29 @@
>  #define HW_ARM_SMMUV3_ACCEL_H
>  
>  #include "hw/arm/smmu-common.h"
> +#include "system/iommufd.h"
> +#include <linux/iommufd.h>
>  #include CONFIG_DEVICES
>  
> +typedef struct SMMUViommu {
would deserve to be documented with explanation of what it does abstract
> +    IOMMUFDBackend *iommufd;
> +    IOMMUFDViommu core;
> +    uint32_t bypass_hwpt_id;
> +    uint32_t abort_hwpt_id;
> +    QLIST_HEAD(, SMMUv3AccelDevice) device_list;
> +} SMMUViommu;
> +
>  typedef struct SMMUv3AccelDevice {
>      SMMUDevice  sdev;
> +    HostIOMMUDeviceIOMMUFD *idev;
> +    SMMUViommu *viommu;
> +    QLIST_ENTRY(SMMUv3AccelDevice) next;
>  } SMMUv3AccelDevice;
>  
> +typedef struct SMMUv3AccelState {
> +    SMMUViommu *viommu;
> +} SMMUv3AccelState;
> +
>  #ifdef CONFIG_ARM_SMMUV3_ACCEL
>  void smmuv3_accel_init(SMMUv3State *s);
>  #else
> diff --git a/hw/arm/trace-events b/hw/arm/trace-events
> index f3386bd7ae..86370d448a 100644
> --- a/hw/arm/trace-events
> +++ b/hw/arm/trace-events
> @@ -66,6 +66,10 @@ smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s
>  smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
>  smmu_reset_exit(void) ""
>  
> +#smmuv3-accel.c
> +smmuv3_accel_set_iommu_device(int devfn, uint32_t sid) "devfn=0x%x (sid=0x%x)"
> +smmuv3_accel_unset_iommu_device(int devfn, uint32_t sid) "devfn=0x%x (sid=0x%x)"
> +
>  # strongarm.c
>  strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
>  strongarm_ssp_read_underrun(void) "SSP rx underrun"
> diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
> index bb7076286b..5f3e9089a7 100644
> --- a/include/hw/arm/smmuv3.h
> +++ b/include/hw/arm/smmuv3.h
> @@ -66,6 +66,7 @@ struct SMMUv3State {
>  
>      /* SMMU has HW accelerator support for nested S1 + s2 */
>      bool accel;
> +    struct SMMUv3AccelState  *s_accel;
>  };
>  
>  typedef enum {
Thanks

Eric


RE: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Shameer Kolothum 1 month, 1 week ago

> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: 02 October 2025 07:53
> To: Shameer Kolothum <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; Jason Gunthorpe <jgg@nvidia.com>; Nicolin
> Chen <nicolinc@nvidia.com>; ddutile@redhat.com; berrange@redhat.com;
> Nathan Chen <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
> smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> shameerkolothum@gmail.com
> Subject: Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add
> set/unset_iommu_device callback
> 
> External email: Use caution opening links or attachments
> 
> 
> Hi Shameer,
> 
> On 9/29/25 3:36 PM, Shameer Kolothum wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> the original intent of this callback is
> 
>      * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
>      *
>      * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
>      * retrieve host information from the associated HostIOMMUDevice.
>      *
> 
> The implementation below goes way behond the simple "attachment" of the
> HostIOMMUDevice to the vIOMMU.
> allocation of a vIOMMU; allocation of 2 HWPTs, creation of a new
> SMMUv3AccelState

Sure. It does do all of this. Will update.
> >
> > Implement a set_iommu_device callback:
> >  -If found an existing viommu reuse that.
> I think you need to document why you need a vIOMMU object.
> >  -Else,
> >     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
> >     Though, iommufd’s vIOMMU model supports nested translation by
> >     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
> >     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
> >     allocated to handle device attachments.
> 
> "devices cannot attach to this parent HWPT directly".  Why? It is not clear to
> me what those hwpt are used for compared to the original one. Why are they
> mandated? To me this deserves some additional explanations. If they are s2
> ones, I would use an s2 prefix too.

Ok. This needs some rephrasing.

The idea is, we cannot yet attach a domain to the SMMUv3 for this device yet.
We need a vDEVICE object (which will link vSID to pSID) for attach. Please see
Patch #10.

Here we just allocate two domains(bypass or abort) for later attach based on
Guest request.

These are not S2 only HWPT per se. They are of type IOMMU_DOMAIN_NESTED.

From kernel doc:

#define __IOMMU_DOMAIN_NESTED   (1U << 6)  /* User-managed address space nested
                                              on a stage-2 translation        */

I will update the comment which will clarify this better.

@Nicolin, please correct me if my above understanding is not right.

> 
> >  -And add the dev to viommu device list
> this is the initial objective of the callback
> >
> > Also add an unset_iommu_device to unwind/cleanup above.
> 
> I think you shall document the introduction of SMMUv3AccelState.It
> currently contains a single member, do you plan to add others.

Ok. I think for this series we only have one member for now. But when
we add support for vEVeNTQ and vCMDQ, this will have more. 

> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> > Signed-off-by: Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com
> > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> > ---
> >  hw/arm/smmuv3-accel.c   | 150
> ++++++++++++++++++++++++++++++++++++++++
> >  hw/arm/smmuv3-accel.h   |  17 +++++
> >  hw/arm/trace-events     |   4 ++
> >  include/hw/arm/smmuv3.h |   1 +
> >  4 files changed, 172 insertions(+)
> >
> > diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> > index 6b0e512d86..81fa738f6f 100644
> > --- a/hw/arm/smmuv3-accel.c
> > +++ b/hw/arm/smmuv3-accel.c
> > @@ -8,6 +8,7 @@
> >
> >  #include "qemu/osdep.h"
> >  #include "qemu/error-report.h"
> > +#include "trace.h"
> >
> >  #include "hw/arm/smmuv3.h"
> >  #include "hw/iommu.h"
> > @@ -17,6 +18,9 @@
> >
> >  #include "smmuv3-accel.h"
> >
> > +#define SMMU_STE_VALID      (1ULL << 0)
> > +#define SMMU_STE_CFG_BYPASS (1ULL << 3)
> I would rather put that in smmuv3-internal.h where you have other STE
> macros. Look for "/* STE fields */

Ok.

> > +
> >  static SMMUv3AccelDevice *smmuv3_accel_get_dev(SMMUState *bs,
> SMMUPciBus *sbus,
> >                                                 PCIBus *bus, int devfn)
> >  {
> > @@ -35,6 +39,149 @@ static SMMUv3AccelDevice
> *smmuv3_accel_get_dev(SMMUState *bs, SMMUPciBus *sbus,
> >      return accel_dev;
> >  }
> >
> > +static bool
> > +smmuv3_accel_dev_alloc_viommu(SMMUv3AccelDevice *accel_dev,
> > +                              HostIOMMUDeviceIOMMUFD *idev, Error **errp)
> > +{
> > +    struct iommu_hwpt_arm_smmuv3 bypass_data = {
> > +        .ste = { SMMU_STE_CFG_BYPASS | SMMU_STE_VALID, 0x0ULL },
> > +    };
> > +    struct iommu_hwpt_arm_smmuv3 abort_data = {
> > +        .ste = { SMMU_STE_VALID, 0x0ULL },
> > +    };
> > +    SMMUDevice *sdev = &accel_dev->sdev;
> > +    SMMUState *bs = sdev->smmu;
> > +    SMMUv3State *s = ARM_SMMUV3(bs);
> > +    SMMUv3AccelState *s_accel = s->s_accel;
> > +    uint32_t s2_hwpt_id = idev->hwpt_id;
> > +    SMMUViommu *viommu;
> > +    uint32_t viommu_id;
> > +
> > +    if (s_accel->viommu) {
> > +        accel_dev->viommu = s_accel->viommu;
> > +        return true;
> > +    }
> > +
> > +    if (!iommufd_backend_alloc_viommu(idev->iommufd, idev->devid,
> > +                                      IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
> > +                                      s2_hwpt_id, &viommu_id, errp)) {
> > +        return false;
> > +    }
> > +
> > +    viommu = g_new0(SMMUViommu, 1);
> > +    viommu->core.viommu_id = viommu_id;
> > +    viommu->core.s2_hwpt_id = s2_hwpt_id;
> > +    viommu->core.iommufd = idev->iommufd;
> > +
> > +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> > +                                    viommu->core.viommu_id, 0,
> > +                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
> > +                                    sizeof(abort_data), &abort_data,
> > +                                    &viommu->abort_hwpt_id, errp)) {
> > +        goto free_viommu;
> > +    }
> > +
> > +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> > +                                    viommu->core.viommu_id, 0,
> > +                                    IOMMU_HWPT_DATA_ARM_SMMUV3,
> > +                                    sizeof(bypass_data), &bypass_data,
> > +                                    &viommu->bypass_hwpt_id, errp)) {
> > +        goto free_abort_hwpt;
> > +    }
> > +
> > +    viommu->iommufd = idev->iommufd;
> > +
> > +    s_accel->viommu = viommu;
> > +    accel_dev->viommu = viommu;
> > +    return true;
> > +
> > +free_abort_hwpt:
> > +    iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id);
> > +free_viommu:
> > +    iommufd_backend_free_id(idev->iommufd, viommu->core.viommu_id);
> > +    g_free(viommu);
> > +    return false;
> > +}
> > +
> > +static bool smmuv3_accel_set_iommu_device(PCIBus *bus, void *opaque,
> int devfn,
> > +                                          HostIOMMUDevice *hiod, Error **errp)
> > +{
> > +    HostIOMMUDeviceIOMMUFD *idev =
> HOST_IOMMU_DEVICE_IOMMUFD(hiod);
> > +    SMMUState *bs = opaque;
> > +    SMMUv3State *s = ARM_SMMUV3(bs);
> > +    SMMUv3AccelState *s_accel = s->s_accel;
> > +    SMMUPciBus *sbus = smmu_get_sbus(bs, bus);
> > +    SMMUv3AccelDevice *accel_dev = smmuv3_accel_get_dev(bs, sbus,
> bus, devfn);
> you are using smmuv3_accel_get_dev but logically the function was
> already called once before in smmuv3_accel_find_add_as()? Meaning the
> add/allocate part of the function is not something that should happen
> here. Shouldn't we add a new helper that does SMMUPciBus *sbus =
> g_hash_table_lookup(bs->smmu_pcibus_by_busptr, bus); if (!sbus) {
> return; } sdev = sbus->pbdev[devfn]; if (!sdev) { return; } that would
> be used both set and unset()?

Hmm.. I am not sure I quite follow the need for that.
smmuv3_accel_get_dev() second time will just retrieve the 

SMMUDevice *sdev = sbus->pbdev[devfn];

And then return immediately,
  if (sdev) {
        return container_of(sdev, SMMUv3AccelDevice, sdev);
  }

Do we need really need a separate helper for this?

Thanks,
Shameer
Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Nicolin Chen 1 month, 1 week ago
On Thu, Oct 02, 2025 at 04:34:17AM -0700, Shameer Kolothum wrote:
> > > Implement a set_iommu_device callback:
> > >  -If found an existing viommu reuse that.
> > I think you need to document why you need a vIOMMU object.
> > >  -Else,
> > >     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
> > >     Though, iommufd’s vIOMMU model supports nested translation by
> > >     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
> > >     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
> > >     allocated to handle device attachments.
> > 
> > "devices cannot attach to this parent HWPT directly".  Why? It is not clear to
> > me what those hwpt are used for compared to the original one. Why are they
> > mandated? To me this deserves some additional explanations. If they are s2
> > ones, I would use an s2 prefix too.
> 
> Ok. This needs some rephrasing.
> 
> The idea is, we cannot yet attach a domain to the SMMUv3 for this device yet.
> We need a vDEVICE object (which will link vSID to pSID) for attach. Please see
> Patch #10.
> 
> Here we just allocate two domains(bypass or abort) for later attach based on
> Guest request.
> 
> These are not S2 only HWPT per se. They are of type IOMMU_DOMAIN_NESTED.
> 
> From kernel doc:
> 
> #define __IOMMU_DOMAIN_NESTED   (1U << 6)  /* User-managed address space nested
>                                               on a stage-2 translation        */

There are a couple of things going on here:
1) We should not attach directly to the S2 HWPT that eventually
   will be shared across vSMMU instances. In other word, an S2
   HWPT will not be attachable for lacking of its tie to an SMMU
   instance and not having a VMID at all. Instead, each vIOMMU
   object allocated using this S2 HWPT will hold the VMID.

2) A device cannot attach to a vIOMMU directly but has to attach
   through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). To attach
   to an IOMMU_DOMAIN_NESTED, a vDEVICE must be allocated with a
   given vSID.

This might sound a bit complicated but I think it makes sense from
a VM perspective, as a device that's behind a vSMMU should have a
guest-level SID and its corresponding STE: if the device is working
in the S2-only mode (physically), there must be a guest-level STE
configuring to the S1-BYPASS mode, where the "bypass" proxy HWPT
will be picked for attachment.

So, for rephrasing, I think it would nicer to say something like:

"
A device that is put behind a vSMMU instance must have a vSID and its
corresponding vSTEs (bypass/abort/translate). Pre-allocate the bypass
and abort vSTEs as two proxy nested HWPTs for the device to attach to
a vIOMMU.

Note that the core-managed nesting parent HWPT should not be attached
directly when using the iommufd's vIOMMU model. This is also because
we want that nesting parent HWPT to be reused eventually across vSMMU
instances in the same VM.
"

Nicolin

RE: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Shameer Kolothum 2 weeks, 4 days ago
Hi Eric,

Based on further off-list discussions, this needs some clarification.

> -----Original Message-----
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: 02 October 2025 17:44
> To: Shameer Kolothum <skolothumtho@nvidia.com>
> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
> devel@nongnu.org; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; ddutile@redhat.com; berrange@redhat.com; Nathan
> Chen <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
> smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> shameerkolothum@gmail.com
> Subject: Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add
> set/unset_iommu_device callback

[..]

> There are a couple of things going on here:
> 1) We should not attach directly to the S2 HWPT that eventually
>    will be shared across vSMMU instances. In other word, an S2
>    HWPT will not be attachable for lacking of its tie to an SMMU
>    instance and not having a VMID at all. Instead, each vIOMMU
>    object allocated using this S2 HWPT will hold the VMID.

The above statement is not 100% correct as per the current kernel
and QEMU implementation. At present, VFIO core allocates an s2
HWPT when IOMMU_HWPT_ALLOC_NEST_PARENT flag is set and
attaches to it. 

In the set_iommu_device () path, we use this s2 HWPT id to create a 
vIOMMU and pre-allocate bypass and abort IOMMU_DOMAIN_NESTED
HWPTs, which are ready to be attached if required.

On unset_iommu_device() path, we attach back to the s2 HWPT as it
does today.
 
> 2) A device cannot attach to a vIOMMU directly but has to attach
>    through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). To attach
>    to an IOMMU_DOMAIN_NESTED, a vDEVICE must be allocated with a
>    given vSID.

Also, it became clear that we need to relax the above restriction and allow
attaching proxy nested HWPTs without requiring a vDEVICE. This is necessary
to handle cases where the SMMUv3 operates in global bypass or abort modes,
as controlled by the GBPA register, which do not have any associated devices.

Please see Nicolin’s kernel patch [0] that introduces this relaxation.

I’ve updated this QEMU series accordingly. SMMUv3 GBPA based HWPT will
now be attached during both reset() and GBPA update paths.

The commit log and documentation will also be updated to reflect above
change.

Thanks,
Shameer
[0] https://lore.kernel.org/linux-iommu/20251024040551.1711281-1-nicolinc@nvidia.com/
Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Eric Auger 2 weeks, 4 days ago

On 10/27/25 12:56 PM, Shameer Kolothum wrote:
> Hi Eric,
>
> Based on further off-list discussions, this needs some clarification.
>
>> -----Original Message-----
>> From: Nicolin Chen <nicolinc@nvidia.com>
>> Sent: 02 October 2025 17:44
>> To: Shameer Kolothum <skolothumtho@nvidia.com>
>> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
>> devel@nongnu.org; peter.maydell@linaro.org; Jason Gunthorpe
>> <jgg@nvidia.com>; ddutile@redhat.com; berrange@redhat.com; Nathan
>> Chen <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
>> smostafa@google.com; wangzhou1@hisilicon.com;
>> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
>> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
>> shameerkolothum@gmail.com
>> Subject: Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add
>> set/unset_iommu_device callback
> [..]
>
>> There are a couple of things going on here:
>> 1) We should not attach directly to the S2 HWPT that eventually
>>    will be shared across vSMMU instances. In other word, an S2
>>    HWPT will not be attachable for lacking of its tie to an SMMU
>>    instance and not having a VMID at all. Instead, each vIOMMU
>>    object allocated using this S2 HWPT will hold the VMID.
> The above statement is not 100% correct as per the current kernel
> and QEMU implementation. At present, VFIO core allocates an s2
> HWPT when IOMMU_HWPT_ALLOC_NEST_PARENT flag is set and
> attaches to it. 
>
> In the set_iommu_device () path, we use this s2 HWPT id to create a 
> vIOMMU and pre-allocate bypass and abort IOMMU_DOMAIN_NESTED
> HWPTs, which are ready to be attached if required.
those are s1 ones (bypass and abort) , right?
>
> On unset_iommu_device() path, we attach back to the s2 HWPT as it
> does today.
OK
>  
>> 2) A device cannot attach to a vIOMMU directly but has to attach
>>    through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). To attach
>>    to an IOMMU_DOMAIN_NESTED, a vDEVICE must be allocated with a
>>    given vSID.
> Also, it became clear that we need to relax the above restriction and allow
> attaching proxy nested HWPTs without requiring a vDEVICE. This is necessary
> to handle cases where the SMMUv3 operates in global bypass or abort modes,
> as controlled by the GBPA register, which do not have any associated devices.
>
> Please see Nicolin’s kernel patch [0] that introduces this relaxation.
>
> I’ve updated this QEMU series accordingly. SMMUv3 GBPA based HWPT will
> now be attached during both reset() and GBPA update paths.

OK

Thank you for the update

Eric
>
> The commit log and documentation will also be updated to reflect above
> change.
>
> Thanks,
> Shameer
> [0] https://lore.kernel.org/linux-iommu/20251024040551.1711281-1-nicolinc@nvidia.com/


Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Eric Auger 4 weeks ago

On 10/2/25 6:44 PM, Nicolin Chen wrote:
> On Thu, Oct 02, 2025 at 04:34:17AM -0700, Shameer Kolothum wrote:
>>>> Implement a set_iommu_device callback:
>>>>  -If found an existing viommu reuse that.
>>> I think you need to document why you need a vIOMMU object.
>>>>  -Else,
>>>>     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
>>>>     Though, iommufd’s vIOMMU model supports nested translation by
>>>>     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
>>>>     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
>>>>     allocated to handle device attachments.
>>> "devices cannot attach to this parent HWPT directly".  Why? It is not clear to
>>> me what those hwpt are used for compared to the original one. Why are they
>>> mandated? To me this deserves some additional explanations. If they are s2
>>> ones, I would use an s2 prefix too.
>> Ok. This needs some rephrasing.
>>
>> The idea is, we cannot yet attach a domain to the SMMUv3 for this device yet.
>> We need a vDEVICE object (which will link vSID to pSID) for attach. Please see
>> Patch #10.
>>
>> Here we just allocate two domains(bypass or abort) for later attach based on
>> Guest request.
>>
>> These are not S2 only HWPT per se. They are of type IOMMU_DOMAIN_NESTED.
>>
>> From kernel doc:
>>
>> #define __IOMMU_DOMAIN_NESTED   (1U << 6)  /* User-managed address space nested
>>                                               on a stage-2 translation        */
> There are a couple of things going on here:
> 1) We should not attach directly to the S2 HWPT that eventually
>    will be shared across vSMMU instances. In other word, an S2
>    HWPT will not be attachable for lacking of its tie to an SMMU
>    instance and not having a VMID at all. Instead, each vIOMMU
>    object allocated using this S2 HWPT will hold the VMID.
>
> 2) A device cannot attach to a vIOMMU directly but has to attach
>    through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). To attach
>    to an IOMMU_DOMAIN_NESTED, a vDEVICE must be allocated with a
>    given vSID.
>
> This might sound a bit complicated but I think it makes sense from
> a VM perspective, as a device that's behind a vSMMU should have a
> guest-level SID and its corresponding STE: if the device is working
> in the S2-only mode (physically), there must be a guest-level STE
> configuring to the S1-BYPASS mode, where the "bypass" proxy HWPT
> will be picked for attachment.
>
> So, for rephrasing, I think it would nicer to say something like:
>
> "
> A device that is put behind a vSMMU instance must have a vSID and its
> corresponding vSTEs (bypass/abort/translate). Pre-allocate the bypass
> and abort vSTEs as two proxy nested HWPTs for the device to attach to
> a vIOMMU.
>
> Note that the core-managed nesting parent HWPT should not be attached
> directly when using the iommufd's vIOMMU model. This is also because
> we want that nesting parent HWPT to be reused eventually across vSMMU
> instances in the same VM.
> "

I would add 1) and 2) also in the commit msg. This definitively helps
understanding the whole setup

Eric
>
> Nicolin
>


Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Jason Gunthorpe 1 month, 1 week ago
On Thu, Oct 02, 2025 at 09:44:18AM -0700, Nicolin Chen wrote:
> On Thu, Oct 02, 2025 at 04:34:17AM -0700, Shameer Kolothum wrote:
> > > > Implement a set_iommu_device callback:
> > > >  -If found an existing viommu reuse that.
> > > I think you need to document why you need a vIOMMU object.
> > > >  -Else,
> > > >     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
> > > >     Though, iommufd’s vIOMMU model supports nested translation by
> > > >     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
> > > >     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
> > > >     allocated to handle device attachments.
> > > 
> > > "devices cannot attach to this parent HWPT directly".  Why? It is not clear to
> > > me what those hwpt are used for compared to the original one. Why are they
> > > mandated? To me this deserves some additional explanations. If they are s2
> > > ones, I would use an s2 prefix too.
> > 
> > Ok. This needs some rephrasing.
> > 
> > The idea is, we cannot yet attach a domain to the SMMUv3 for this device yet.
> > We need a vDEVICE object (which will link vSID to pSID) for attach. Please see
> > Patch #10.
> > 
> > Here we just allocate two domains(bypass or abort) for later attach based on
> > Guest request.
> > 
> > These are not S2 only HWPT per se. They are of type IOMMU_DOMAIN_NESTED.
> > 
> > From kernel doc:
> > 
> > #define __IOMMU_DOMAIN_NESTED   (1U << 6)  /* User-managed address space nested
> >                                               on a stage-2 translation        */
> 
> There are a couple of things going on here:
> 1) We should not attach directly to the S2 HWPT that eventually
>    will be shared across vSMMU instances. In other word, an S2
>    HWPT will not be attachable for lacking of its tie to an SMMU
>    instance and not having a VMID at all. Instead, each vIOMMU
>    object allocated using this S2 HWPT will hold the VMID.
> 
> 2) A device cannot attach to a vIOMMU directly but has to attach
>    through a proxy nested HWPT (IOMMU_DOMAIN_NESTED). To attach
>    to an IOMMU_DOMAIN_NESTED, a vDEVICE must be allocated with a
>    given vSID.
> 
> This might sound a bit complicated but I think it makes sense from
> a VM perspective, as a device that's behind a vSMMU should have a
> guest-level SID and its corresponding STE: if the device is working
> in the S2-only mode (physically), there must be a guest-level STE
> configuring to the S1-BYPASS mode, where the "bypass" proxy HWPT
> will be picked for attachment.
> 
> So, for rephrasing, I think it would nicer to say something like:
> 
> "
> A device that is put behind a vSMMU instance must have a vSID and its
> corresponding vSTEs (bypass/abort/translate). Pre-allocate the bypass
> and abort vSTEs as two proxy nested HWPTs for the device to attach to
> a vIOMMU.
> 
> Note that the core-managed nesting parent HWPT should not be attached
> directly when using the iommufd's vIOMMU model. This is also because
> we want that nesting parent HWPT to be reused eventually across vSMMU
> instances in the same VM.
> "

This all seems correct to me

Thanks,
Jason

Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Jonathan Cameron via 1 month, 2 weeks ago
On Mon, 29 Sep 2025 14:36:24 +0100
Shameer Kolothum <skolothumtho@nvidia.com> wrote:

> From: Nicolin Chen <nicolinc@nvidia.com>
> 
> Implement a set_iommu_device callback:
>  -If found an existing viommu reuse that.
>  -Else,
>     Allocate a vIOMMU with the nested parent S2 hwpt allocated by VFIO.
>     Though, iommufd’s vIOMMU model supports nested translation by
>     encapsulating a S2 nesting parent HWPT, devices cannot attach to this
>     parent HWPT directly. So two proxy nested HWPTs (bypass and abort) are
>     allocated to handle device attachments.
>  -And add the dev to viommu device list
> 
> Also add an unset_iommu_device to unwind/cleanup above.
> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>

Triviality follows.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>



> diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
> index 70da16960f..3c8506d1e6 100644
> --- a/hw/arm/smmuv3-accel.h
> +++ b/hw/arm/smmuv3-accel.h
> @@ -10,12 +10,29 @@
>  #define HW_ARM_SMMUV3_ACCEL_H
>  
>  #include "hw/arm/smmu-common.h"
> +#include "system/iommufd.h"
> +#include <linux/iommufd.h>
>  #include CONFIG_DEVICES
>  
> +typedef struct SMMUViommu {
> +    IOMMUFDBackend *iommufd;
> +    IOMMUFDViommu core;
> +    uint32_t bypass_hwpt_id;
> +    uint32_t abort_hwpt_id;
> +    QLIST_HEAD(, SMMUv3AccelDevice) device_list;

Does that need a forwards def of SMMUv3AccelDevice ?

> +} SMMUViommu;
> +
>  typedef struct SMMUv3AccelDevice {
>      SMMUDevice  sdev;

I didn't comment on this earlier as thought there would be something added tht
aligned with the above, but seems not.  So why the double space?

> +    HostIOMMUDeviceIOMMUFD *idev;
> +    SMMUViommu *viommu;
> +    QLIST_ENTRY(SMMUv3AccelDevice) next;
>  } SMMUv3AccelDevice;
>  
> +typedef struct SMMUv3AccelState {
> +    SMMUViommu *viommu;
> +} SMMUv3AccelState;

>  strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
>  strongarm_ssp_read_underrun(void) "SSP rx underrun"
> diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
> index bb7076286b..5f3e9089a7 100644
> --- a/include/hw/arm/smmuv3.h
> +++ b/include/hw/arm/smmuv3.h
> @@ -66,6 +66,7 @@ struct SMMUv3State {
>  
>      /* SMMU has HW accelerator support for nested S1 + s2 */
>      bool accel;
> +    struct SMMUv3AccelState  *s_accel;

Seems like a bonus space before *

>  };
>  
>  typedef enum {
RE: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add set/unset_iommu_device callback
Posted by Shameer Kolothum 1 month, 2 weeks ago

> -----Original Message-----
> From: Jonathan Cameron <jonathan.cameron@huawei.com>
> Sent: 29 September 2025 17:26
> To: Shameer Kolothum <skolothumtho@nvidia.com>
> Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
> eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
> berrange@redhat.com; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; zhangfei.gao@linaro.org;
> zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> shameerkolothum@gmail.com
> Subject: Re: [PATCH v4 08/27] hw/arm/smmuv3-accel: Add
> set/unset_iommu_device callback
> 
>  #include CONFIG_DEVICES
> >
> > +typedef struct SMMUViommu {
> > +    IOMMUFDBackend *iommufd;
> > +    IOMMUFDViommu core;
> > +    uint32_t bypass_hwpt_id;
> > +    uint32_t abort_hwpt_id;
> > +    QLIST_HEAD(, SMMUv3AccelDevice) device_list;
> 
> Does that need a forwards def of SMMUv3AccelDevice ?

I haven't seen any compiler warnings or errors and it looks like,
QLIST_HEAD expands to something like below,

struct {
    struct SMMUv3AccelDevice *lh_first;
}

So should be fine, I guess.

Thanks,
Shameer