[PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation

Joao Martins posted 8 patches 2 months, 1 week ago
[PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 2 months, 1 week ago
There's generally two modes of operation for IOMMUFD:

* The simple user API which intends to perform relatively simple things
with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
and mainly performs IOAS_MAP and UNMAP.

* The native IOMMUFD API where you have fine grained control of the
IOMMU domain and model it accordingly. This is where most new feature
are being steered to.

For dirty tracking 2) is required, as it needs to ensure that
the stage-2/parent IOMMU domain will only attach devices
that support dirty tracking (so far it is all homogeneous in x86, likely
not the case for smmuv3). Such invariant on dirty tracking provides a
useful guarantee to VMMs that will refuse incompatible device
attachments for IOMMU domains.

For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
which is responsible for creating an IOMMU domain. This is contrast to
the 'simple API' where the IOMMU domain is created by IOMMUFD
automatically when it attaches to VFIO (usually referred as autodomains)

To support dirty tracking with the advanced IOMMUFD API, it needs
similar logic, where IOMMU domains are created and devices attached to
compatible domains. Essentially mimmicing kernel
iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
to IOAS attach.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
Right now the only alternative to a userspace autodomains implementation
is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
IOAS attach. So opted for autodomains userspace approach to avoid the
duplication of hwpt-alloc flags vs attach-ioas flags. I lack mdev real
drivers atm, so testing with those is still TBD.

Opinions, comments, welcome!
---
 backends/iommufd.c            | 29 +++++++++++++
 backends/trace-events         |  1 +
 hw/vfio/iommufd.c             | 78 +++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |  9 ++++
 include/sysemu/iommufd.h      |  4 ++
 5 files changed, 121 insertions(+)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index 8486894f1b3f..2970135af4b9 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -211,6 +211,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
     return ret;
 }
 
+int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt)
+{
+    int ret;
+    struct iommu_hwpt_alloc alloc_hwpt = {
+        .size = sizeof(struct iommu_hwpt_alloc),
+        .flags = flags,
+        .dev_id = dev_id,
+        .pt_id = pt_id,
+        .data_type = data_type,
+        .data_len = data_len,
+        .data_uptr = (uint64_t)data_ptr,
+        .__reserved = 0,
+    };
+
+    ret = ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
+    trace_iommufd_backend_alloc_hwpt(iommufd, dev_id, pt_id, flags, data_type,
+                                     data_len, (uint64_t)data_ptr,
+                                     alloc_hwpt.out_hwpt_id, ret);
+    if (ret) {
+        error_report("IOMMU_HWPT_ALLOC failed: %m");
+    } else {
+        *out_hwpt = alloc_hwpt.out_hwpt_id;
+    }
+    return !ret ? 0 : -errno;
+}
+
 static const TypeInfo iommufd_backend_info = {
     .name = TYPE_IOMMUFD_BACKEND,
     .parent = TYPE_OBJECT,
diff --git a/backends/trace-events b/backends/trace-events
index d45c6e31a67e..f83a276a4253 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -13,5 +13,6 @@ iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
 iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
 iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
 iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
 iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
 iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 7d39d7a5fa51..ca7ec45e725c 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -219,10 +219,82 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
     return ret;
 }
 
+static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
+                                        VFIOIOMMUFDContainer *container,
+                                        Error **errp)
+{
+    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
+    VFIOIOASHwpt *hwpt;
+    Error *err = NULL;
+    int ret = -EINVAL;
+    uint32_t hwpt_id;
+
+    /* Try to find a domain */
+    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
+        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
+        if (ret) {
+            /* -EINVAL means the domain is incompatible with the device. */
+            if (ret == -EINVAL) {
+                continue;
+            }
+            return ret;
+        } else {
+            vbasedev->hwpt = hwpt;
+            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+            return 0;
+        }
+    }
+
+    ret = iommufd_backend_alloc_hwpt(iommufd,
+                                     vbasedev->iommufd_dev.devid,
+                                     container->ioas_id, 0, 0, 0,
+                                     NULL, &hwpt_id);
+    if (ret) {
+        error_append_hint(&err,
+                   "Failed to allocate HWPT for device %s. Fallback to IOAS attach\n",
+                   vbasedev->name);
+        warn_report_err(err);
+        return ret;
+    }
+
+    hwpt = g_malloc0(sizeof(*hwpt));
+    hwpt->hwpt_id = hwpt_id;
+    QLIST_INIT(&hwpt->device_list);
+
+    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
+    if (ret) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+        return ret;
+    }
+
+    vbasedev->hwpt = hwpt;
+    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
+    return 0;
+}
+
+static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
+                                         VFIOIOMMUFDContainer *container)
+{
+    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
+
+    QLIST_REMOVE(vbasedev, hwpt_next);
+    QLIST_REMOVE(hwpt, next);
+    if (QLIST_EMPTY(&hwpt->device_list)) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+    }
+}
+
 static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
                                          VFIOIOMMUFDContainer *container,
                                          Error **errp)
 {
+    if (!iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
+        return 0;
+    }
+
     return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
 }
 
@@ -231,6 +303,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
 {
     Error *err = NULL;
 
+    if (vbasedev->hwpt) {
+        iommufd_cdev_autodomains_put(vbasedev, container);
+        return;
+    }
+
     if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
         error_report_err(err);
     }
@@ -370,6 +447,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
     container = g_malloc0(sizeof(*container));
     container->be = vbasedev->iommufd_dev.iommufd;
     container->ioas_id = ioas_id;
+    QLIST_INIT(&container->hwpt_list);
 
     bcontainer = &container->bcontainer;
     vfio_container_init(bcontainer, space, iommufd_vioc);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9c4b60c906d9..7f7d823221e2 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -93,10 +93,17 @@ typedef struct VFIOHostDMAWindow {
 
 typedef struct IOMMUFDBackend IOMMUFDBackend;
 
+typedef struct VFIOIOASHwpt {
+    uint32_t hwpt_id;
+    QLIST_HEAD(, VFIODevice) device_list;
+    QLIST_ENTRY(VFIOIOASHwpt) next;
+} VFIOIOASHwpt;
+
 typedef struct VFIOIOMMUFDContainer {
     VFIOContainerBase bcontainer;
     IOMMUFDBackend *be;
     uint32_t ioas_id;
+    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
 } VFIOIOMMUFDContainer;
 
 /* Abstraction of host IOMMU legacy device */
@@ -136,6 +143,8 @@ typedef struct VFIODevice {
         IOMMULegacyDevice legacy_dev;
         IOMMUFDDevice iommufd_dev;
     };
+    VFIOIOASHwpt *hwpt;
+    QLIST_ENTRY(VFIODevice) hwpt_next;
 } VFIODevice;
 
 QEMU_BUILD_BUG_ON(offsetof(VFIODevice, legacy_dev.base) !=
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 4afe97307dbe..1966b75caae2 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -49,4 +49,8 @@ typedef struct IOMMUFDDevice {
 void iommufd_device_init(IOMMUFDDevice *idev);
 int iommufd_device_get_hw_capabilities(IOMMUFDDevice *idev, uint64_t *caps,
                                        Error **errp);
+int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt);
 #endif
-- 
2.39.3
Re: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Avihai Horon 2 months ago
Hi Joao,

On 12/02/2024 15:56, Joao Martins wrote:
> External email: Use caution opening links or attachments
>
>
> There's generally two modes of operation for IOMMUFD:
>
> * The simple user API which intends to perform relatively simple things
> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
> and mainly performs IOAS_MAP and UNMAP.
>
> * The native IOMMUFD API where you have fine grained control of the
> IOMMU domain and model it accordingly. This is where most new feature
> are being steered to.
>
> For dirty tracking 2) is required, as it needs to ensure that
> the stage-2/parent IOMMU domain will only attach devices
> that support dirty tracking (so far it is all homogeneous in x86, likely
> not the case for smmuv3). Such invariant on dirty tracking provides a
> useful guarantee to VMMs that will refuse incompatible device
> attachments for IOMMU domains.
>
> For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
> which is responsible for creating an IOMMU domain. This is contrast to
> the 'simple API' where the IOMMU domain is created by IOMMUFD
> automatically when it attaches to VFIO (usually referred as autodomains)
>
> To support dirty tracking with the advanced IOMMUFD API, it needs
> similar logic, where IOMMU domains are created and devices attached to
> compatible domains. Essentially mimmicing kernel
> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
> to IOAS attach.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
> Right now the only alternative to a userspace autodomains implementation
> is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
> IOAS attach. So opted for autodomains userspace approach to avoid the
> duplication of hwpt-alloc flags vs attach-ioas flags. I lack mdev real
> drivers atm, so testing with those is still TBD.
>
> Opinions, comments, welcome!
> ---
>   backends/iommufd.c            | 29 +++++++++++++
>   backends/trace-events         |  1 +
>   hw/vfio/iommufd.c             | 78 +++++++++++++++++++++++++++++++++++
>   include/hw/vfio/vfio-common.h |  9 ++++
>   include/sysemu/iommufd.h      |  4 ++
>   5 files changed, 121 insertions(+)
>
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index 8486894f1b3f..2970135af4b9 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -211,6 +211,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>       return ret;
>   }
>
> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
> +                               uint32_t pt_id, uint32_t flags,
> +                               uint32_t data_type, uint32_t data_len,
> +                               void *data_ptr, uint32_t *out_hwpt)
> +{
> +    int ret;
> +    struct iommu_hwpt_alloc alloc_hwpt = {
> +        .size = sizeof(struct iommu_hwpt_alloc),
> +        .flags = flags,
> +        .dev_id = dev_id,
> +        .pt_id = pt_id,
> +        .data_type = data_type,
> +        .data_len = data_len,
> +        .data_uptr = (uint64_t)data_ptr,
> +        .__reserved = 0,
> +    };
> +
> +    ret = ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
> +    trace_iommufd_backend_alloc_hwpt(iommufd, dev_id, pt_id, flags, data_type,
> +                                     data_len, (uint64_t)data_ptr,
> +                                     alloc_hwpt.out_hwpt_id, ret);
> +    if (ret) {
> +        error_report("IOMMU_HWPT_ALLOC failed: %m");
> +    } else {
> +        *out_hwpt = alloc_hwpt.out_hwpt_id;
> +    }
> +    return !ret ? 0 : -errno;
> +}
> +
>   static const TypeInfo iommufd_backend_info = {
>       .name = TYPE_IOMMUFD_BACKEND,
>       .parent = TYPE_OBJECT,
> diff --git a/backends/trace-events b/backends/trace-events
> index d45c6e31a67e..f83a276a4253 100644
> --- a/backends/trace-events
> +++ b/backends/trace-events
> @@ -13,5 +13,6 @@ iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
>   iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
>   iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>   iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
> +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
>   iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
>   iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 7d39d7a5fa51..ca7ec45e725c 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -219,10 +219,82 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
>       return ret;
>   }
>
> +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> +                                        VFIOIOMMUFDContainer *container,
> +                                        Error **errp)
> +{
> +    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
> +    VFIOIOASHwpt *hwpt;
> +    Error *err = NULL;
> +    int ret = -EINVAL;

The -EINVAL initialization is not necessary.

> +    uint32_t hwpt_id;
> +
> +    /* Try to find a domain */
> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
> +        if (ret) {
> +            /* -EINVAL means the domain is incompatible with the device. */
> +            if (ret == -EINVAL) {

On error iommufd_cdev_attach_ioas_hwpt() returns -1 and not -errno, so I 
guess we need to change it.

> +                continue;
> +            }
> +            return ret;
> +        } else {
> +            vbasedev->hwpt = hwpt;
> +            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
> +            return 0;
> +        }
> +    }
> +
> +    ret = iommufd_backend_alloc_hwpt(iommufd,
> +                                     vbasedev->iommufd_dev.devid,
> +                                     container->ioas_id, 0, 0, 0,

Should we explicitly pass IOMMU_HWPT_DATA_NONE instead of 0?

> +                                     NULL, &hwpt_id);
> +    if (ret) {
> +        error_append_hint(&err,
> +                   "Failed to allocate HWPT for device %s. Fallback to IOAS attach\n",
> +                   vbasedev->name);
> +        warn_report_err(err);
> +        return ret;
> +    }
> +
> +    hwpt = g_malloc0(sizeof(*hwpt));
> +    hwpt->hwpt_id = hwpt_id;
> +    QLIST_INIT(&hwpt->device_list);
> +
> +    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
> +    if (ret) {
> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
> +        g_free(hwpt);
> +        return ret;
> +    }
> +
> +    vbasedev->hwpt = hwpt;
> +    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
> +    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
> +    return 0;

I think we need to improve error handling in this function.
There are various places where err is not freed/reported, not NULL-ed 
before re-used, or used in error_append_hint() although it can be NULL.
If there are places where Error can be ignored, we can pass NULL instead 
of &err.
Plus, errp parameter passed to this function is never used.

> +}
> +
> +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
> +                                         VFIOIOMMUFDContainer *container)
> +{
> +    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
> +
> +    QLIST_REMOVE(vbasedev, hwpt_next);
> +    QLIST_REMOVE(hwpt, next);

Shouldn't QLIST_REMOVE(hwpt, next) be moved inside the if?

> +    if (QLIST_EMPTY(&hwpt->device_list)) {
> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
> +        g_free(hwpt);
> +    }
> +}
> +
>   static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
>                                            VFIOIOMMUFDContainer *container,
>                                            Error **errp)
>   {
> +    if (!iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
> +        return 0;
> +    }

If errp is used in iommufd_cdev_autodomains_get() eventually, we need to 
make sure it doens't contain an Error object before using it again below.

Thanks.

> +
>       return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
>   }
>
> @@ -231,6 +303,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
>   {
>       Error *err = NULL;
>
> +    if (vbasedev->hwpt) {
> +        iommufd_cdev_autodomains_put(vbasedev, container);
> +        return;
> +    }
> +
>       if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
>           error_report_err(err);
>       }
> @@ -370,6 +447,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
>       container = g_malloc0(sizeof(*container));
>       container->be = vbasedev->iommufd_dev.iommufd;
>       container->ioas_id = ioas_id;
> +    QLIST_INIT(&container->hwpt_list);
>
>       bcontainer = &container->bcontainer;
>       vfio_container_init(bcontainer, space, iommufd_vioc);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 9c4b60c906d9..7f7d823221e2 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -93,10 +93,17 @@ typedef struct VFIOHostDMAWindow {
>
>   typedef struct IOMMUFDBackend IOMMUFDBackend;
>
> +typedef struct VFIOIOASHwpt {
> +    uint32_t hwpt_id;
> +    QLIST_HEAD(, VFIODevice) device_list;
> +    QLIST_ENTRY(VFIOIOASHwpt) next;
> +} VFIOIOASHwpt;
> +
>   typedef struct VFIOIOMMUFDContainer {
>       VFIOContainerBase bcontainer;
>       IOMMUFDBackend *be;
>       uint32_t ioas_id;
> +    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
>   } VFIOIOMMUFDContainer;
>
>   /* Abstraction of host IOMMU legacy device */
> @@ -136,6 +143,8 @@ typedef struct VFIODevice {
>           IOMMULegacyDevice legacy_dev;
>           IOMMUFDDevice iommufd_dev;
>       };
> +    VFIOIOASHwpt *hwpt;
> +    QLIST_ENTRY(VFIODevice) hwpt_next;
>   } VFIODevice;
>
>   QEMU_BUILD_BUG_ON(offsetof(VFIODevice, legacy_dev.base) !=
> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
> index 4afe97307dbe..1966b75caae2 100644
> --- a/include/sysemu/iommufd.h
> +++ b/include/sysemu/iommufd.h
> @@ -49,4 +49,8 @@ typedef struct IOMMUFDDevice {
>   void iommufd_device_init(IOMMUFDDevice *idev);
>   int iommufd_device_get_hw_capabilities(IOMMUFDDevice *idev, uint64_t *caps,
>                                          Error **errp);
> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
> +                               uint32_t pt_id, uint32_t flags,
> +                               uint32_t data_type, uint32_t data_len,
> +                               void *data_ptr, uint32_t *out_hwpt);
>   #endif
> --
> 2.39.3
>
Re: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 2 months ago
On 19/02/2024 08:58, Avihai Horon wrote:
> Hi Joao,
> 
> On 12/02/2024 15:56, Joao Martins wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> There's generally two modes of operation for IOMMUFD:
>>
>> * The simple user API which intends to perform relatively simple things
>> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
>> and mainly performs IOAS_MAP and UNMAP.
>>
>> * The native IOMMUFD API where you have fine grained control of the
>> IOMMU domain and model it accordingly. This is where most new feature
>> are being steered to.
>>
>> For dirty tracking 2) is required, as it needs to ensure that
>> the stage-2/parent IOMMU domain will only attach devices
>> that support dirty tracking (so far it is all homogeneous in x86, likely
>> not the case for smmuv3). Such invariant on dirty tracking provides a
>> useful guarantee to VMMs that will refuse incompatible device
>> attachments for IOMMU domains.
>>
>> For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
>> which is responsible for creating an IOMMU domain. This is contrast to
>> the 'simple API' where the IOMMU domain is created by IOMMUFD
>> automatically when it attaches to VFIO (usually referred as autodomains)
>>
>> To support dirty tracking with the advanced IOMMUFD API, it needs
>> similar logic, where IOMMU domains are created and devices attached to
>> compatible domains. Essentially mimmicing kernel
>> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
>> to IOAS attach.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>> Right now the only alternative to a userspace autodomains implementation
>> is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
>> IOAS attach. So opted for autodomains userspace approach to avoid the
>> duplication of hwpt-alloc flags vs attach-ioas flags. I lack mdev real
>> drivers atm, so testing with those is still TBD.
>>
>> Opinions, comments, welcome!
>> ---
>>   backends/iommufd.c            | 29 +++++++++++++
>>   backends/trace-events         |  1 +
>>   hw/vfio/iommufd.c             | 78 +++++++++++++++++++++++++++++++++++
>>   include/hw/vfio/vfio-common.h |  9 ++++
>>   include/sysemu/iommufd.h      |  4 ++
>>   5 files changed, 121 insertions(+)
>>
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> index 8486894f1b3f..2970135af4b9 100644
>> --- a/backends/iommufd.c
>> +++ b/backends/iommufd.c
>> @@ -211,6 +211,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be,
>> uint32_t ioas_id,
>>       return ret;
>>   }
>>
>> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
>> +                               uint32_t pt_id, uint32_t flags,
>> +                               uint32_t data_type, uint32_t data_len,
>> +                               void *data_ptr, uint32_t *out_hwpt)
>> +{
>> +    int ret;
>> +    struct iommu_hwpt_alloc alloc_hwpt = {
>> +        .size = sizeof(struct iommu_hwpt_alloc),
>> +        .flags = flags,
>> +        .dev_id = dev_id,
>> +        .pt_id = pt_id,
>> +        .data_type = data_type,
>> +        .data_len = data_len,
>> +        .data_uptr = (uint64_t)data_ptr,
>> +        .__reserved = 0,
>> +    };
>> +
>> +    ret = ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
>> +    trace_iommufd_backend_alloc_hwpt(iommufd, dev_id, pt_id, flags, data_type,
>> +                                     data_len, (uint64_t)data_ptr,
>> +                                     alloc_hwpt.out_hwpt_id, ret);
>> +    if (ret) {
>> +        error_report("IOMMU_HWPT_ALLOC failed: %m");
>> +    } else {
>> +        *out_hwpt = alloc_hwpt.out_hwpt_id;
>> +    }
>> +    return !ret ? 0 : -errno;
>> +}
>> +
>>   static const TypeInfo iommufd_backend_info = {
>>       .name = TYPE_IOMMUFD_BACKEND,
>>       .parent = TYPE_OBJECT,
>> diff --git a/backends/trace-events b/backends/trace-events
>> index d45c6e31a67e..f83a276a4253 100644
>> --- a/backends/trace-events
>> +++ b/backends/trace-events
>> @@ -13,5 +13,6 @@ iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
>>   iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t
>> size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d
>> iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
>>   iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t
>> iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d
>> iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>>   iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova,
>> uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64"
>> (%d)"
>> +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id,
>> uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t
>> out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u
>> len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
>>   iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d
>> ioas=%d (%d)"
>>   iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d
>> id=%d (%d)"
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 7d39d7a5fa51..ca7ec45e725c 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -219,10 +219,82 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice
>> *vbasedev, Error **errp)
>>       return ret;
>>   }
>>
>> +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>> +                                        VFIOIOMMUFDContainer *container,
>> +                                        Error **errp)
>> +{
>> +    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
>> +    VFIOIOASHwpt *hwpt;
>> +    Error *err = NULL;
>> +    int ret = -EINVAL;
> 
> The -EINVAL initialization is not necessary.
> 
/me nods

>> +    uint32_t hwpt_id;
>> +
>> +    /* Try to find a domain */
>> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
>> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
>> +        if (ret) {
>> +            /* -EINVAL means the domain is incompatible with the device. */
>> +            if (ret == -EINVAL) {
> 
> On error iommufd_cdev_attach_ioas_hwpt() returns -1 and not -errno, so I guess
> we need to change it.
> 

/me facepalm yes indeed

>> +                continue;
>> +            }
>> +            return ret;
>> +        } else {
>> +            vbasedev->hwpt = hwpt;
>> +            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>> +            return 0;
>> +        }
>> +    }
>> +
>> +    ret = iommufd_backend_alloc_hwpt(iommufd,
>> +                                     vbasedev->iommufd_dev.devid,
>> +                                     container->ioas_id, 0, 0, 0,
> 
> Should we explicitly pass IOMMU_HWPT_DATA_NONE instead of 0?
> 
That's better yes.

>> +                                     NULL, &hwpt_id);
>> +    if (ret) {
>> +        error_append_hint(&err,
>> +                   "Failed to allocate HWPT for device %s. Fallback to IOAS
>> attach\n",
>> +                   vbasedev->name);
>> +        warn_report_err(err);
>> +        return ret;
>> +    }
>> +
>> +    hwpt = g_malloc0(sizeof(*hwpt));
>> +    hwpt->hwpt_id = hwpt_id;
>> +    QLIST_INIT(&hwpt->device_list);
>> +
>> +    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
>> +    if (ret) {
>> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>> +        g_free(hwpt);
>> +        return ret;
>> +    }
>> +
>> +    vbasedev->hwpt = hwpt;
>> +    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>> +    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
>> +    return 0;
> 
> I think we need to improve error handling in this function.
> There are various places where err is not freed/reported, not NULL-ed before
> re-used, or used in error_append_hint() although it can be NULL.
> If there are places where Error can be ignored, we can pass NULL instead of &err.
> Plus, errp parameter passed to this function is never used.
> 
Let me fix all those for the non-RFC v3.

>> +}
>> +
>> +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
>> +                                         VFIOIOMMUFDContainer *container)
>> +{
>> +    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
>> +
>> +    QLIST_REMOVE(vbasedev, hwpt_next);
>> +    QLIST_REMOVE(hwpt, next);
> 
> Shouldn't QLIST_REMOVE(hwpt, next) be moved inside the if?
> 
yes

>> +    if (QLIST_EMPTY(&hwpt->device_list)) {
>> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>> +        g_free(hwpt);
>> +    }
>> +}
>> +
>>   static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
>>                                            VFIOIOMMUFDContainer *container,
>>                                            Error **errp)
>>   {
>> +    if (!iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
>> +        return 0;
>> +    }
> 
> If errp is used in iommufd_cdev_autodomains_get() eventually, we need to make
> sure it doens't contain an Error object before using it again below.
> 
yes


Re: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 2 months, 1 week ago
On 12/02/2024 13:56, Joao Martins wrote:
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index 8486894f1b3f..2970135af4b9 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -211,6 +211,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>      return ret;
>  }
>  
> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
> +                               uint32_t pt_id, uint32_t flags,
> +                               uint32_t data_type, uint32_t data_len,
> +                               void *data_ptr, uint32_t *out_hwpt)

This function looks to be against the style of the file. Which is to have
IOMMUFDBackend as an argument rather than passing the file descriptor directly.

Likewise for dev_id now that we have IOMMUFDDevice structure.

I've fixed that for the next version (whether or not we require Zhengzhong series).

	Joao
Re: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Jason Gunthorpe 2 months, 1 week ago
On Mon, Feb 12, 2024 at 01:56:37PM +0000, Joao Martins wrote:
> There's generally two modes of operation for IOMMUFD:
> 
> * The simple user API which intends to perform relatively simple things
> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
> and mainly performs IOAS_MAP and UNMAP.
> 
> * The native IOMMUFD API where you have fine grained control of the
> IOMMU domain and model it accordingly. This is where most new feature
> are being steered to.
> 
> For dirty tracking 2) is required, as it needs to ensure that
> the stage-2/parent IOMMU domain will only attach devices
> that support dirty tracking (so far it is all homogeneous in x86, likely
> not the case for smmuv3). Such invariant on dirty tracking provides a
> useful guarantee to VMMs that will refuse incompatible device
> attachments for IOMMU domains.
> 
> For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
> which is responsible for creating an IOMMU domain. This is contrast to
> the 'simple API' where the IOMMU domain is created by IOMMUFD
> automatically when it attaches to VFIO (usually referred as autodomains)
> 
> To support dirty tracking with the advanced IOMMUFD API, it needs
> similar logic, where IOMMU domains are created and devices attached to
> compatible domains. Essentially mimmicing kernel
> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
> to IOAS attach.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
> Right now the only alternative to a userspace autodomains implementation
> is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
> IOAS attach.

It was my expectation that VMM userspace would implement direct hwpt
support. This is needed in all kinds of cases going forward because
hwpt allocation is not uniform across iommu instances and managing
this in the kernel is only feasible for simpler cases. Dirty tracking
would be one of them.

> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
> +                               uint32_t pt_id, uint32_t flags,
> +                               uint32_t data_type, uint32_t data_len,
> +                               void *data_ptr, uint32_t *out_hwpt)
> +{
> +    int ret;
> +    struct iommu_hwpt_alloc alloc_hwpt = {
> +        .size = sizeof(struct iommu_hwpt_alloc),
> +        .flags = flags,
> +        .dev_id = dev_id,
> +        .pt_id = pt_id,
> +        .data_type = data_type,
> +        .data_len = data_len,
> +        .data_uptr = (uint64_t)data_ptr,
> +        .__reserved = 0,

Unless the coding style requirs this it is not necessary to zero in
the __reserved within a bracketed in initializer..

> +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> +                                        VFIOIOMMUFDContainer *container,
> +                                        Error **errp)
> +{
> +    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
> +    VFIOIOASHwpt *hwpt;
> +    Error *err = NULL;
> +    int ret = -EINVAL;
> +    uint32_t hwpt_id;
> +
> +    /* Try to find a domain */
> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
> +        if (ret) {
> +            /* -EINVAL means the domain is incompatible with the device. */
> +            if (ret == -EINVAL) {

Please send a kernel kdoc patch to document this..

The approach looks good to me

The nesting patches surely need this too?

Jason
Re: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 2 months, 1 week ago
On 12/02/2024 16:27, Jason Gunthorpe wrote:
> On Mon, Feb 12, 2024 at 01:56:37PM +0000, Joao Martins wrote:
>> There's generally two modes of operation for IOMMUFD:
>>
>> * The simple user API which intends to perform relatively simple things
>> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
>> and mainly performs IOAS_MAP and UNMAP.
>>
>> * The native IOMMUFD API where you have fine grained control of the
>> IOMMU domain and model it accordingly. This is where most new feature
>> are being steered to.
>>
>> For dirty tracking 2) is required, as it needs to ensure that
>> the stage-2/parent IOMMU domain will only attach devices
>> that support dirty tracking (so far it is all homogeneous in x86, likely
>> not the case for smmuv3). Such invariant on dirty tracking provides a
>> useful guarantee to VMMs that will refuse incompatible device
>> attachments for IOMMU domains.
>>
>> For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
>> which is responsible for creating an IOMMU domain. This is contrast to
>> the 'simple API' where the IOMMU domain is created by IOMMUFD
>> automatically when it attaches to VFIO (usually referred as autodomains)
>>
>> To support dirty tracking with the advanced IOMMUFD API, it needs
>> similar logic, where IOMMU domains are created and devices attached to
>> compatible domains. Essentially mimmicing kernel
>> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
>> to IOAS attach.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>> Right now the only alternative to a userspace autodomains implementation
>> is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
>> IOAS attach.
> 
> It was my expectation that VMM userspace would implement direct hwpt
> support. This is needed in all kinds of cases going forward because
> hwpt allocation is not uniform across iommu instances and managing
> this in the kernel is only feasible for simpler cases. Dirty tracking
> would be one of them.
> 

/me nods

>> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
>> +                               uint32_t pt_id, uint32_t flags,
>> +                               uint32_t data_type, uint32_t data_len,
>> +                               void *data_ptr, uint32_t *out_hwpt)
>> +{
>> +    int ret;
>> +    struct iommu_hwpt_alloc alloc_hwpt = {
>> +        .size = sizeof(struct iommu_hwpt_alloc),
>> +        .flags = flags,
>> +        .dev_id = dev_id,
>> +        .pt_id = pt_id,
>> +        .data_type = data_type,
>> +        .data_len = data_len,
>> +        .data_uptr = (uint64_t)data_ptr,
>> +        .__reserved = 0,
> 
> Unless the coding style requirs this it is not necessary to zero in
> the __reserved within a bracketed in initializer..
> 

Ah yes; and no other helper is doing this, so definitely doesn't look code
style. I'll remove it.

>> +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>> +                                        VFIOIOMMUFDContainer *container,
>> +                                        Error **errp)
>> +{
>> +    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
>> +    VFIOIOASHwpt *hwpt;
>> +    Error *err = NULL;
>> +    int ret = -EINVAL;
>> +    uint32_t hwpt_id;
>> +
>> +    /* Try to find a domain */
>> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
>> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
>> +        if (ret) {
>> +            /* -EINVAL means the domain is incompatible with the device. */
>> +            if (ret == -EINVAL) {
> 
> Please send a kernel kdoc patch to document this..
> 

Ack

> The approach looks good to me
> 
> The nesting patches surely need this too?

From what I understand, yes, but they seem to be able to hid this inside
intel-iommu for the parent hwpt allocation somehow. I'll let them comment;
RE: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain creation
Posted by Duan, Zhenzhong 1 month, 3 weeks ago

>-----Original Message-----
>From: Joao Martins <joao.m.martins@oracle.com>
>Subject: Re: [PATCH RFCv2 2/8] vfio/iommufd: Introduce auto domain
>creation
>
>On 12/02/2024 16:27, Jason Gunthorpe wrote:
>> On Mon, Feb 12, 2024 at 01:56:37PM +0000, Joao Martins wrote:
>>> There's generally two modes of operation for IOMMUFD:
>>>
>>> * The simple user API which intends to perform relatively simple things
>>> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
>>> and mainly performs IOAS_MAP and UNMAP.
>>>
>>> * The native IOMMUFD API where you have fine grained control of the
>>> IOMMU domain and model it accordingly. This is where most new feature
>>> are being steered to.
>>>
>>> For dirty tracking 2) is required, as it needs to ensure that
>>> the stage-2/parent IOMMU domain will only attach devices
>>> that support dirty tracking (so far it is all homogeneous in x86, likely
>>> not the case for smmuv3). Such invariant on dirty tracking provides a
>>> useful guarantee to VMMs that will refuse incompatible device
>>> attachments for IOMMU domains.
>>>
>>> For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
>>> which is responsible for creating an IOMMU domain. This is contrast to
>>> the 'simple API' where the IOMMU domain is created by IOMMUFD
>>> automatically when it attaches to VFIO (usually referred as autodomains)
>>>
>>> To support dirty tracking with the advanced IOMMUFD API, it needs
>>> similar logic, where IOMMU domains are created and devices attached to
>>> compatible domains. Essentially mimmicing kernel
>>> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
>>> to IOAS attach.
>>>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> ---
>>> Right now the only alternative to a userspace autodomains
>implementation
>>> is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
>>> IOAS attach.
>>
>> It was my expectation that VMM userspace would implement direct hwpt
>> support. This is needed in all kinds of cases going forward because
>> hwpt allocation is not uniform across iommu instances and managing
>> this in the kernel is only feasible for simpler cases. Dirty tracking
>> would be one of them.
>>
>
>/me nods
>
>>> +int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
>>> +                               uint32_t pt_id, uint32_t flags,
>>> +                               uint32_t data_type, uint32_t data_len,
>>> +                               void *data_ptr, uint32_t *out_hwpt)
>>> +{
>>> +    int ret;
>>> +    struct iommu_hwpt_alloc alloc_hwpt = {
>>> +        .size = sizeof(struct iommu_hwpt_alloc),
>>> +        .flags = flags,
>>> +        .dev_id = dev_id,
>>> +        .pt_id = pt_id,
>>> +        .data_type = data_type,
>>> +        .data_len = data_len,
>>> +        .data_uptr = (uint64_t)data_ptr,
>>> +        .__reserved = 0,
>>
>> Unless the coding style requirs this it is not necessary to zero in
>> the __reserved within a bracketed in initializer..
>>
>
>Ah yes; and no other helper is doing this, so definitely doesn't look code
>style. I'll remove it.
>
>>> +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>>> +                                        VFIOIOMMUFDContainer *container,
>>> +                                        Error **errp)
>>> +{
>>> +    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
>>> +    VFIOIOASHwpt *hwpt;
>>> +    Error *err = NULL;
>>> +    int ret = -EINVAL;
>>> +    uint32_t hwpt_id;
>>> +
>>> +    /* Try to find a domain */
>>> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
>>> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id,
>&err);
>>> +        if (ret) {
>>> +            /* -EINVAL means the domain is incompatible with the device. */
>>> +            if (ret == -EINVAL) {
>>
>> Please send a kernel kdoc patch to document this..
>>
>
>Ack
>
>> The approach looks good to me
>>
>> The nesting patches surely need this too?
>
>From what I understand, yes, but they seem to be able to hid this inside
>intel-iommu for the parent hwpt allocation somehow. I'll let them comment;

Yes, we have similar implementation in nesting series. See vtd_device_attach_container() in
https://lists.gnu.org/archive/html/qemu-devel/2024-01/msg02750.html

Thanks
Zhenzhong