[PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation

Joao Martins posted 10 patches 4 months, 2 weeks ago
Maintainers: Yi Liu <yi.l.liu@intel.com>, Eric Auger <eric.auger@redhat.com>, Zhenzhong Duan <zhenzhong.duan@intel.com>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>
[PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 4 months, 2 weeks ago
There's generally two modes of operation for IOMMUFD:

* The simple user API which intends to perform relatively simple things
with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
and mainly performs IOAS_MAP and UNMAP.

* The native IOMMUFD API where you have fine grained control of the
IOMMU domain and model it accordingly. This is where most new feature
are being steered to.

For dirty tracking 2) is required, as it needs to ensure that
the stage-2/parent IOMMU domain will only attach devices
that support dirty tracking (so far it is all homogeneous in x86, likely
not the case for smmuv3). Such invariant on dirty tracking provides a
useful guarantee to VMMs that will refuse incompatible device
attachments for IOMMU domains.

Dirty tracking insurance is enforced via HWPT_ALLOC, which is
responsible for creating an IOMMU domain. This is contrast to the
'simple API' where the IOMMU domain is created by IOMMUFD automatically
when it attaches to VFIO (usually referred as autodomains) but it has
the needed handling for mdevs.

To support dirty tracking with the advanced IOMMUFD API, it needs
similar logic, where IOMMU domains are created and devices attached to
compatible domains. Essentially mimmicing kernel
iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
to IOAS attach (which again is always the case for mdevs).

The auto domain logic allows different IOMMU domains to be created when
DMA dirty tracking is not desired (and VF can provide it), and others where
it is. Here is not used in this way here given how VFIODevice migration
state is initialized after the device attachment. But such mixed mode of
IOMMU dirty tracking + device dirty tracking is an improvement that can
be added on. Keep the 'all of nothing' approach that we have been using
so far between container vs device dirty tracking.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 include/hw/vfio/vfio-common.h |  9 ++++
 include/sysemu/iommufd.h      |  4 ++
 backends/iommufd.c            | 29 +++++++++++
 hw/vfio/iommufd.c             | 91 +++++++++++++++++++++++++++++++++++
 backends/trace-events         |  1 +
 5 files changed, 134 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index e8ddf92bb185..82c5a4aaa61e 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -95,10 +95,17 @@ typedef struct VFIOHostDMAWindow {
 
 typedef struct IOMMUFDBackend IOMMUFDBackend;
 
+typedef struct VFIOIOASHwpt {
+    uint32_t hwpt_id;
+    QLIST_HEAD(, VFIODevice) device_list;
+    QLIST_ENTRY(VFIOIOASHwpt) next;
+} VFIOIOASHwpt;
+
 typedef struct VFIOIOMMUFDContainer {
     VFIOContainerBase bcontainer;
     IOMMUFDBackend *be;
     uint32_t ioas_id;
+    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
 } VFIOIOMMUFDContainer;
 
 OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD);
@@ -134,6 +141,8 @@ typedef struct VFIODevice {
     HostIOMMUDevice *hiod;
     int devid;
     IOMMUFDBackend *iommufd;
+    VFIOIOASHwpt *hwpt;
+    QLIST_ENTRY(VFIODevice) hwpt_next;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 57d502a1c79a..35a8cec9780f 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -50,6 +50,10 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
 bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
                                      uint32_t *type, void *data, uint32_t len,
                                      uint64_t *caps, Error **errp);
+int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt);
 
 #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 2b3d51af26d2..f5f73eaf4a1a 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -208,6 +208,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
     return ret;
 }
 
+int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt)
+{
+    int ret, fd = be->fd;
+    struct iommu_hwpt_alloc alloc_hwpt = {
+        .size = sizeof(struct iommu_hwpt_alloc),
+        .flags = flags,
+        .dev_id = dev_id,
+        .pt_id = pt_id,
+        .data_type = data_type,
+        .data_len = data_len,
+        .data_uptr = (uint64_t)data_ptr,
+    };
+
+    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
+    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
+                                     data_len, (uint64_t)data_ptr,
+                                     alloc_hwpt.out_hwpt_id, ret);
+    if (ret) {
+        ret = -errno;
+        error_report("IOMMU_HWPT_ALLOC failed: %m");
+    } else {
+        *out_hwpt = alloc_hwpt.out_hwpt_id;
+    }
+    return ret;
+}
+
 bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
                                      uint32_t *type, void *data, uint32_t len,
                                      uint64_t *caps, Error **errp)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 5a5993b17c2e..2ca9a32cc7b6 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -212,10 +212,95 @@ static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
     return true;
 }
 
+static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
+                                         VFIOIOMMUFDContainer *container,
+                                         Error **errp)
+{
+    IOMMUFDBackend *iommufd = vbasedev->iommufd;
+    uint32_t flags = 0;
+    VFIOIOASHwpt *hwpt;
+    uint32_t hwpt_id;
+    int ret;
+
+    /* Try to find a domain */
+    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
+        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, NULL);
+        if (ret) {
+            /* -EINVAL means the domain is incompatible with the device. */
+            if (ret == -EINVAL) {
+                continue;
+            }
+
+            return false;
+        } else {
+            vbasedev->hwpt = hwpt;
+            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+            return true;
+        }
+    }
+
+    ret = iommufd_backend_alloc_hwpt(iommufd,
+                                     vbasedev->devid,
+                                     container->ioas_id, flags,
+                                     IOMMU_HWPT_DATA_NONE, 0, NULL, &hwpt_id);
+    if (ret) {
+        /*
+         * When there's no domains allocated we can still attempt a hardware
+         * pagetable allocation for an mdev, which ends up returning -ENOENT
+         * This is benign and meant to fallback into IOAS attach, hence don't
+         * set errp.
+         */
+        return false;
+    }
+
+    hwpt = g_malloc0(sizeof(*hwpt));
+    hwpt->hwpt_id = hwpt_id;
+    QLIST_INIT(&hwpt->device_list);
+
+    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
+    if (ret) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+        return false;
+    }
+
+    vbasedev->hwpt = hwpt;
+    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
+    return true;
+}
+
+static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
+                                         VFIOIOMMUFDContainer *container)
+{
+    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
+
+    QLIST_REMOVE(vbasedev, hwpt_next);
+    if (QLIST_EMPTY(&hwpt->device_list)) {
+        QLIST_REMOVE(hwpt, next);
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+    }
+}
+
 static bool iommufd_cdev_attach_container(VFIODevice *vbasedev,
                                           VFIOIOMMUFDContainer *container,
                                           Error **errp)
 {
+    if (iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
+        return true;
+    }
+
+    /*
+     * iommufd_cdev_autodomains_get() may have an expected failure (-ENOENT)
+     * for mdevs as they aren't real physical devices. @errp is only set
+     * when real failures happened i.e. failing to attach to a newly created
+     * hardware pagetable. These are fatal and should fail the attachment.
+     */
+    if (*errp) {
+        return false;
+    }
+
     return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
 }
 
@@ -224,6 +309,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
 {
     Error *err = NULL;
 
+    if (vbasedev->hwpt) {
+        iommufd_cdev_autodomains_put(vbasedev, container);
+        return;
+    }
+
     if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
         error_report_err(err);
     }
@@ -354,6 +444,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
     container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
     container->be = vbasedev->iommufd;
     container->ioas_id = ioas_id;
+    QLIST_INIT(&container->hwpt_list);
 
     bcontainer = &container->bcontainer;
     vfio_address_space_insert(space, bcontainer);
diff --git a/backends/trace-events b/backends/trace-events
index 211e6f374adc..4d8ac02fe7d6 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size
 iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
 iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
 iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d"
+iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
 iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
-- 
2.17.2
Re: [PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
Posted by Cédric Le Goater 4 months, 2 weeks ago
On 7/8/24 4:34 PM, Joao Martins wrote:
> There's generally two modes of operation for IOMMUFD:
> 
> * The simple user API which intends to perform relatively simple things
> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
> and mainly performs IOAS_MAP and UNMAP.
> 
> * The native IOMMUFD API where you have fine grained control of the
> IOMMU domain and model it accordingly. This is where most new feature
> are being steered to.
> 
> For dirty tracking 2) is required, as it needs to ensure that
> the stage-2/parent IOMMU domain will only attach devices
> that support dirty tracking (so far it is all homogeneous in x86, likely
> not the case for smmuv3). Such invariant on dirty tracking provides a
> useful guarantee to VMMs that will refuse incompatible device
> attachments for IOMMU domains.
> 
> Dirty tracking insurance is enforced via HWPT_ALLOC, which is
> responsible for creating an IOMMU domain. This is contrast to the
> 'simple API' where the IOMMU domain is created by IOMMUFD automatically
> when it attaches to VFIO (usually referred as autodomains) but it has
> the needed handling for mdevs.
> 
> To support dirty tracking with the advanced IOMMUFD API, it needs
> similar logic, where IOMMU domains are created and devices attached to
> compatible domains. Essentially mimmicing kernel
> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
> to IOAS attach (which again is always the case for mdevs).
> 
> The auto domain logic allows different IOMMU domains to be created when
> DMA dirty tracking is not desired (and VF can provide it), and others where
> it is. Here is not used in this way here given how VFIODevice migration
> state is initialized after the device attachment. But such mixed mode of
> IOMMU dirty tracking + device dirty tracking is an improvement that can
> be added on. Keep the 'all of nothing' approach that we have been using
> so far between container vs device dirty tracking.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>   include/hw/vfio/vfio-common.h |  9 ++++
>   include/sysemu/iommufd.h      |  4 ++
>   backends/iommufd.c            | 29 +++++++++++
>   hw/vfio/iommufd.c             | 91 +++++++++++++++++++++++++++++++++++
>   backends/trace-events         |  1 +
>   5 files changed, 134 insertions(+)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index e8ddf92bb185..82c5a4aaa61e 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -95,10 +95,17 @@ typedef struct VFIOHostDMAWindow {
>   
>   typedef struct IOMMUFDBackend IOMMUFDBackend;
>   
> +typedef struct VFIOIOASHwpt {
> +    uint32_t hwpt_id;
> +    QLIST_HEAD(, VFIODevice) device_list;
> +    QLIST_ENTRY(VFIOIOASHwpt) next;
> +} VFIOIOASHwpt;
> +
>   typedef struct VFIOIOMMUFDContainer {
>       VFIOContainerBase bcontainer;
>       IOMMUFDBackend *be;
>       uint32_t ioas_id;
> +    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
>   } VFIOIOMMUFDContainer;
>   
>   OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD);
> @@ -134,6 +141,8 @@ typedef struct VFIODevice {
>       HostIOMMUDevice *hiod;
>       int devid;
>       IOMMUFDBackend *iommufd;
> +    VFIOIOASHwpt *hwpt;
> +    QLIST_ENTRY(VFIODevice) hwpt_next;
>   } VFIODevice;
>   
>   struct VFIODeviceOps {
> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
> index 57d502a1c79a..35a8cec9780f 100644
> --- a/include/sysemu/iommufd.h
> +++ b/include/sysemu/iommufd.h
> @@ -50,6 +50,10 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>   bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
>                                        uint32_t *type, void *data, uint32_t len,
>                                        uint64_t *caps, Error **errp);
> +int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
> +                               uint32_t pt_id, uint32_t flags,
> +                               uint32_t data_type, uint32_t data_len,
> +                               void *data_ptr, uint32_t *out_hwpt);
>   
>   #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
>   #endif
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index 2b3d51af26d2..f5f73eaf4a1a 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -208,6 +208,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>       return ret;
>   }
>   
> +int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
> +                               uint32_t pt_id, uint32_t flags,
> +                               uint32_t data_type, uint32_t data_len,
> +                               void *data_ptr, uint32_t *out_hwpt)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_hwpt_alloc alloc_hwpt = {
> +        .size = sizeof(struct iommu_hwpt_alloc),
> +        .flags = flags,
> +        .dev_id = dev_id,
> +        .pt_id = pt_id,
> +        .data_type = data_type,
> +        .data_len = data_len,
> +        .data_uptr = (uint64_t)data_ptr,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
> +    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
> +                                     data_len, (uint64_t)data_ptr,
> +                                     alloc_hwpt.out_hwpt_id, ret);
> +    if (ret) {
> +        ret = -errno;
> +        error_report("IOMMU_HWPT_ALLOC failed: %m");

Please add an 'Error **errp' parameter.

> +    } else {
> +        *out_hwpt = alloc_hwpt.out_hwpt_id;
> +    }
> +    return ret;
> +}
> +
>   bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
>                                        uint32_t *type, void *data, uint32_t len,
>                                        uint64_t *caps, Error **errp)
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 5a5993b17c2e..2ca9a32cc7b6 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -212,10 +212,95 @@ static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
>       return true;
>   }
>   
> +static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> +                                         VFIOIOMMUFDContainer *container,
> +                                         Error **errp)
> +{
> +    IOMMUFDBackend *iommufd = vbasedev->iommufd;
> +    uint32_t flags = 0;
> +    VFIOIOASHwpt *hwpt;
> +    uint32_t hwpt_id;
> +    int ret;
> +
> +    /* Try to find a domain */
> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, NULL);
> +        if (ret) {
> +            /* -EINVAL means the domain is incompatible with the device. */
> +            if (ret == -EINVAL) {
> +                continue;
> +            }

We should propagate an error in the false case.

> +            return false;
> +        } else {
> +            vbasedev->hwpt = hwpt;
> +            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
> +            return true;
> +        }
> +    }
> +
> +    ret = iommufd_backend_alloc_hwpt(iommufd,
> +                                     vbasedev->devid,
> +                                     container->ioas_id, flags,
> +                                     IOMMU_HWPT_DATA_NONE, 0, NULL, &hwpt_id);
> +    if (ret) {
> +        /*
> +         * When there's no domains allocated we can still attempt a hardware
> +         * pagetable allocation for an mdev, which ends up returning -ENOENT
> +         * This is benign and meant to fallback into IOAS attach, hence don't
> +         * set errp.
> +         */

same here.

> +        return false;
> +    }
> +
> +    hwpt = g_malloc0(sizeof(*hwpt));
> +    hwpt->hwpt_id = hwpt_id;
> +    QLIST_INIT(&hwpt->device_list);
> +
> +    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
> +    if (ret) {
> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
> +        g_free(hwpt);
> +        return false;
> +    }
> +
> +    vbasedev->hwpt = hwpt;
> +    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
> +    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
> +    return true;
> +}
> +
> +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
> +                                         VFIOIOMMUFDContainer *container)
> +{
> +    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
> +
> +    QLIST_REMOVE(vbasedev, hwpt_next);
> +    if (QLIST_EMPTY(&hwpt->device_list)) {
> +        QLIST_REMOVE(hwpt, next);
> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
> +        g_free(hwpt);
> +    }
> +}
> +
>   static bool iommufd_cdev_attach_container(VFIODevice *vbasedev,
>                                             VFIOIOMMUFDContainer *container,
>                                             Error **errp)
>   {
> +    if (iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
> +        return true;
> +    }
> +
> +    /*
> +     * iommufd_cdev_autodomains_get() may have an expected failure (-ENOENT)
> +     * for mdevs as they aren't real physical devices. @errp is only set
> +     * when real failures happened i.e. failing to attach to a newly created
> +     * hardware pagetable. These are fatal and should fail the attachment.
> +     */
> +    if (*errp) {
> +        return false;
> +    }
> +
>       return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
>   }
>   
> @@ -224,6 +309,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
>   {
>       Error *err = NULL;
>   
> +    if (vbasedev->hwpt) {
> +        iommufd_cdev_autodomains_put(vbasedev, container);
> +        return;
> +    }
> +
>       if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
>           error_report_err(err);
>       }
> @@ -354,6 +444,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
>       container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
>       container->be = vbasedev->iommufd;
>       container->ioas_id = ioas_id;
> +    QLIST_INIT(&container->hwpt_list);
>   
>       bcontainer = &container->bcontainer;
>       vfio_address_space_insert(space, bcontainer);
> diff --git a/backends/trace-events b/backends/trace-events
> index 211e6f374adc..4d8ac02fe7d6 100644
> --- a/backends/trace-events
> +++ b/backends/trace-events
> @@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size
>   iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>   iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>   iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d"
> +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
>   iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
Re: [PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 4 months, 2 weeks ago
On 09/07/2024 07:50, Cédric Le Goater wrote:
> On 7/8/24 4:34 PM, Joao Martins wrote:
>> There's generally two modes of operation for IOMMUFD:
>>
>> * The simple user API which intends to perform relatively simple things
>> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
>> and mainly performs IOAS_MAP and UNMAP.
>>
>> * The native IOMMUFD API where you have fine grained control of the
>> IOMMU domain and model it accordingly. This is where most new feature
>> are being steered to.
>>
>> For dirty tracking 2) is required, as it needs to ensure that
>> the stage-2/parent IOMMU domain will only attach devices
>> that support dirty tracking (so far it is all homogeneous in x86, likely
>> not the case for smmuv3). Such invariant on dirty tracking provides a
>> useful guarantee to VMMs that will refuse incompatible device
>> attachments for IOMMU domains.
>>
>> Dirty tracking insurance is enforced via HWPT_ALLOC, which is
>> responsible for creating an IOMMU domain. This is contrast to the
>> 'simple API' where the IOMMU domain is created by IOMMUFD automatically
>> when it attaches to VFIO (usually referred as autodomains) but it has
>> the needed handling for mdevs.
>>
>> To support dirty tracking with the advanced IOMMUFD API, it needs
>> similar logic, where IOMMU domains are created and devices attached to
>> compatible domains. Essentially mimmicing kernel
>> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
>> to IOAS attach (which again is always the case for mdevs).
>>
>> The auto domain logic allows different IOMMU domains to be created when
>> DMA dirty tracking is not desired (and VF can provide it), and others where
>> it is. Here is not used in this way here given how VFIODevice migration
>> state is initialized after the device attachment. But such mixed mode of
>> IOMMU dirty tracking + device dirty tracking is an improvement that can
>> be added on. Keep the 'all of nothing' approach that we have been using
>> so far between container vs device dirty tracking.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>   include/hw/vfio/vfio-common.h |  9 ++++
>>   include/sysemu/iommufd.h      |  4 ++
>>   backends/iommufd.c            | 29 +++++++++++
>>   hw/vfio/iommufd.c             | 91 +++++++++++++++++++++++++++++++++++
>>   backends/trace-events         |  1 +
>>   5 files changed, 134 insertions(+)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index e8ddf92bb185..82c5a4aaa61e 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -95,10 +95,17 @@ typedef struct VFIOHostDMAWindow {
>>     typedef struct IOMMUFDBackend IOMMUFDBackend;
>>   +typedef struct VFIOIOASHwpt {
>> +    uint32_t hwpt_id;
>> +    QLIST_HEAD(, VFIODevice) device_list;
>> +    QLIST_ENTRY(VFIOIOASHwpt) next;
>> +} VFIOIOASHwpt;
>> +
>>   typedef struct VFIOIOMMUFDContainer {
>>       VFIOContainerBase bcontainer;
>>       IOMMUFDBackend *be;
>>       uint32_t ioas_id;
>> +    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
>>   } VFIOIOMMUFDContainer;
>>     OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD);
>> @@ -134,6 +141,8 @@ typedef struct VFIODevice {
>>       HostIOMMUDevice *hiod;
>>       int devid;
>>       IOMMUFDBackend *iommufd;
>> +    VFIOIOASHwpt *hwpt;
>> +    QLIST_ENTRY(VFIODevice) hwpt_next;
>>   } VFIODevice;
>>     struct VFIODeviceOps {
>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>> index 57d502a1c79a..35a8cec9780f 100644
>> --- a/include/sysemu/iommufd.h
>> +++ b/include/sysemu/iommufd.h
>> @@ -50,6 +50,10 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t
>> ioas_id,
>>   bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
>>                                        uint32_t *type, void *data, uint32_t len,
>>                                        uint64_t *caps, Error **errp);
>> +int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
>> +                               uint32_t pt_id, uint32_t flags,
>> +                               uint32_t data_type, uint32_t data_len,
>> +                               void *data_ptr, uint32_t *out_hwpt);
>>     #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
>>   #endif
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> index 2b3d51af26d2..f5f73eaf4a1a 100644
>> --- a/backends/iommufd.c
>> +++ b/backends/iommufd.c
>> @@ -208,6 +208,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be,
>> uint32_t ioas_id,
>>       return ret;
>>   }
>>   +int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
>> +                               uint32_t pt_id, uint32_t flags,
>> +                               uint32_t data_type, uint32_t data_len,
>> +                               void *data_ptr, uint32_t *out_hwpt)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_hwpt_alloc alloc_hwpt = {
>> +        .size = sizeof(struct iommu_hwpt_alloc),
>> +        .flags = flags,
>> +        .dev_id = dev_id,
>> +        .pt_id = pt_id,
>> +        .data_type = data_type,
>> +        .data_len = data_len,
>> +        .data_uptr = (uint64_t)data_ptr,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
>> +    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
>> +                                     data_len, (uint64_t)data_ptr,
>> +                                     alloc_hwpt.out_hwpt_id, ret);
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_HWPT_ALLOC failed: %m");
> 
> Please add an 'Error **errp' parameter.
> 

Yes, but I need to do some special casing for mdev then. I'll expand below.

>> +    } else {
>> +        *out_hwpt = alloc_hwpt.out_hwpt_id;
>> +    }
>> +    return ret;
>> +}
>> +
>>   bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
>>                                        uint32_t *type, void *data, uint32_t len,
>>                                        uint64_t *caps, Error **errp)
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 5a5993b17c2e..2ca9a32cc7b6 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -212,10 +212,95 @@ static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice
>> *vbasedev, Error **errp)
>>       return true;
>>   }
>>   +static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>> +                                         VFIOIOMMUFDContainer *container,
>> +                                         Error **errp)
>> +{
>> +    IOMMUFDBackend *iommufd = vbasedev->iommufd;
>> +    uint32_t flags = 0;
>> +    VFIOIOASHwpt *hwpt;
>> +    uint32_t hwpt_id;
>> +    int ret;
>> +
>> +    /* Try to find a domain */
>> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
>> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, NULL);
>> +        if (ret) {
>> +            /* -EINVAL means the domain is incompatible with the device. */
>> +            if (ret == -EINVAL) {
>> +                continue;
>> +            }
> 
> We should propagate an error in the false case.
> 
Same as before due to mdev.

When we are dealing with physical devices this is correct to handle as an error
and stops the attach. However we may still get a -ENOENT for mdevs which is why
I wasn't making an error here.

It just so seems confusing that all the code tries not to special case mdev,
that I went along with it. But honestly on a second look I should really special
case mdev to avoid autodomains given that we know it will fail. This way I
should be able to propagate all errors causes and not wrongly hide errors that
might be applicable for physical devices.

Now the -EINVAL is not an error we are supposed to propagate, as we expect
-EINVAL as a 'this domain can't be used try the next one'.

>> +            return false;
>> +        } else {
>> +            vbasedev->hwpt = hwpt;
>> +            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>> +            return true;
>> +        }
>> +    }
>> +
>> +    ret = iommufd_backend_alloc_hwpt(iommufd,
>> +                                     vbasedev->devid,
>> +                                     container->ioas_id, flags,
>> +                                     IOMMU_HWPT_DATA_NONE, 0, NULL, &hwpt_id);
>> +    if (ret) {
>> +        /*
>> +         * When there's no domains allocated we can still attempt a hardware
>> +         * pagetable allocation for an mdev, which ends up returning -ENOENT
>> +         * This is benign and meant to fallback into IOAS attach, hence don't
>> +         * set errp.
>> +         */
> 
> same here.
> 
As the comment says the previous paragraph was the reason for not propagating an
error here.

>> +        return false;
>> +    }
>> +
>> +    hwpt = g_malloc0(sizeof(*hwpt));
>> +    hwpt->hwpt_id = hwpt_id;
>> +    QLIST_INIT(&hwpt->device_list);
>> +
>> +    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
>> +    if (ret) {
>> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>> +        g_free(hwpt);
>> +        return false;
>> +    }
>> +
>> +    vbasedev->hwpt = hwpt;
>> +    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>> +    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
>> +    return true;
>> +}
>> +
>> +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
>> +                                         VFIOIOMMUFDContainer *container)
>> +{
>> +    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
>> +
>> +    QLIST_REMOVE(vbasedev, hwpt_next);
>> +    if (QLIST_EMPTY(&hwpt->device_list)) {
>> +        QLIST_REMOVE(hwpt, next);
>> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>> +        g_free(hwpt);
>> +    }
>> +}
>> +
>>   static bool iommufd_cdev_attach_container(VFIODevice *vbasedev,
>>                                             VFIOIOMMUFDContainer *container,
>>                                             Error **errp)
>>   {
>> +    if (iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
>> +        return true;
>> +    }
>> +
>> +    /*
>> +     * iommufd_cdev_autodomains_get() may have an expected failure (-ENOENT)
>> +     * for mdevs as they aren't real physical devices. @errp is only set
>> +     * when real failures happened i.e. failing to attach to a newly created
>> +     * hardware pagetable. These are fatal and should fail the attachment.
>> +     */
>> +    if (*errp) {
>> +        return false;
>> +    }
>> +
>>       return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
>>   }
>>   @@ -224,6 +309,11 @@ static void iommufd_cdev_detach_container(VFIODevice
>> *vbasedev,
>>   {
>>       Error *err = NULL;
>>   +    if (vbasedev->hwpt) {
>> +        iommufd_cdev_autodomains_put(vbasedev, container);
>> +        return;
>> +    }
>> +
>>       if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
>>           error_report_err(err);
>>       }
>> @@ -354,6 +444,7 @@ static bool iommufd_cdev_attach(const char *name,
>> VFIODevice *vbasedev,
>>       container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
>>       container->be = vbasedev->iommufd;
>>       container->ioas_id = ioas_id;
>> +    QLIST_INIT(&container->hwpt_list);
>>         bcontainer = &container->bcontainer;
>>       vfio_address_space_insert(space, bcontainer);
>> diff --git a/backends/trace-events b/backends/trace-events
>> index 211e6f374adc..4d8ac02fe7d6 100644
>> --- a/backends/trace-events
>> +++ b/backends/trace-events
>> @@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t
>> iova, uint64_t size
>>   iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t
>> iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d
>> iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>>   iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova,
>> uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64"
>> (%d)"
>>   iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d"
>> +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id,
>> uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t
>> out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u
>> len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
>>   iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d
>> id=%d (%d)"
> 


RE: [PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
Posted by Duan, Zhenzhong 4 months, 2 weeks ago

>-----Original Message-----
>From: Joao Martins <joao.m.martins@oracle.com>
>Subject: [PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
>
>There's generally two modes of operation for IOMMUFD:
>
>* The simple user API which intends to perform relatively simple things
>with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
>and mainly performs IOAS_MAP and UNMAP.
>
>* The native IOMMUFD API where you have fine grained control of the
>IOMMU domain and model it accordingly. This is where most new feature
>are being steered to.
>
>For dirty tracking 2) is required, as it needs to ensure that
>the stage-2/parent IOMMU domain will only attach devices
>that support dirty tracking (so far it is all homogeneous in x86, likely
>not the case for smmuv3). Such invariant on dirty tracking provides a
>useful guarantee to VMMs that will refuse incompatible device
>attachments for IOMMU domains.
>
>Dirty tracking insurance is enforced via HWPT_ALLOC, which is
>responsible for creating an IOMMU domain. This is contrast to the
>'simple API' where the IOMMU domain is created by IOMMUFD
>automatically
>when it attaches to VFIO (usually referred as autodomains) but it has
>the needed handling for mdevs.
>
>To support dirty tracking with the advanced IOMMUFD API, it needs
>similar logic, where IOMMU domains are created and devices attached to
>compatible domains. Essentially mimmicing kernel
>iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
>to IOAS attach (which again is always the case for mdevs).
>
>The auto domain logic allows different IOMMU domains to be created when
>DMA dirty tracking is not desired (and VF can provide it), and others where
>it is. Here is not used in this way here given how VFIODevice migration
>state is initialized after the device attachment. But such mixed mode of
>IOMMU dirty tracking + device dirty tracking is an improvement that can
>be added on. Keep the 'all of nothing' approach that we have been using
>so far between container vs device dirty tracking.
>
>Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>---
> include/hw/vfio/vfio-common.h |  9 ++++
> include/sysemu/iommufd.h      |  4 ++
> backends/iommufd.c            | 29 +++++++++++
> hw/vfio/iommufd.c             | 91
>+++++++++++++++++++++++++++++++++++
> backends/trace-events         |  1 +
> 5 files changed, 134 insertions(+)
>
>diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>index e8ddf92bb185..82c5a4aaa61e 100644
>--- a/include/hw/vfio/vfio-common.h
>+++ b/include/hw/vfio/vfio-common.h
>@@ -95,10 +95,17 @@ typedef struct VFIOHostDMAWindow {
>
> typedef struct IOMMUFDBackend IOMMUFDBackend;
>
>+typedef struct VFIOIOASHwpt {
>+    uint32_t hwpt_id;
>+    QLIST_HEAD(, VFIODevice) device_list;
>+    QLIST_ENTRY(VFIOIOASHwpt) next;
>+} VFIOIOASHwpt;
>+
> typedef struct VFIOIOMMUFDContainer {
>     VFIOContainerBase bcontainer;
>     IOMMUFDBackend *be;
>     uint32_t ioas_id;
>+    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
> } VFIOIOMMUFDContainer;
>
> OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer,
>VFIO_IOMMU_IOMMUFD);
>@@ -134,6 +141,8 @@ typedef struct VFIODevice {
>     HostIOMMUDevice *hiod;
>     int devid;
>     IOMMUFDBackend *iommufd;
>+    VFIOIOASHwpt *hwpt;
>+    QLIST_ENTRY(VFIODevice) hwpt_next;
> } VFIODevice;
>
> struct VFIODeviceOps {
>diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>index 57d502a1c79a..35a8cec9780f 100644
>--- a/include/sysemu/iommufd.h
>+++ b/include/sysemu/iommufd.h
>@@ -50,6 +50,10 @@ int
>iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
> bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t
>devid,
>                                      uint32_t *type, void *data, uint32_t len,
>                                      uint64_t *caps, Error **errp);
>+int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
>+                               uint32_t pt_id, uint32_t flags,
>+                               uint32_t data_type, uint32_t data_len,
>+                               void *data_ptr, uint32_t *out_hwpt);
>
> #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD
>TYPE_HOST_IOMMU_DEVICE "-iommufd"
> #endif
>diff --git a/backends/iommufd.c b/backends/iommufd.c
>index 2b3d51af26d2..f5f73eaf4a1a 100644
>--- a/backends/iommufd.c
>+++ b/backends/iommufd.c
>@@ -208,6 +208,35 @@ int
>iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>     return ret;
> }
>
>+int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
>+                               uint32_t pt_id, uint32_t flags,
>+                               uint32_t data_type, uint32_t data_len,
>+                               void *data_ptr, uint32_t *out_hwpt)
>+{
>+    int ret, fd = be->fd;
>+    struct iommu_hwpt_alloc alloc_hwpt = {
>+        .size = sizeof(struct iommu_hwpt_alloc),
>+        .flags = flags,
>+        .dev_id = dev_id,
>+        .pt_id = pt_id,
>+        .data_type = data_type,
>+        .data_len = data_len,
>+        .data_uptr = (uint64_t)data_ptr,
>+    };
>+
>+    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
>+    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
>+                                     data_len, (uint64_t)data_ptr,
>+                                     alloc_hwpt.out_hwpt_id, ret);
>+    if (ret) {
>+        ret = -errno;
>+        error_report("IOMMU_HWPT_ALLOC failed: %m");
>+    } else {
>+        *out_hwpt = alloc_hwpt.out_hwpt_id;
>+    }
>+    return ret;
>+}
>+
> bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t
>devid,
>                                      uint32_t *type, void *data, uint32_t len,
>                                      uint64_t *caps, Error **errp)
>diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>index 5a5993b17c2e..2ca9a32cc7b6 100644
>--- a/hw/vfio/iommufd.c
>+++ b/hw/vfio/iommufd.c
>@@ -212,10 +212,95 @@ static bool
>iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
>     return true;
> }
>
>+static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>+                                         VFIOIOMMUFDContainer *container,
>+                                         Error **errp)
>+{
>+    IOMMUFDBackend *iommufd = vbasedev->iommufd;
>+    uint32_t flags = 0;
>+    VFIOIOASHwpt *hwpt;
>+    uint32_t hwpt_id;
>+    int ret;
>+
>+    /* Try to find a domain */
>+    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
>+        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id,
>NULL);
>+        if (ret) {
>+            /* -EINVAL means the domain is incompatible with the device. */
>+            if (ret == -EINVAL) {
>+                continue;
>+            }
>+
>+            return false;
>+        } else {
>+            vbasedev->hwpt = hwpt;
>+            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>+            return true;
>+        }
>+    }
>+
>+    ret = iommufd_backend_alloc_hwpt(iommufd,
>+                                     vbasedev->devid,
>+                                     container->ioas_id, flags,
>+                                     IOMMU_HWPT_DATA_NONE, 0, NULL, &hwpt_id);
>+    if (ret) {
>+        /*
>+         * When there's no domains allocated we can still attempt a hardware
>+         * pagetable allocation for an mdev, which ends up returning -ENOENT
>+         * This is benign and meant to fallback into IOAS attach, hence don't
>+         * set errp.
>+         */
>+        return false;
>+    }
>+
>+    hwpt = g_malloc0(sizeof(*hwpt));
>+    hwpt->hwpt_id = hwpt_id;
>+    QLIST_INIT(&hwpt->device_list);
>+
>+    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
>+    if (ret) {
>+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>+        g_free(hwpt);
>+        return false;
>+    }
>+
>+    vbasedev->hwpt = hwpt;
>+    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>+    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
>+    return true;
>+}
>+
>+static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
>+                                         VFIOIOMMUFDContainer *container)
>+{
>+    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
>+
>+    QLIST_REMOVE(vbasedev, hwpt_next);
>+    if (QLIST_EMPTY(&hwpt->device_list)) {
>+        QLIST_REMOVE(hwpt, next);
>+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>+        g_free(hwpt);
>+    }
>+}
>+
> static bool iommufd_cdev_attach_container(VFIODevice *vbasedev,
>                                           VFIOIOMMUFDContainer *container,
>                                           Error **errp)
> {
>+    if (iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
>+        return true;
>+    }
>+
>+    /*
>+     * iommufd_cdev_autodomains_get() may have an expected failure (-
>ENOENT)
>+     * for mdevs as they aren't real physical devices. @errp is only set
>+     * when real failures happened i.e. failing to attach to a newly created
>+     * hardware pagetable. These are fatal and should fail the attachment.
>+     */
>+    if (*errp) {

Better to use ERRP_GUARD()

Thanks
Zhenzhong

>+        return false;
>+    }
>+
>     return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id,
>errp);
> }
>
>@@ -224,6 +309,11 @@ static void
>iommufd_cdev_detach_container(VFIODevice *vbasedev,
> {
>     Error *err = NULL;
>
>+    if (vbasedev->hwpt) {
>+        iommufd_cdev_autodomains_put(vbasedev, container);
>+        return;
>+    }
>+
>     if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
>         error_report_err(err);
>     }
>@@ -354,6 +444,7 @@ static bool iommufd_cdev_attach(const char *name,
>VFIODevice *vbasedev,
>     container =
>VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
>     container->be = vbasedev->iommufd;
>     container->ioas_id = ioas_id;
>+    QLIST_INIT(&container->hwpt_list);
>
>     bcontainer = &container->bcontainer;
>     vfio_address_space_insert(space, bcontainer);
>diff --git a/backends/trace-events b/backends/trace-events
>index 211e6f374adc..4d8ac02fe7d6 100644
>--- a/backends/trace-events
>+++ b/backends/trace-events
>@@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t
>ioas, uint64_t iova, uint64_t size
> iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas,
>uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping:
>iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
> iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova,
>uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64"
>size=0x%"PRIx64" (%d)"
> iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d
>ioas=%d"
>+iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t
>pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr,
>uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u
>flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u
>(%d)"
> iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d
>id=%d (%d)"
>--
>2.17.2
Re: [PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
Posted by Joao Martins 4 months, 2 weeks ago
On 09/07/2024 07:26, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Joao Martins <joao.m.martins@oracle.com>
>> Subject: [PATCH v3 04/10] vfio/iommufd: Introduce auto domain creation
>>
>> There's generally two modes of operation for IOMMUFD:
>>
>> * The simple user API which intends to perform relatively simple things
>> with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
>> and mainly performs IOAS_MAP and UNMAP.
>>
>> * The native IOMMUFD API where you have fine grained control of the
>> IOMMU domain and model it accordingly. This is where most new feature
>> are being steered to.
>>
>> For dirty tracking 2) is required, as it needs to ensure that
>> the stage-2/parent IOMMU domain will only attach devices
>> that support dirty tracking (so far it is all homogeneous in x86, likely
>> not the case for smmuv3). Such invariant on dirty tracking provides a
>> useful guarantee to VMMs that will refuse incompatible device
>> attachments for IOMMU domains.
>>
>> Dirty tracking insurance is enforced via HWPT_ALLOC, which is
>> responsible for creating an IOMMU domain. This is contrast to the
>> 'simple API' where the IOMMU domain is created by IOMMUFD
>> automatically
>> when it attaches to VFIO (usually referred as autodomains) but it has
>> the needed handling for mdevs.
>>
>> To support dirty tracking with the advanced IOMMUFD API, it needs
>> similar logic, where IOMMU domains are created and devices attached to
>> compatible domains. Essentially mimmicing kernel
>> iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
>> to IOAS attach (which again is always the case for mdevs).
>>
>> The auto domain logic allows different IOMMU domains to be created when
>> DMA dirty tracking is not desired (and VF can provide it), and others where
>> it is. Here is not used in this way here given how VFIODevice migration
>> state is initialized after the device attachment. But such mixed mode of
>> IOMMU dirty tracking + device dirty tracking is an improvement that can
>> be added on. Keep the 'all of nothing' approach that we have been using
>> so far between container vs device dirty tracking.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>> include/hw/vfio/vfio-common.h |  9 ++++
>> include/sysemu/iommufd.h      |  4 ++
>> backends/iommufd.c            | 29 +++++++++++
>> hw/vfio/iommufd.c             | 91
>> +++++++++++++++++++++++++++++++++++
>> backends/trace-events         |  1 +
>> 5 files changed, 134 insertions(+)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>> common.h
>> index e8ddf92bb185..82c5a4aaa61e 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -95,10 +95,17 @@ typedef struct VFIOHostDMAWindow {
>>
>> typedef struct IOMMUFDBackend IOMMUFDBackend;
>>
>> +typedef struct VFIOIOASHwpt {
>> +    uint32_t hwpt_id;
>> +    QLIST_HEAD(, VFIODevice) device_list;
>> +    QLIST_ENTRY(VFIOIOASHwpt) next;
>> +} VFIOIOASHwpt;
>> +
>> typedef struct VFIOIOMMUFDContainer {
>>     VFIOContainerBase bcontainer;
>>     IOMMUFDBackend *be;
>>     uint32_t ioas_id;
>> +    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
>> } VFIOIOMMUFDContainer;
>>
>> OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer,
>> VFIO_IOMMU_IOMMUFD);
>> @@ -134,6 +141,8 @@ typedef struct VFIODevice {
>>     HostIOMMUDevice *hiod;
>>     int devid;
>>     IOMMUFDBackend *iommufd;
>> +    VFIOIOASHwpt *hwpt;
>> +    QLIST_ENTRY(VFIODevice) hwpt_next;
>> } VFIODevice;
>>
>> struct VFIODeviceOps {
>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>> index 57d502a1c79a..35a8cec9780f 100644
>> --- a/include/sysemu/iommufd.h
>> +++ b/include/sysemu/iommufd.h
>> @@ -50,6 +50,10 @@ int
>> iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t
>> devid,
>>                                      uint32_t *type, void *data, uint32_t len,
>>                                      uint64_t *caps, Error **errp);
>> +int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
>> +                               uint32_t pt_id, uint32_t flags,
>> +                               uint32_t data_type, uint32_t data_len,
>> +                               void *data_ptr, uint32_t *out_hwpt);
>>
>> #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD
>> TYPE_HOST_IOMMU_DEVICE "-iommufd"
>> #endif
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> index 2b3d51af26d2..f5f73eaf4a1a 100644
>> --- a/backends/iommufd.c
>> +++ b/backends/iommufd.c
>> @@ -208,6 +208,35 @@ int
>> iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>     return ret;
>> }
>>
>> +int iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
>> +                               uint32_t pt_id, uint32_t flags,
>> +                               uint32_t data_type, uint32_t data_len,
>> +                               void *data_ptr, uint32_t *out_hwpt)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_hwpt_alloc alloc_hwpt = {
>> +        .size = sizeof(struct iommu_hwpt_alloc),
>> +        .flags = flags,
>> +        .dev_id = dev_id,
>> +        .pt_id = pt_id,
>> +        .data_type = data_type,
>> +        .data_len = data_len,
>> +        .data_uptr = (uint64_t)data_ptr,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
>> +    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
>> +                                     data_len, (uint64_t)data_ptr,
>> +                                     alloc_hwpt.out_hwpt_id, ret);
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_HWPT_ALLOC failed: %m");
>> +    } else {
>> +        *out_hwpt = alloc_hwpt.out_hwpt_id;
>> +    }
>> +    return ret;
>> +}
>> +
>> bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t
>> devid,
>>                                      uint32_t *type, void *data, uint32_t len,
>>                                      uint64_t *caps, Error **errp)
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 5a5993b17c2e..2ca9a32cc7b6 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -212,10 +212,95 @@ static bool
>> iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
>>     return true;
>> }
>>
>> +static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>> +                                         VFIOIOMMUFDContainer *container,
>> +                                         Error **errp)
>> +{
>> +    IOMMUFDBackend *iommufd = vbasedev->iommufd;
>> +    uint32_t flags = 0;
>> +    VFIOIOASHwpt *hwpt;
>> +    uint32_t hwpt_id;
>> +    int ret;
>> +
>> +    /* Try to find a domain */
>> +    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
>> +        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id,
>> NULL);
>> +        if (ret) {
>> +            /* -EINVAL means the domain is incompatible with the device. */
>> +            if (ret == -EINVAL) {
>> +                continue;
>> +            }
>> +
>> +            return false;
>> +        } else {
>> +            vbasedev->hwpt = hwpt;
>> +            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>> +            return true;
>> +        }
>> +    }
>> +
>> +    ret = iommufd_backend_alloc_hwpt(iommufd,
>> +                                     vbasedev->devid,
>> +                                     container->ioas_id, flags,
>> +                                     IOMMU_HWPT_DATA_NONE, 0, NULL, &hwpt_id);
>> +    if (ret) {
>> +        /*
>> +         * When there's no domains allocated we can still attempt a hardware
>> +         * pagetable allocation for an mdev, which ends up returning -ENOENT
>> +         * This is benign and meant to fallback into IOAS attach, hence don't
>> +         * set errp.
>> +         */
>> +        return false;
>> +    }
>> +
>> +    hwpt = g_malloc0(sizeof(*hwpt));
>> +    hwpt->hwpt_id = hwpt_id;
>> +    QLIST_INIT(&hwpt->device_list);
>> +
>> +    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp);
>> +    if (ret) {
>> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>> +        g_free(hwpt);
>> +        return false;
>> +    }
>> +
>> +    vbasedev->hwpt = hwpt;
>> +    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
>> +    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
>> +    return true;
>> +}
>> +
>> +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
>> +                                         VFIOIOMMUFDContainer *container)
>> +{
>> +    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
>> +
>> +    QLIST_REMOVE(vbasedev, hwpt_next);
>> +    if (QLIST_EMPTY(&hwpt->device_list)) {
>> +        QLIST_REMOVE(hwpt, next);
>> +        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
>> +        g_free(hwpt);
>> +    }
>> +}
>> +
>> static bool iommufd_cdev_attach_container(VFIODevice *vbasedev,
>>                                           VFIOIOMMUFDContainer *container,
>>                                           Error **errp)
>> {
>> +    if (iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
>> +        return true;
>> +    }
>> +
>> +    /*
>> +     * iommufd_cdev_autodomains_get() may have an expected failure (-
>> ENOENT)
>> +     * for mdevs as they aren't real physical devices. @errp is only set
>> +     * when real failures happened i.e. failing to attach to a newly created
>> +     * hardware pagetable. These are fatal and should fail the attachment.
>> +     */
>> +    if (*errp) {
> 
> Better to use ERRP_GUARD()
> 

OK

> Thanks
> Zhenzhong
> 
>> +        return false;
>> +    }
>> +
>>     return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id,
>> errp);
>> }
>>
>> @@ -224,6 +309,11 @@ static void
>> iommufd_cdev_detach_container(VFIODevice *vbasedev,
>> {
>>     Error *err = NULL;
>>
>> +    if (vbasedev->hwpt) {
>> +        iommufd_cdev_autodomains_put(vbasedev, container);
>> +        return;
>> +    }
>> +
>>     if (!iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
>>         error_report_err(err);
>>     }
>> @@ -354,6 +444,7 @@ static bool iommufd_cdev_attach(const char *name,
>> VFIODevice *vbasedev,
>>     container =
>> VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
>>     container->be = vbasedev->iommufd;
>>     container->ioas_id = ioas_id;
>> +    QLIST_INIT(&container->hwpt_list);
>>
>>     bcontainer = &container->bcontainer;
>>     vfio_address_space_insert(space, bcontainer);
>> diff --git a/backends/trace-events b/backends/trace-events
>> index 211e6f374adc..4d8ac02fe7d6 100644
>> --- a/backends/trace-events
>> +++ b/backends/trace-events
>> @@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t
>> ioas, uint64_t iova, uint64_t size
>> iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas,
>> uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping:
>> iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>> iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova,
>> uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64"
>> size=0x%"PRIx64" (%d)"
>> iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d
>> ioas=%d"
>> +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t
>> pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr,
>> uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u
>> flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u
>> (%d)"
>> iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d
>> id=%d (%d)"
>> --
>> 2.17.2
>