[PATCH v6 01/21] backends/iommufd: Introduce the iommufd object

Zhenzhong Duan posted 21 patches 1 year ago
There is a newer version of this series
[PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Zhenzhong Duan 1 year ago
From: Eric Auger <eric.auger@redhat.com>

Introduce an iommufd object which allows the interaction
with the host /dev/iommu device.

The /dev/iommu can have been already pre-opened outside of qemu,
in which case the fd can be passed directly along with the
iommufd object:

This allows the iommufd object to be shared accross several
subsystems (VFIO, VDPA, ...). For example, libvirt would open
the /dev/iommu once.

If no fd is passed along with the iommufd object, the /dev/iommu
is opened by the qemu code.

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
v6: remove redundant call, alloc_hwpt, get/put_ioas

 MAINTAINERS              |   7 ++
 qapi/qom.json            |  19 ++++
 include/sysemu/iommufd.h |  44 ++++++++
 backends/iommufd.c       | 228 +++++++++++++++++++++++++++++++++++++++
 backends/Kconfig         |   4 +
 backends/meson.build     |   1 +
 backends/trace-events    |  10 ++
 qemu-options.hx          |  12 +++
 8 files changed, 325 insertions(+)
 create mode 100644 include/sysemu/iommufd.h
 create mode 100644 backends/iommufd.c

diff --git a/MAINTAINERS b/MAINTAINERS
index ff1238bb98..a4891f7bda 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
 F: docs/system/s390x/vfio-ap.rst
 L: qemu-s390x@nongnu.org
 
+iommufd
+M: Yi Liu <yi.l.liu@intel.com>
+M: Eric Auger <eric.auger@redhat.com>
+S: Supported
+F: backends/iommufd.c
+F: include/sysemu/iommufd.h
+
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
diff --git a/qapi/qom.json b/qapi/qom.json
index c53ef978ff..1fd8555a75 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -794,6 +794,23 @@
 { 'struct': 'VfioUserServerProperties',
   'data': { 'socket': 'SocketAddress', 'device': 'str' } }
 
+##
+# @IOMMUFDProperties:
+#
+# Properties for iommufd objects.
+#
+# @fd: file descriptor name previously passed via 'getfd' command,
+#     which represents a pre-opened /dev/iommu.  This allows the
+#     iommufd object to be shared accross several subsystems
+#     (VFIO, VDPA, ...), and the file descriptor to be shared
+#     with other process, e.g. DPDK.  (default: QEMU opens
+#     /dev/iommu by itself)
+#
+# Since: 8.2
+##
+{ 'struct': 'IOMMUFDProperties',
+  'data': { '*fd': 'str' } }
+
 ##
 # @RngProperties:
 #
@@ -934,6 +951,7 @@
     'input-barrier',
     { 'name': 'input-linux',
       'if': 'CONFIG_LINUX' },
+    'iommufd',
     'iothread',
     'main-loop',
     { 'name': 'memory-backend-epc',
@@ -1003,6 +1021,7 @@
       'input-barrier':              'InputBarrierProperties',
       'input-linux':                { 'type': 'InputLinuxProperties',
                                       'if': 'CONFIG_LINUX' },
+      'iommufd':                    'IOMMUFDProperties',
       'iothread':                   'IothreadProperties',
       'main-loop':                  'MainLoopProperties',
       'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
new file mode 100644
index 0000000000..9b3a86f57d
--- /dev/null
+++ b/include/sysemu/iommufd.h
@@ -0,0 +1,44 @@
+#ifndef SYSEMU_IOMMUFD_H
+#define SYSEMU_IOMMUFD_H
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+#include "exec/hwaddr.h"
+#include "exec/cpu-common.h"
+
+#define TYPE_IOMMUFD_BACKEND "iommufd"
+OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
+                    IOMMUFD_BACKEND)
+#define IOMMUFD_BACKEND(obj) \
+    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
+#define IOMMUFD_BACKEND_GET_CLASS(obj) \
+    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), TYPE_IOMMUFD_BACKEND)
+#define IOMMUFD_BACKEND_CLASS(klass) \
+    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), TYPE_IOMMUFD_BACKEND)
+struct IOMMUFDBackendClass {
+    ObjectClass parent_class;
+};
+
+struct IOMMUFDBackend {
+    Object parent;
+
+    /*< protected >*/
+    int fd;            /* /dev/iommu file descriptor */
+    bool owned;        /* is the /dev/iommu opened internally */
+    QemuMutex lock;
+    uint32_t users;
+
+    /*< public >*/
+};
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
+void iommufd_backend_disconnect(IOMMUFDBackend *be);
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+                               Error **errp);
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+                            ram_addr_t size, void *vaddr, bool readonly);
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+                              hwaddr iova, ram_addr_t size);
+#endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
new file mode 100644
index 0000000000..ea3e2a8f85
--- /dev/null
+++ b/backends/iommufd.c
@@ -0,0 +1,228 @@
+/*
+ * iommufd container backend
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ *          Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/iommufd.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/module.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "monitor/monitor.h"
+#include "trace.h"
+#include <sys/ioctl.h>
+#include <linux/iommufd.h>
+
+static void iommufd_backend_init(Object *obj)
+{
+    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+    be->fd = -1;
+    be->users = 0;
+    be->owned = true;
+    qemu_mutex_init(&be->lock);
+}
+
+static void iommufd_backend_finalize(Object *obj)
+{
+    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+    if (be->owned) {
+        close(be->fd);
+        be->fd = -1;
+    }
+}
+
+static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
+{
+    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+    int fd = -1;
+
+    fd = monitor_fd_param(monitor_cur(), str, errp);
+    if (fd == -1) {
+        error_prepend(errp, "Could not parse remote object fd %s:", str);
+        return;
+    }
+    qemu_mutex_lock(&be->lock);
+    be->fd = fd;
+    be->owned = false;
+    qemu_mutex_unlock(&be->lock);
+    trace_iommu_backend_set_fd(be->fd);
+}
+
+static void iommufd_backend_class_init(ObjectClass *oc, void *data)
+{
+    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
+}
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
+{
+    int fd, ret = 0;
+
+    qemu_mutex_lock(&be->lock);
+    if (be->users == UINT32_MAX) {
+        error_setg(errp, "too many connections");
+        ret = -E2BIG;
+        goto out;
+    }
+    if (be->owned && !be->users) {
+        fd = qemu_open_old("/dev/iommu", O_RDWR);
+        if (fd < 0) {
+            error_setg_errno(errp, errno, "/dev/iommu opening failed");
+            ret = fd;
+            goto out;
+        }
+        be->fd = fd;
+    }
+    be->users++;
+out:
+    trace_iommufd_backend_connect(be->fd, be->owned,
+                                  be->users, ret);
+    qemu_mutex_unlock(&be->lock);
+    return ret;
+}
+
+void iommufd_backend_disconnect(IOMMUFDBackend *be)
+{
+    qemu_mutex_lock(&be->lock);
+    if (!be->users) {
+        goto out;
+    }
+    be->users--;
+    if (!be->users && be->owned) {
+        close(be->fd);
+        be->fd = -1;
+    }
+out:
+    trace_iommufd_backend_disconnect(be->fd, be->users);
+    qemu_mutex_unlock(&be->lock);
+}
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+                               Error **errp)
+{
+    int ret, fd = be->fd;
+    struct iommu_ioas_alloc alloc_data  = {
+        .size = sizeof(alloc_data),
+        .flags = 0,
+    };
+
+    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
+    if (ret) {
+        error_setg_errno(errp, errno, "Failed to allocate ioas");
+        return ret;
+    }
+
+    *ioas_id = alloc_data.out_ioas_id;
+    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
+
+    return ret;
+}
+
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
+{
+    int ret, fd = be->fd;
+    struct iommu_destroy des = {
+        .size = sizeof(des),
+        .id = id,
+    };
+
+    ret = ioctl(fd, IOMMU_DESTROY, &des);
+    trace_iommufd_backend_free_id(fd, id, ret);
+    if (ret) {
+        error_report("Failed to free id: %u %m", id);
+    }
+}
+
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+                            ram_addr_t size, void *vaddr, bool readonly)
+{
+    int ret, fd = be->fd;
+    struct iommu_ioas_map map = {
+        .size = sizeof(map),
+        .flags = IOMMU_IOAS_MAP_READABLE |
+                 IOMMU_IOAS_MAP_FIXED_IOVA,
+        .ioas_id = ioas_id,
+        .__reserved = 0,
+        .user_va = (uintptr_t)vaddr,
+        .iova = iova,
+        .length = size,
+    };
+
+    if (!readonly) {
+        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
+    }
+
+    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
+    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
+                                  vaddr, readonly, ret);
+    if (ret) {
+        ret = -errno;
+        error_report("IOMMU_IOAS_MAP failed: %m");
+    }
+    return ret;
+}
+
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+                              hwaddr iova, ram_addr_t size)
+{
+    int ret, fd = be->fd;
+    struct iommu_ioas_unmap unmap = {
+        .size = sizeof(unmap),
+        .ioas_id = ioas_id,
+        .iova = iova,
+        .length = size,
+    };
+
+    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
+    /*
+     * IOMMUFD takes mapping as some kind of object, unmapping
+     * nonexistent mapping is treated as deleting a nonexistent
+     * object and return ENOENT. This is different from legacy
+     * backend which allows it. vIOMMU may trigger a lot of
+     * redundant unmapping, to avoid flush the log, treat them
+     * as succeess for IOMMUFD just like legacy backend.
+     */
+    if (ret && errno == ENOENT) {
+        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
+        ret = 0;
+    } else {
+        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
+    }
+
+    if (ret) {
+        ret = -errno;
+        error_report("IOMMU_IOAS_UNMAP failed: %m");
+    }
+    return ret;
+}
+
+static const TypeInfo iommufd_backend_info = {
+    .name = TYPE_IOMMUFD_BACKEND,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(IOMMUFDBackend),
+    .instance_init = iommufd_backend_init,
+    .instance_finalize = iommufd_backend_finalize,
+    .class_size = sizeof(IOMMUFDBackendClass),
+    .class_init = iommufd_backend_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+static void register_types(void)
+{
+    type_register_static(&iommufd_backend_info);
+}
+
+type_init(register_types);
diff --git a/backends/Kconfig b/backends/Kconfig
index f35abc1609..2cb23f62fa 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -1 +1,5 @@
 source tpm/Kconfig
+
+config IOMMUFD
+    bool
+    depends on VFIO
diff --git a/backends/meson.build b/backends/meson.build
index 914c7c4afb..9a5cea480d 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -20,6 +20,7 @@ if have_vhost_user
   system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
 endif
 system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
+system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
 if have_vhost_user_crypto
   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
 endif
diff --git a/backends/trace-events b/backends/trace-events
index 652eb76a57..d45c6e31a6 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
 dbus_vmstate_post_load(int version_id) "version_id: %d"
 dbus_vmstate_loading(const char *id) "id: %s"
 dbus_vmstate_saving(const char *id) "id: %s"
+
+# iommufd.c
+iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
+iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
+iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
+iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
+iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
+iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
diff --git a/qemu-options.hx b/qemu-options.hx
index 42fd09e4de..70507c0ee6 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5224,6 +5224,18 @@ SRST
 
         The ``share`` boolean option is on by default with memfd.
 
+    ``-object iommufd,id=id[,fd=fd]``
+        Creates an iommufd backend which allows control of DMA mapping
+        through the /dev/iommu device.
+
+        The ``id`` parameter is a unique ID which frontends (such as
+        vfio-pci of vdpa) will use to connect with the iommufd backend.
+
+        The ``fd`` parameter is an optional pre-opened file descriptor
+        resulting from /dev/iommu opening. Usually the iommufd is shared
+        across all subsystems, bringing the benefit of centralized
+        reference counting.
+
     ``-object rng-builtin,id=id``
         Creates a random number generator backend which obtains entropy
         from QEMU builtin functions. The ``id`` parameter is a unique ID
-- 
2.34.1
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
Hello,

> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
> +                            ram_addr_t size, void *vaddr, bool readonly)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_map map = {
> +        .size = sizeof(map),
> +        .flags = IOMMU_IOAS_MAP_READABLE |
> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
> +        .ioas_id = ioas_id,
> +        .__reserved = 0,
> +        .user_va = (uintptr_t)vaddr,
> +        .iova = iova,
> +        .length = size,
> +    };
> +
> +    if (!readonly) {
> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
> +    }
> +
> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
> +                                  vaddr, readonly, ret);
> +    if (ret) {
> +        ret = -errno;
> +        error_report("IOMMU_IOAS_MAP failed: %m");
> +    }
> +    return ret;
> +}

When using a UEFI guest, QEMU reports errors when mapping regions
in the top PCI space :

   iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000001000 size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1)
   qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
   qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument)

   iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000004000 size=0x4000 addr=0x7fce2c980000 readonly=0 (-1)
   qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
   qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument)

This is because IOMMUFD reserved IOVAs areas are :

  [ fee00000 - feefffff ]
  [ 8000000000 - ffffffffffffffff ] (39 bits address space)

which were allocated when the device was initially attached.
The topology is basic. Something is wrong.

Thanks,

C.
RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago
Hi Cédric,

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Friday, November 17, 2023 7:10 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>Hello,
>
>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>hwaddr iova,
>> +                            ram_addr_t size, void *vaddr, bool readonly)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_map map = {
>> +        .size = sizeof(map),
>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>> +        .ioas_id = ioas_id,
>> +        .__reserved = 0,
>> +        .user_va = (uintptr_t)vaddr,
>> +        .iova = iova,
>> +        .length = size,
>> +    };
>> +
>> +    if (!readonly) {
>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>> +    }
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>> +                                  vaddr, readonly, ret);
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>> +    }
>> +    return ret;
>> +}
>
>When using a UEFI guest, QEMU reports errors when mapping regions
>in the top PCI space :
>
>   iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000001000
>size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1)
>   qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>   qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument)
>
>   iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000004000
>size=0x4000 addr=0x7fce2c980000 readonly=0 (-1)
>   qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>   qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument)
>
>This is because IOMMUFD reserved IOVAs areas are :
>
>  [ fee00000 - feefffff ]
>  [ 8000000000 - ffffffffffffffff ] (39 bits address space)
>
>which were allocated when the device was initially attached.
>The topology is basic. Something is wrong.
	
Thanks for your report. This looks a hardware limit of
host IOMMU address width(39) < guest physical address width.

A similar issue with a fix submitted below, ccing related people.
https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
It looks the fix will not work for hotplug.

Or below qemu cmdline may help:
"-cpu host,host-phys-bits-limit=39"

Thanks
Zhenzhong

Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Eric Auger 1 year ago
Hi Cédric,

On 11/17/23 12:39, Duan, Zhenzhong wrote:
> Hi Cédric,
>
>> -----Original Message-----
>> From: Cédric Le Goater <clg@redhat.com>
>> Sent: Friday, November 17, 2023 7:10 PM
>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>
>> Hello,
>>
>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> hwaddr iova,
>>> +                            ram_addr_t size, void *vaddr, bool readonly)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_map map = {
>>> +        .size = sizeof(map),
>>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>>> +        .ioas_id = ioas_id,
>>> +        .__reserved = 0,
>>> +        .user_va = (uintptr_t)vaddr,
>>> +        .iova = iova,
>>> +        .length = size,
>>> +    };
>>> +
>>> +    if (!readonly) {
>>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>>> +    }
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>>> +                                  vaddr, readonly, ret);
>>> +    if (ret) {
>>> +        ret = -errno;
>>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>>> +    }
>>> +    return ret;
>>> +}
>> When using a UEFI guest, QEMU reports errors when mapping regions
>> in the top PCI space :
>>
>>   iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000001000
>> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1)
>>   qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>   qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument)
>>
>>   iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000004000
>> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1)
>>   qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>   qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument)
>>
>> This is because IOMMUFD reserved IOVAs areas are :
>>
>>  [ fee00000 - feefffff ]
>>  [ 8000000000 - ffffffffffffffff ] (39 bits address space)
>>
>> which were allocated when the device was initially attached.
>> The topology is basic. Something is wrong.
> 	
> Thanks for your report. This looks a hardware limit of
> host IOMMU address width(39) < guest physical address width.
>
> A similar issue with a fix submitted below, ccing related people.
> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
> It looks the fix will not work for hotplug.
>
> Or below qemu cmdline may help:
> "-cpu host,host-phys-bits-limit=39"

don't you have the same issue with legacy VFIO code, you should?

Eric
>
> Thanks
> Zhenzhong
>


Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
On 11/17/23 14:29, Eric Auger wrote:
> Hi Cédric,
> 
> On 11/17/23 12:39, Duan, Zhenzhong wrote:
>> Hi Cédric,
>>
>>> -----Original Message-----
>>> From: Cédric Le Goater <clg@redhat.com>
>>> Sent: Friday, November 17, 2023 7:10 PM
>>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>>
>>> Hello,
>>>
>>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>> hwaddr iova,
>>>> +                            ram_addr_t size, void *vaddr, bool readonly)
>>>> +{
>>>> +    int ret, fd = be->fd;
>>>> +    struct iommu_ioas_map map = {
>>>> +        .size = sizeof(map),
>>>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>>>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>>>> +        .ioas_id = ioas_id,
>>>> +        .__reserved = 0,
>>>> +        .user_va = (uintptr_t)vaddr,
>>>> +        .iova = iova,
>>>> +        .length = size,
>>>> +    };
>>>> +
>>>> +    if (!readonly) {
>>>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>>>> +    }
>>>> +
>>>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>>>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>>>> +                                  vaddr, readonly, ret);
>>>> +    if (ret) {
>>>> +        ret = -errno;
>>>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>>>> +    }
>>>> +    return ret;
>>>> +}
>>> When using a UEFI guest, QEMU reports errors when mapping regions
>>> in the top PCI space :
>>>
>>>    iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000001000
>>> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1)
>>>    qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>>    qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>>> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument)
>>>
>>>    iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000004000
>>> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1)
>>>    qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>>    qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>>> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument)
>>>
>>> This is because IOMMUFD reserved IOVAs areas are :
>>>
>>>   [ fee00000 - feefffff ]
>>>   [ 8000000000 - ffffffffffffffff ] (39 bits address space)
>>>
>>> which were allocated when the device was initially attached.
>>> The topology is basic. Something is wrong.
>> 	
>> Thanks for your report. This looks a hardware limit of
>> host IOMMU address width(39) < guest physical address width.
>>
>> A similar issue with a fix submitted below, ccing related people.
>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>> It looks the fix will not work for hotplug.
>>
>> Or below qemu cmdline may help:
>> "-cpu host,host-phys-bits-limit=39"
> 
> don't you have the same issue with legacy VFIO code, you should?

I tend to be lazy and use seabios for guests on the command line.
I do see the error with legacy VFIO and uefi.

However, with the address space size work-around and iommufd, the
error is different, an EFAULT now. Some page pinning issue it seems.

Thanks,

C.


RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Friday, November 17, 2023 9:56 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>On 11/17/23 14:29, Eric Auger wrote:
>> Hi Cédric,
>>
>> On 11/17/23 12:39, Duan, Zhenzhong wrote:
>>> Hi Cédric,
>>>
>>>> -----Original Message-----
>>>> From: Cédric Le Goater <clg@redhat.com>
>>>> Sent: Friday, November 17, 2023 7:10 PM
>>>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd
>object
>>>>
>>>> Hello,
>>>>
>>>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>>> hwaddr iova,
>>>>> +                            ram_addr_t size, void *vaddr, bool readonly)
>>>>> +{
>>>>> +    int ret, fd = be->fd;
>>>>> +    struct iommu_ioas_map map = {
>>>>> +        .size = sizeof(map),
>>>>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>>>>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>>>>> +        .ioas_id = ioas_id,
>>>>> +        .__reserved = 0,
>>>>> +        .user_va = (uintptr_t)vaddr,
>>>>> +        .iova = iova,
>>>>> +        .length = size,
>>>>> +    };
>>>>> +
>>>>> +    if (!readonly) {
>>>>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>>>>> +    }
>>>>> +
>>>>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>>>>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>>>>> +                                  vaddr, readonly, ret);
>>>>> +    if (ret) {
>>>>> +        ret = -errno;
>>>>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>>>>> +    }
>>>>> +    return ret;
>>>>> +}
>>>> When using a UEFI guest, QEMU reports errors when mapping regions
>>>> in the top PCI space :
>>>>
>>>>    iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000001000
>>>> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1)
>>>>    qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>>>    qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>>>> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument)
>>>>
>>>>    iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000004000
>>>> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1)
>>>>    qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>>>    qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>>>> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument)
>>>>
>>>> This is because IOMMUFD reserved IOVAs areas are :
>>>>
>>>>   [ fee00000 - feefffff ]
>>>>   [ 8000000000 - ffffffffffffffff ] (39 bits address space)
>>>>
>>>> which were allocated when the device was initially attached.
>>>> The topology is basic. Something is wrong.
>>>
>>> Thanks for your report. This looks a hardware limit of
>>> host IOMMU address width(39) < guest physical address width.
>>>
>>> A similar issue with a fix submitted below, ccing related people.
>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>>> It looks the fix will not work for hotplug.
>>>
>>> Or below qemu cmdline may help:
>>> "-cpu host,host-phys-bits-limit=39"
>>
>> don't you have the same issue with legacy VFIO code, you should?
>
>I tend to be lazy and use seabios for guests on the command line.
>I do see the error with legacy VFIO and uefi.
>
>However, with the address space size work-around and iommufd, the
>error is different, an EFAULT now. Some page pinning issue it seems.

Yes, this reminds me of iommufd not supporting p2p mapping yet.
So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-iommufd.rst

Thanks
Zhenzhong
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
>>>> A similar issue with a fix submitted below, ccing related people.
>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>>>> It looks the fix will not work for hotplug.
>>>>
>>>> Or below qemu cmdline may help:
>>>> "-cpu host,host-phys-bits-limit=39"
>>>
>>> don't you have the same issue with legacy VFIO code, you should?
>>
>> I tend to be lazy and use seabios for guests on the command line.
>> I do see the error with legacy VFIO and uefi.
>>
>> However, with the address space size work-around and iommufd, the
>> error is different, an EFAULT now. Some page pinning issue it seems.
> 
> Yes, this reminds me of iommufd not supporting p2p mapping yet.

OK. Should we transform this error in a warning ? The code needs
at least a comment.

> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-iommufd.rst

Yes. It would be good to have a list of gaps and effects in the
documentation. See Jason's presentation at LPC.

   https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC2023_iommufd.pdf

Thanks,

C.
RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago
Hi Cédric,

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Monday, November 20, 2023 4:25 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>>>>> A similar issue with a fix submitted below, ccing related people.
>>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>>>>> It looks the fix will not work for hotplug.
>>>>>
>>>>> Or below qemu cmdline may help:
>>>>> "-cpu host,host-phys-bits-limit=39"
>>>>
>>>> don't you have the same issue with legacy VFIO code, you should?
>>>
>>> I tend to be lazy and use seabios for guests on the command line.
>>> I do see the error with legacy VFIO and uefi.
>>>
>>> However, with the address space size work-around and iommufd, the
>>> error is different, an EFAULT now. Some page pinning issue it seems.
>>
>> Yes, this reminds me of iommufd not supporting p2p mapping yet.
>
>OK. Should we transform this error in a warning ? The code needs
>at least a comment.

Make sense, though I'm not clear if there is other corner case return EFAULT.
I plan below change in v7:

diff --git a/backends/iommufd.c b/backends/iommufd.c
index 53fdac4cc0..ba58a0eb0d 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -178,7 +178,13 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
                                   vaddr, readonly, ret);
     if (ret) {
         ret = -errno;
-        error_report("IOMMU_IOAS_MAP failed: %m");
+
+        /* TODO: Not support mapping hardware PCI BAR region for now. */
+        if (errno == EFAULT) {
+            warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
+        } else {
+            error_report("IOMMU_IOAS_MAP failed: %m");
+        }
     }
     return ret;
 }

I failed to change vfio_container_dma_map print as warning because for legacy container, it's real errro.
So print after fix:

qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)

>
>> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-
>iommufd.rst
>
>Yes. It would be good to have a list of gaps and effects in the
>documentation. See Jason's presentation at LPC.
>
>
>https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC202
>3_iommufd.pdf

I see, PCI Peer to Peer and POWER/SPAPR are related to qemu iommufd implementation.
For POWER/SPAPR, we have "Supported platform" section.
Below are other gaps I can think of for now:

Gaps:
1. dirty page sync, WIP (Joao)
2. p2p dma not supported yet.
3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's a mdev from a fd.

Thanks
Zhenzhong
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
Hello Zhenzhong

On 11/20/23 11:07, Duan, Zhenzhong wrote:
> Hi Cédric,
> 
>> -----Original Message-----
>> From: Cédric Le Goater <clg@redhat.com>
>> Sent: Monday, November 20, 2023 4:25 PM
>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>
>>>>>> A similar issue with a fix submitted below, ccing related people.
>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>>>>>> It looks the fix will not work for hotplug.
>>>>>>
>>>>>> Or below qemu cmdline may help:
>>>>>> "-cpu host,host-phys-bits-limit=39"
>>>>>
>>>>> don't you have the same issue with legacy VFIO code, you should?
>>>>
>>>> I tend to be lazy and use seabios for guests on the command line.
>>>> I do see the error with legacy VFIO and uefi.
>>>>
>>>> However, with the address space size work-around and iommufd, the
>>>> error is different, an EFAULT now. Some page pinning issue it seems.
>>>
>>> Yes, this reminds me of iommufd not supporting p2p mapping yet.
>>
>> OK. Should we transform this error in a warning ? The code needs
>> at least a comment.
> 
> Make sense, though I'm not clear if there is other corner case return EFAULT.

yep. That's the problem.

> I plan below change in v7:
> 
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> index 53fdac4cc0..ba58a0eb0d 100644
> --- a/backends/iommufd.c
> +++ b/backends/iommufd.c
> @@ -178,7 +178,13 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
>                                     vaddr, readonly, ret);
>       if (ret) {
>           ret = -errno;
> -        error_report("IOMMU_IOAS_MAP failed: %m");
> +
> +        /* TODO: Not support mapping hardware PCI BAR region for now. */
> +        if (errno == EFAULT) {
> +            warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
> +        } else {
> +            error_report("IOMMU_IOAS_MAP failed: %m");
> +        }
>       }
>       return ret;
>   }
> 
> I failed to change vfio_container_dma_map print as warning because for legacy container, it's real errro.
> So print after fix:
> 
> qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
> qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)

I am OK with that. Let's see what the others have to say.

>>
>>> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-
>> iommufd.rst
>>
>> Yes. It would be good to have a list of gaps and effects in the
>> documentation. See Jason's presentation at LPC.
>>
>>
>> https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC202
>> 3_iommufd.pdf
> 
> I see, PCI Peer to Peer and POWER/SPAPR are related to qemu iommufd implementation.
> For POWER/SPAPR, we have "Supported platform" section.

yes.

> Below are other gaps I can think of for now:
> 
> Gaps:
> 1. dirty page sync, WIP (Joao)
> 2. p2p dma not supported yet.
> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's a mdev from a fd.

Call the section Caveats maybe?


Thanks,

C.


RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Tuesday, November 21, 2023 1:09 AM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>Hello Zhenzhong
>
>On 11/20/23 11:07, Duan, Zhenzhong wrote:
>> Hi Cédric,
>>
>>> -----Original Message-----
>>> From: Cédric Le Goater <clg@redhat.com>
>>> Sent: Monday, November 20, 2023 4:25 PM
>>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd
>object
>>>
>>>>>>> A similar issue with a fix submitted below, ccing related people.
>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>>>>>>> It looks the fix will not work for hotplug.
>>>>>>>
>>>>>>> Or below qemu cmdline may help:
>>>>>>> "-cpu host,host-phys-bits-limit=39"
>>>>>>
>>>>>> don't you have the same issue with legacy VFIO code, you should?
>>>>>
>>>>> I tend to be lazy and use seabios for guests on the command line.
>>>>> I do see the error with legacy VFIO and uefi.
>>>>>
>>>>> However, with the address space size work-around and iommufd, the
>>>>> error is different, an EFAULT now. Some page pinning issue it seems.
>>>>
>>>> Yes, this reminds me of iommufd not supporting p2p mapping yet.
>>>
>>> OK. Should we transform this error in a warning ? The code needs
>>> at least a comment.
>>
>> Make sense, though I'm not clear if there is other corner case return EFAULT.
>
>yep. That's the problem.
>
>> I plan below change in v7:
>>
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> index 53fdac4cc0..ba58a0eb0d 100644
>> --- a/backends/iommufd.c
>> +++ b/backends/iommufd.c
>> @@ -178,7 +178,13 @@ int iommufd_backend_map_dma(IOMMUFDBackend
>*be, uint32_t ioas_id, hwaddr iova,
>>                                     vaddr, readonly, ret);
>>       if (ret) {
>>           ret = -errno;
>> -        error_report("IOMMU_IOAS_MAP failed: %m");
>> +
>> +        /* TODO: Not support mapping hardware PCI BAR region for now. */
>> +        if (errno == EFAULT) {
>> +            warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
>> +        } else {
>> +            error_report("IOMMU_IOAS_MAP failed: %m");
>> +        }
>>       }
>>       return ret;
>>   }
>>
>> I failed to change vfio_container_dma_map print as warning because for legacy
>container, it's real errro.
>> So print after fix:
>>
>> qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI
>BAR?
>> qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620,
>0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
>
>I am OK with that. Let's see what the others have to say.
>
>>>
>>>> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-
>>> iommufd.rst
>>>
>>> Yes. It would be good to have a list of gaps and effects in the
>>> documentation. See Jason's presentation at LPC.
>>>
>>>
>>>
>https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC202
>>> 3_iommufd.pdf
>>
>> I see, PCI Peer to Peer and POWER/SPAPR are related to qemu iommufd
>implementation.
>> For POWER/SPAPR, we have "Supported platform" section.
>
>yes.
>
>> Below are other gaps I can think of for now:
>>
>> Gaps:
>> 1. dirty page sync, WIP (Joao)
>> 2. p2p dma not supported yet.
>> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's
>a mdev from a fd.
>
>Call the section Caveats maybe?

Got it.

Thanks
Zhenzhong
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
Hello Zhenzhong,

>>> Below are other gaps I can think of for now:
>>>
>>> Gaps:
>>> 1. dirty page sync, WIP (Joao)
>>> 2. p2p dma not supported yet.
>>> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's
>> a mdev from a fd.
>>
>> Call the section Caveats maybe?
> 
> Got it.

It looks like v7 should be ready by rc2 (next week). I would then merge
in vfio-next and wait a week before sending a QEMU-9.0 PR.

Thanks,

C.
RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Tuesday, November 21, 2023 4:06 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>Hello Zhenzhong,
>
>>>> Below are other gaps I can think of for now:
>>>>
>>>> Gaps:
>>>> 1. dirty page sync, WIP (Joao)
>>>> 2. p2p dma not supported yet.
>>>> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know
>it's
>>> a mdev from a fd.
>>>
>>> Call the section Caveats maybe?
>>
>> Got it.
>
>It looks like v7 should be ready by rc2 (next week). I would then merge
>in vfio-next and wait a week before sending a QEMU-9.0 PR.

Got it, I'll send out soon.

Thanks
Zhenzhong
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
On 11/17/23 12:39, Duan, Zhenzhong wrote:
> Hi Cédric,
> 
>> -----Original Message-----
>> From: Cédric Le Goater <clg@redhat.com>
>> Sent: Friday, November 17, 2023 7:10 PM
>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>
>> Hello,
>>
>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> hwaddr iova,
>>> +                            ram_addr_t size, void *vaddr, bool readonly)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_map map = {
>>> +        .size = sizeof(map),
>>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>>> +        .ioas_id = ioas_id,
>>> +        .__reserved = 0,
>>> +        .user_va = (uintptr_t)vaddr,
>>> +        .iova = iova,
>>> +        .length = size,
>>> +    };
>>> +
>>> +    if (!readonly) {
>>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>>> +    }
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>>> +                                  vaddr, readonly, ret);
>>> +    if (ret) {
>>> +        ret = -errno;
>>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>>> +    }
>>> +    return ret;
>>> +}
>>
>> When using a UEFI guest, QEMU reports errors when mapping regions
>> in the top PCI space :
>>
>>    iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000001000
>> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1)
>>    qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>    qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument)
>>
>>    iommufd_backend_map_dma  iommufd=10 ioas=2 iova=0x380000004000
>> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1)
>>    qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument
>>    qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150,
>> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument)
>>
>> This is because IOMMUFD reserved IOVAs areas are :
>>
>>   [ fee00000 - feefffff ]
>>   [ 8000000000 - ffffffffffffffff ] (39 bits address space)
>>
>> which were allocated when the device was initially attached.
>> The topology is basic. Something is wrong.
> 	
> Thanks for your report. This looks a hardware limit of
> host IOMMU address width(39) < guest physical address width.
>
  > A similar issue with a fix submitted below, ccing related people.
> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
> It looks the fix will not work for hotplug.
> 
> Or below qemu cmdline may help:
> "-cpu host,host-phys-bits-limit=39"

Not that much. The IOMMU_IOAS_MAP failure becomes a "Bad address".

Thanks,

C.




Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Eric Auger 1 year ago
Hi Zhenzhong,

On 11/14/23 11:09, Zhenzhong Duan wrote:
> From: Eric Auger <eric.auger@redhat.com>
>
> Introduce an iommufd object which allows the interaction
> with the host /dev/iommu device.
>
> The /dev/iommu can have been already pre-opened outside of qemu,
> in which case the fd can be passed directly along with the
> iommufd object:
>
> This allows the iommufd object to be shared accross several
> subsystems (VFIO, VDPA, ...). For example, libvirt would open
> the /dev/iommu once.
>
> If no fd is passed along with the iommufd object, the /dev/iommu
> is opened by the qemu code.
>
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
> v6: remove redundant call, alloc_hwpt, get/put_ioas
>
>  MAINTAINERS              |   7 ++
>  qapi/qom.json            |  19 ++++
>  include/sysemu/iommufd.h |  44 ++++++++
>  backends/iommufd.c       | 228 +++++++++++++++++++++++++++++++++++++++
>  backends/Kconfig         |   4 +
>  backends/meson.build     |   1 +
>  backends/trace-events    |  10 ++
>  qemu-options.hx          |  12 +++
>  8 files changed, 325 insertions(+)
>  create mode 100644 include/sysemu/iommufd.h
>  create mode 100644 backends/iommufd.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ff1238bb98..a4891f7bda 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>  F: docs/system/s390x/vfio-ap.rst
>  L: qemu-s390x@nongnu.org
>  
> +iommufd
> +M: Yi Liu <yi.l.liu@intel.com>
> +M: Eric Auger <eric.auger@redhat.com>
Zhenzhong, don't you want to be added here?
> +S: Supported
> +F: backends/iommufd.c
> +F: include/sysemu/iommufd.h
> +
>  vhost
>  M: Michael S. Tsirkin <mst@redhat.com>
>  S: Supported
> diff --git a/qapi/qom.json b/qapi/qom.json
> index c53ef978ff..1fd8555a75 100644
> --- a/qapi/qom.json
> +++ b/qapi/qom.json
> @@ -794,6 +794,23 @@
>  { 'struct': 'VfioUserServerProperties',
>    'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>  
> +##
> +# @IOMMUFDProperties:
> +#
> +# Properties for iommufd objects.
> +#
> +# @fd: file descriptor name previously passed via 'getfd' command,

"previously passed via 'getfd' command", I wonder if this applies here or whether it is copy/paste of 
RemoteObjectProperties.fd doc?

> +#     which represents a pre-opened /dev/iommu.  This allows the
> +#     iommufd object to be shared accross several subsystems
> +#     (VFIO, VDPA, ...), and the file descriptor to be shared
> +#     with other process, e.g. DPDK.  (default: QEMU opens
> +#     /dev/iommu by itself)
> +#
> +# Since: 8.2
> +##
> +{ 'struct': 'IOMMUFDProperties',
> +  'data': { '*fd': 'str' } }
> +
>  ##
>  # @RngProperties:
>  #
> @@ -934,6 +951,7 @@
>      'input-barrier',
>      { 'name': 'input-linux',
>        'if': 'CONFIG_LINUX' },
> +    'iommufd',
>      'iothread',
>      'main-loop',
>      { 'name': 'memory-backend-epc',
> @@ -1003,6 +1021,7 @@
>        'input-barrier':              'InputBarrierProperties',
>        'input-linux':                { 'type': 'InputLinuxProperties',
>                                        'if': 'CONFIG_LINUX' },
> +      'iommufd':                    'IOMMUFDProperties',
>        'iothread':                   'IothreadProperties',
>        'main-loop':                  'MainLoopProperties',
>        'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
> new file mode 100644
> index 0000000000..9b3a86f57d
> --- /dev/null
> +++ b/include/sysemu/iommufd.h
> @@ -0,0 +1,44 @@
> +#ifndef SYSEMU_IOMMUFD_H
> +#define SYSEMU_IOMMUFD_H
> +
> +#include "qom/object.h"
> +#include "qemu/thread.h"
> +#include "exec/hwaddr.h"
> +#include "exec/cpu-common.h"
> +
> +#define TYPE_IOMMUFD_BACKEND "iommufd"
> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
> +                    IOMMUFD_BACKEND)
> +#define IOMMUFD_BACKEND(obj) \
> +    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \
> +    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), TYPE_IOMMUFD_BACKEND)
> +#define IOMMUFD_BACKEND_CLASS(klass) \
> +    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), TYPE_IOMMUFD_BACKEND)
> +struct IOMMUFDBackendClass {
> +    ObjectClass parent_class;
> +};
> +
> +struct IOMMUFDBackend {
> +    Object parent;
> +
> +    /*< protected >*/
> +    int fd;            /* /dev/iommu file descriptor */
> +    bool owned;        /* is the /dev/iommu opened internally */
> +    QemuMutex lock;
> +    uint32_t users;
> +
> +    /*< public >*/
> +};
> +
> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
> +void iommufd_backend_disconnect(IOMMUFDBackend *be);
> +
> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
> +                               Error **errp);
> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
> +                            ram_addr_t size, void *vaddr, bool readonly);
> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
> +                              hwaddr iova, ram_addr_t size);
> +#endif
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> new file mode 100644
> index 0000000000..ea3e2a8f85
> --- /dev/null
> +++ b/backends/iommufd.c
> @@ -0,0 +1,228 @@
> +/*
> + * iommufd container backend
> + *
> + * Copyright (C) 2023 Intel Corporation.
> + * Copyright Red Hat, Inc. 2023
> + *
> + * Authors: Yi Liu <yi.l.liu@intel.com>
> + *          Eric Auger <eric.auger@redhat.com>
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "sysemu/iommufd.h"
> +#include "qapi/error.h"
> +#include "qapi/qmp/qerror.h"
> +#include "qemu/module.h"
> +#include "qom/object_interfaces.h"
> +#include "qemu/error-report.h"
> +#include "monitor/monitor.h"
> +#include "trace.h"
> +#include <sys/ioctl.h>
> +#include <linux/iommufd.h>
> +
> +static void iommufd_backend_init(Object *obj)
> +{
> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
> +
> +    be->fd = -1;
> +    be->users = 0;
> +    be->owned = true;
> +    qemu_mutex_init(&be->lock);
> +}
> +
> +static void iommufd_backend_finalize(Object *obj)
> +{
> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
> +
> +    if (be->owned) {
> +        close(be->fd);
> +        be->fd = -1;
> +    }
> +}
> +
> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
> +{
> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
> +    int fd = -1;
> +
> +    fd = monitor_fd_param(monitor_cur(), str, errp);
> +    if (fd == -1) {
> +        error_prepend(errp, "Could not parse remote object fd %s:", str);
> +        return;
> +    }
> +    qemu_mutex_lock(&be->lock);
> +    be->fd = fd;
> +    be->owned = false;
> +    qemu_mutex_unlock(&be->lock);
> +    trace_iommu_backend_set_fd(be->fd);
> +}
> +
> +static void iommufd_backend_class_init(ObjectClass *oc, void *data)
> +{
> +    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
> +}
> +
> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
> +{
> +    int fd, ret = 0;
> +
> +    qemu_mutex_lock(&be->lock);
> +    if (be->users == UINT32_MAX) {
> +        error_setg(errp, "too many connections");
> +        ret = -E2BIG;
> +        goto out;
> +    }
> +    if (be->owned && !be->users) {
> +        fd = qemu_open_old("/dev/iommu", O_RDWR);
> +        if (fd < 0) {
> +            error_setg_errno(errp, errno, "/dev/iommu opening failed");
> +            ret = fd;
> +            goto out;
> +        }
> +        be->fd = fd;
> +    }
> +    be->users++;
> +out:
> +    trace_iommufd_backend_connect(be->fd, be->owned,
> +                                  be->users, ret);
> +    qemu_mutex_unlock(&be->lock);
> +    return ret;
> +}
> +
> +void iommufd_backend_disconnect(IOMMUFDBackend *be)
> +{
> +    qemu_mutex_lock(&be->lock);
> +    if (!be->users) {
> +        goto out;
> +    }
> +    be->users--;
> +    if (!be->users && be->owned) {
> +        close(be->fd);
> +        be->fd = -1;
> +    }
> +out:
> +    trace_iommufd_backend_disconnect(be->fd, be->users);
> +    qemu_mutex_unlock(&be->lock);
> +}
> +
> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
> +                               Error **errp)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_alloc alloc_data  = {
> +        .size = sizeof(alloc_data),
> +        .flags = 0,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
> +    if (ret) {
> +        error_setg_errno(errp, errno, "Failed to allocate ioas");
> +        return ret;
> +    }
> +
> +    *ioas_id = alloc_data.out_ioas_id;
> +    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
> +
> +    return ret;
> +}
> +
> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_destroy des = {
> +        .size = sizeof(des),
> +        .id = id,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_DESTROY, &des);
> +    trace_iommufd_backend_free_id(fd, id, ret);
> +    if (ret) {
> +        error_report("Failed to free id: %u %m", id);
> +    }
> +}
> +
> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
> +                            ram_addr_t size, void *vaddr, bool readonly)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_map map = {
> +        .size = sizeof(map),
> +        .flags = IOMMU_IOAS_MAP_READABLE |
> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
> +        .ioas_id = ioas_id,
> +        .__reserved = 0,
> +        .user_va = (uintptr_t)vaddr,
> +        .iova = iova,
> +        .length = size,
> +    };
> +
> +    if (!readonly) {
> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
> +    }
> +
> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
> +                                  vaddr, readonly, ret);
> +    if (ret) {
> +        ret = -errno;
> +        error_report("IOMMU_IOAS_MAP failed: %m");
> +    }
> +    return ret;
> +}
> +
> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
> +                              hwaddr iova, ram_addr_t size)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_unmap unmap = {
> +        .size = sizeof(unmap),
> +        .ioas_id = ioas_id,
> +        .iova = iova,
> +        .length = size,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
> +    /*
> +     * IOMMUFD takes mapping as some kind of object, unmapping
> +     * nonexistent mapping is treated as deleting a nonexistent
> +     * object and return ENOENT. This is different from legacy
> +     * backend which allows it. vIOMMU may trigger a lot of
> +     * redundant unmapping, to avoid flush the log, treat them
> +     * as succeess for IOMMUFD just like legacy backend.
> +     */
> +    if (ret && errno == ENOENT) {
> +        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
> +        ret = 0;
> +    } else {
> +        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
> +    }
> +
> +    if (ret) {
> +        ret = -errno;
> +        error_report("IOMMU_IOAS_UNMAP failed: %m");
> +    }
> +    return ret;
> +}
> +
> +static const TypeInfo iommufd_backend_info = {
> +    .name = TYPE_IOMMUFD_BACKEND,
> +    .parent = TYPE_OBJECT,
> +    .instance_size = sizeof(IOMMUFDBackend),
> +    .instance_init = iommufd_backend_init,
> +    .instance_finalize = iommufd_backend_finalize,
> +    .class_size = sizeof(IOMMUFDBackendClass),
> +    .class_init = iommufd_backend_class_init,
> +    .interfaces = (InterfaceInfo[]) {
> +        { TYPE_USER_CREATABLE },
> +        { }
> +    }
> +};
> +
> +static void register_types(void)
> +{
> +    type_register_static(&iommufd_backend_info);
> +}
> +
> +type_init(register_types);
> diff --git a/backends/Kconfig b/backends/Kconfig
> index f35abc1609..2cb23f62fa 100644
> --- a/backends/Kconfig
> +++ b/backends/Kconfig
> @@ -1 +1,5 @@
>  source tpm/Kconfig
> +
> +config IOMMUFD
> +    bool
> +    depends on VFIO
I don't know the state of vDPA/iommufd integration but this extra might
be added in short term.
> diff --git a/backends/meson.build b/backends/meson.build
> index 914c7c4afb..9a5cea480d 100644
> --- a/backends/meson.build
> +++ b/backends/meson.build
> @@ -20,6 +20,7 @@ if have_vhost_user
>    system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
>  endif
>  system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
> +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
>  if have_vhost_user_crypto
>    system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
>  endif
> diff --git a/backends/trace-events b/backends/trace-events
> index 652eb76a57..d45c6e31a6 100644
> --- a/backends/trace-events
> +++ b/backends/trace-events
> @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
>  dbus_vmstate_post_load(int version_id) "version_id: %d"
>  dbus_vmstate_loading(const char *id) "id: %s"
>  dbus_vmstate_saving(const char *id) "id: %s"
> +
> +# iommufd.c
> +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
> +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
> +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
> +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
> +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
> +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
> +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
> +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 42fd09e4de..70507c0ee6 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -5224,6 +5224,18 @@ SRST
>  
>          The ``share`` boolean option is on by default with memfd.
>  
> +    ``-object iommufd,id=id[,fd=fd]``
> +        Creates an iommufd backend which allows control of DMA mapping
> +        through the /dev/iommu device.
> +
> +        The ``id`` parameter is a unique ID which frontends (such as
> +        vfio-pci of vdpa) will use to connect with the iommufd backend.
> +
> +        The ``fd`` parameter is an optional pre-opened file descriptor
> +        resulting from /dev/iommu opening. Usually the iommufd is shared
> +        across all subsystems, bringing the benefit of centralized
> +        reference counting.
> +
>      ``-object rng-builtin,id=id``
>          Creates a random number generator backend which obtains entropy
>          from QEMU builtin functions. The ``id`` parameter is a unique ID
RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago
Hi Eric,

>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Sent: Wednesday, November 15, 2023 8:53 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>Hi Zhenzhong,
>
>On 11/14/23 11:09, Zhenzhong Duan wrote:
>> From: Eric Auger <eric.auger@redhat.com>
>>
>> Introduce an iommufd object which allows the interaction
>> with the host /dev/iommu device.
>>
>> The /dev/iommu can have been already pre-opened outside of qemu,
>> in which case the fd can be passed directly along with the
>> iommufd object:
>>
>> This allows the iommufd object to be shared accross several
>> subsystems (VFIO, VDPA, ...). For example, libvirt would open
>> the /dev/iommu once.
>>
>> If no fd is passed along with the iommufd object, the /dev/iommu
>> is opened by the qemu code.
>>
>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> v6: remove redundant call, alloc_hwpt, get/put_ioas
>>
>>  MAINTAINERS              |   7 ++
>>  qapi/qom.json            |  19 ++++
>>  include/sysemu/iommufd.h |  44 ++++++++
>>  backends/iommufd.c       | 228 +++++++++++++++++++++++++++++++++++++++
>>  backends/Kconfig         |   4 +
>>  backends/meson.build     |   1 +
>>  backends/trace-events    |  10 ++
>>  qemu-options.hx          |  12 +++
>>  8 files changed, 325 insertions(+)
>>  create mode 100644 include/sysemu/iommufd.h
>>  create mode 100644 backends/iommufd.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index ff1238bb98..a4891f7bda 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>>  F: docs/system/s390x/vfio-ap.rst
>>  L: qemu-s390x@nongnu.org
>>
>> +iommufd
>> +M: Yi Liu <yi.l.liu@intel.com>
>> +M: Eric Auger <eric.auger@redhat.com>
>Zhenzhong, don't you want to be added here?
>> +S: Supported
>> +F: backends/iommufd.c
>> +F: include/sysemu/iommufd.h
>> +
>>  vhost
>>  M: Michael S. Tsirkin <mst@redhat.com>
>>  S: Supported
>> diff --git a/qapi/qom.json b/qapi/qom.json
>> index c53ef978ff..1fd8555a75 100644
>> --- a/qapi/qom.json
>> +++ b/qapi/qom.json
>> @@ -794,6 +794,23 @@
>>  { 'struct': 'VfioUserServerProperties',
>>    'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>>
>> +##
>> +# @IOMMUFDProperties:
>> +#
>> +# Properties for iommufd objects.
>> +#
>> +# @fd: file descriptor name previously passed via 'getfd' command,
>
>"previously passed via 'getfd' command", I wonder if this applies here or whether
>it is copy/paste of
>RemoteObjectProperties.fd doc?

Maybe copied😊 I thinks this applies here because I use monitor_fd_param to get fd.
Or I miss anything?

>
>> +#     which represents a pre-opened /dev/iommu.  This allows the
>> +#     iommufd object to be shared accross several subsystems
>> +#     (VFIO, VDPA, ...), and the file descriptor to be shared
>> +#     with other process, e.g. DPDK.  (default: QEMU opens
>> +#     /dev/iommu by itself)
>> +#
>> +# Since: 8.2
>> +##
>> +{ 'struct': 'IOMMUFDProperties',
>> +  'data': { '*fd': 'str' } }
>> +
>>  ##
>>  # @RngProperties:
>>  #
>> @@ -934,6 +951,7 @@
>>      'input-barrier',
>>      { 'name': 'input-linux',
>>        'if': 'CONFIG_LINUX' },
>> +    'iommufd',
>>      'iothread',
>>      'main-loop',
>>      { 'name': 'memory-backend-epc',
>> @@ -1003,6 +1021,7 @@
>>        'input-barrier':              'InputBarrierProperties',
>>        'input-linux':                { 'type': 'InputLinuxProperties',
>>                                        'if': 'CONFIG_LINUX' },
>> +      'iommufd':                    'IOMMUFDProperties',
>>        'iothread':                   'IothreadProperties',
>>        'main-loop':                  'MainLoopProperties',
>>        'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>> new file mode 100644
>> index 0000000000..9b3a86f57d
>> --- /dev/null
>> +++ b/include/sysemu/iommufd.h
>> @@ -0,0 +1,44 @@
>> +#ifndef SYSEMU_IOMMUFD_H
>> +#define SYSEMU_IOMMUFD_H
>> +
>> +#include "qom/object.h"
>> +#include "qemu/thread.h"
>> +#include "exec/hwaddr.h"
>> +#include "exec/cpu-common.h"
>> +
>> +#define TYPE_IOMMUFD_BACKEND "iommufd"
>> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
>> +                    IOMMUFD_BACKEND)
>> +#define IOMMUFD_BACKEND(obj) \
>> +    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
>> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \
>> +    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj),
>TYPE_IOMMUFD_BACKEND)
>> +#define IOMMUFD_BACKEND_CLASS(klass) \
>> +    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass),
>TYPE_IOMMUFD_BACKEND)
>> +struct IOMMUFDBackendClass {
>> +    ObjectClass parent_class;
>> +};
>> +
>> +struct IOMMUFDBackend {
>> +    Object parent;
>> +
>> +    /*< protected >*/
>> +    int fd;            /* /dev/iommu file descriptor */
>> +    bool owned;        /* is the /dev/iommu opened internally */
>> +    QemuMutex lock;
>> +    uint32_t users;
>> +
>> +    /*< public >*/
>> +};
>> +
>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
>> +void iommufd_backend_disconnect(IOMMUFDBackend *be);
>> +
>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>> +                               Error **errp);
>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>hwaddr iova,
>> +                            ram_addr_t size, void *vaddr, bool readonly);
>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> +                              hwaddr iova, ram_addr_t size);
>> +#endif
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> new file mode 100644
>> index 0000000000..ea3e2a8f85
>> --- /dev/null
>> +++ b/backends/iommufd.c
>> @@ -0,0 +1,228 @@
>> +/*
>> + * iommufd container backend
>> + *
>> + * Copyright (C) 2023 Intel Corporation.
>> + * Copyright Red Hat, Inc. 2023
>> + *
>> + * Authors: Yi Liu <yi.l.liu@intel.com>
>> + *          Eric Auger <eric.auger@redhat.com>
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "sysemu/iommufd.h"
>> +#include "qapi/error.h"
>> +#include "qapi/qmp/qerror.h"
>> +#include "qemu/module.h"
>> +#include "qom/object_interfaces.h"
>> +#include "qemu/error-report.h"
>> +#include "monitor/monitor.h"
>> +#include "trace.h"
>> +#include <sys/ioctl.h>
>> +#include <linux/iommufd.h>
>> +
>> +static void iommufd_backend_init(Object *obj)
>> +{
>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>> +
>> +    be->fd = -1;
>> +    be->users = 0;
>> +    be->owned = true;
>> +    qemu_mutex_init(&be->lock);
>> +}
>> +
>> +static void iommufd_backend_finalize(Object *obj)
>> +{
>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>> +
>> +    if (be->owned) {
>> +        close(be->fd);
>> +        be->fd = -1;
>> +    }
>> +}
>> +
>> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
>> +{
>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>> +    int fd = -1;
>> +
>> +    fd = monitor_fd_param(monitor_cur(), str, errp);
>> +    if (fd == -1) {
>> +        error_prepend(errp, "Could not parse remote object fd %s:", str);
>> +        return;
>> +    }
>> +    qemu_mutex_lock(&be->lock);
>> +    be->fd = fd;
>> +    be->owned = false;
>> +    qemu_mutex_unlock(&be->lock);
>> +    trace_iommu_backend_set_fd(be->fd);
>> +}
>> +
>> +static void iommufd_backend_class_init(ObjectClass *oc, void *data)
>> +{
>> +    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
>> +}
>> +
>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
>> +{
>> +    int fd, ret = 0;
>> +
>> +    qemu_mutex_lock(&be->lock);
>> +    if (be->users == UINT32_MAX) {
>> +        error_setg(errp, "too many connections");
>> +        ret = -E2BIG;
>> +        goto out;
>> +    }
>> +    if (be->owned && !be->users) {
>> +        fd = qemu_open_old("/dev/iommu", O_RDWR);
>> +        if (fd < 0) {
>> +            error_setg_errno(errp, errno, "/dev/iommu opening failed");
>> +            ret = fd;
>> +            goto out;
>> +        }
>> +        be->fd = fd;
>> +    }
>> +    be->users++;
>> +out:
>> +    trace_iommufd_backend_connect(be->fd, be->owned,
>> +                                  be->users, ret);
>> +    qemu_mutex_unlock(&be->lock);
>> +    return ret;
>> +}
>> +
>> +void iommufd_backend_disconnect(IOMMUFDBackend *be)
>> +{
>> +    qemu_mutex_lock(&be->lock);
>> +    if (!be->users) {
>> +        goto out;
>> +    }
>> +    be->users--;
>> +    if (!be->users && be->owned) {
>> +        close(be->fd);
>> +        be->fd = -1;
>> +    }
>> +out:
>> +    trace_iommufd_backend_disconnect(be->fd, be->users);
>> +    qemu_mutex_unlock(&be->lock);
>> +}
>> +
>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>> +                               Error **errp)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_alloc alloc_data  = {
>> +        .size = sizeof(alloc_data),
>> +        .flags = 0,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
>> +    if (ret) {
>> +        error_setg_errno(errp, errno, "Failed to allocate ioas");
>> +        return ret;
>> +    }
>> +
>> +    *ioas_id = alloc_data.out_ioas_id;
>> +    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_destroy des = {
>> +        .size = sizeof(des),
>> +        .id = id,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_DESTROY, &des);
>> +    trace_iommufd_backend_free_id(fd, id, ret);
>> +    if (ret) {
>> +        error_report("Failed to free id: %u %m", id);
>> +    }
>> +}
>> +
>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>hwaddr iova,
>> +                            ram_addr_t size, void *vaddr, bool readonly)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_map map = {
>> +        .size = sizeof(map),
>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>> +        .ioas_id = ioas_id,
>> +        .__reserved = 0,
>> +        .user_va = (uintptr_t)vaddr,
>> +        .iova = iova,
>> +        .length = size,
>> +    };
>> +
>> +    if (!readonly) {
>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>> +    }
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>> +                                  vaddr, readonly, ret);
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>> +    }
>> +    return ret;
>> +}
>> +
>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> +                              hwaddr iova, ram_addr_t size)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_unmap unmap = {
>> +        .size = sizeof(unmap),
>> +        .ioas_id = ioas_id,
>> +        .iova = iova,
>> +        .length = size,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
>> +    /*
>> +     * IOMMUFD takes mapping as some kind of object, unmapping
>> +     * nonexistent mapping is treated as deleting a nonexistent
>> +     * object and return ENOENT. This is different from legacy
>> +     * backend which allows it. vIOMMU may trigger a lot of
>> +     * redundant unmapping, to avoid flush the log, treat them
>> +     * as succeess for IOMMUFD just like legacy backend.
>> +     */
>> +    if (ret && errno == ENOENT) {
>> +        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size,
>ret);
>> +        ret = 0;
>> +    } else {
>> +        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
>> +    }
>> +
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_IOAS_UNMAP failed: %m");
>> +    }
>> +    return ret;
>> +}
>> +
>> +static const TypeInfo iommufd_backend_info = {
>> +    .name = TYPE_IOMMUFD_BACKEND,
>> +    .parent = TYPE_OBJECT,
>> +    .instance_size = sizeof(IOMMUFDBackend),
>> +    .instance_init = iommufd_backend_init,
>> +    .instance_finalize = iommufd_backend_finalize,
>> +    .class_size = sizeof(IOMMUFDBackendClass),
>> +    .class_init = iommufd_backend_class_init,
>> +    .interfaces = (InterfaceInfo[]) {
>> +        { TYPE_USER_CREATABLE },
>> +        { }
>> +    }
>> +};
>> +
>> +static void register_types(void)
>> +{
>> +    type_register_static(&iommufd_backend_info);
>> +}
>> +
>> +type_init(register_types);
>> diff --git a/backends/Kconfig b/backends/Kconfig
>> index f35abc1609..2cb23f62fa 100644
>> --- a/backends/Kconfig
>> +++ b/backends/Kconfig
>> @@ -1 +1,5 @@
>>  source tpm/Kconfig
>> +
>> +config IOMMUFD
>> +    bool
>> +    depends on VFIO
>I don't know the state of vDPA/iommufd integration but this extra might
>be added in short term.

Thanks for reminder. But I think it make more sense that series relax the check
itself?
E.g. depends on VFIO || VHOST_VDPA

Thanks
Zhenzhong
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Eric Auger 1 year ago
Hi Zhenzhong,

On 11/16/23 05:04, Duan, Zhenzhong wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: Wednesday, November 15, 2023 8:53 PM
>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>
>> Hi Zhenzhong,
>>
>> On 11/14/23 11:09, Zhenzhong Duan wrote:
>>> From: Eric Auger <eric.auger@redhat.com>
>>>
>>> Introduce an iommufd object which allows the interaction
>>> with the host /dev/iommu device.
>>>
>>> The /dev/iommu can have been already pre-opened outside of qemu,
>>> in which case the fd can be passed directly along with the
>>> iommufd object:
>>>
>>> This allows the iommufd object to be shared accross several
>>> subsystems (VFIO, VDPA, ...). For example, libvirt would open
>>> the /dev/iommu once.
>>>
>>> If no fd is passed along with the iommufd object, the /dev/iommu
>>> is opened by the qemu code.
>>>
>>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>> ---
>>> v6: remove redundant call, alloc_hwpt, get/put_ioas
>>>
>>>  MAINTAINERS              |   7 ++
>>>  qapi/qom.json            |  19 ++++
>>>  include/sysemu/iommufd.h |  44 ++++++++
>>>  backends/iommufd.c       | 228 +++++++++++++++++++++++++++++++++++++++
>>>  backends/Kconfig         |   4 +
>>>  backends/meson.build     |   1 +
>>>  backends/trace-events    |  10 ++
>>>  qemu-options.hx          |  12 +++
>>>  8 files changed, 325 insertions(+)
>>>  create mode 100644 include/sysemu/iommufd.h
>>>  create mode 100644 backends/iommufd.c
>>>
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index ff1238bb98..a4891f7bda 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>>>  F: docs/system/s390x/vfio-ap.rst
>>>  L: qemu-s390x@nongnu.org
>>>
>>> +iommufd
>>> +M: Yi Liu <yi.l.liu@intel.com>
>>> +M: Eric Auger <eric.auger@redhat.com>
>> Zhenzhong, don't you want to be added here?
>>> +S: Supported
>>> +F: backends/iommufd.c
>>> +F: include/sysemu/iommufd.h
>>> +
>>>  vhost
>>>  M: Michael S. Tsirkin <mst@redhat.com>
>>>  S: Supported
>>> diff --git a/qapi/qom.json b/qapi/qom.json
>>> index c53ef978ff..1fd8555a75 100644
>>> --- a/qapi/qom.json
>>> +++ b/qapi/qom.json
>>> @@ -794,6 +794,23 @@
>>>  { 'struct': 'VfioUserServerProperties',
>>>    'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>>>
>>> +##
>>> +# @IOMMUFDProperties:
>>> +#
>>> +# Properties for iommufd objects.
>>> +#
>>> +# @fd: file descriptor name previously passed via 'getfd' command,
>> "previously passed via 'getfd' command", I wonder if this applies here or whether
>> it is copy/paste of
>> RemoteObjectProperties.fd doc?
> Maybe copied😊 I thinks this applies here because I use monitor_fd_param to get fd.
> Or I miss anything?
This is a bit cryptic to me and I don't really understand what it means.
It does not mean it is not correct but I am curious about explanations
if anybody has some ;-)
>
>>> +#     which represents a pre-opened /dev/iommu.  This allows the
>>> +#     iommufd object to be shared accross several subsystems
>>> +#     (VFIO, VDPA, ...), and the file descriptor to be shared
>>> +#     with other process, e.g. DPDK.  (default: QEMU opens
>>> +#     /dev/iommu by itself)
>>> +#
>>> +# Since: 8.2
>>> +##
>>> +{ 'struct': 'IOMMUFDProperties',
>>> +  'data': { '*fd': 'str' } }
>>> +
>>>  ##
>>>  # @RngProperties:
>>>  #
>>> @@ -934,6 +951,7 @@
>>>      'input-barrier',
>>>      { 'name': 'input-linux',
>>>        'if': 'CONFIG_LINUX' },
>>> +    'iommufd',
>>>      'iothread',
>>>      'main-loop',
>>>      { 'name': 'memory-backend-epc',
>>> @@ -1003,6 +1021,7 @@
>>>        'input-barrier':              'InputBarrierProperties',
>>>        'input-linux':                { 'type': 'InputLinuxProperties',
>>>                                        'if': 'CONFIG_LINUX' },
>>> +      'iommufd':                    'IOMMUFDProperties',
>>>        'iothread':                   'IothreadProperties',
>>>        'main-loop':                  'MainLoopProperties',
>>>        'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
>>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>>> new file mode 100644
>>> index 0000000000..9b3a86f57d
>>> --- /dev/null
>>> +++ b/include/sysemu/iommufd.h
>>> @@ -0,0 +1,44 @@
>>> +#ifndef SYSEMU_IOMMUFD_H
>>> +#define SYSEMU_IOMMUFD_H
>>> +
>>> +#include "qom/object.h"
>>> +#include "qemu/thread.h"
>>> +#include "exec/hwaddr.h"
>>> +#include "exec/cpu-common.h"
>>> +
>>> +#define TYPE_IOMMUFD_BACKEND "iommufd"
>>> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
>>> +                    IOMMUFD_BACKEND)
>>> +#define IOMMUFD_BACKEND(obj) \
>>> +    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
>>> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \
>>> +    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj),
>> TYPE_IOMMUFD_BACKEND)
>>> +#define IOMMUFD_BACKEND_CLASS(klass) \
>>> +    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass),
>> TYPE_IOMMUFD_BACKEND)
>>> +struct IOMMUFDBackendClass {
>>> +    ObjectClass parent_class;
>>> +};
>>> +
>>> +struct IOMMUFDBackend {
>>> +    Object parent;
>>> +
>>> +    /*< protected >*/
>>> +    int fd;            /* /dev/iommu file descriptor */
>>> +    bool owned;        /* is the /dev/iommu opened internally */
>>> +    QemuMutex lock;
>>> +    uint32_t users;
>>> +
>>> +    /*< public >*/
>>> +};
>>> +
>>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
>>> +void iommufd_backend_disconnect(IOMMUFDBackend *be);
>>> +
>>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>>> +                               Error **errp);
>>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> hwaddr iova,
>>> +                            ram_addr_t size, void *vaddr, bool readonly);
>>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>> +                              hwaddr iova, ram_addr_t size);
>>> +#endif
>>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>>> new file mode 100644
>>> index 0000000000..ea3e2a8f85
>>> --- /dev/null
>>> +++ b/backends/iommufd.c
>>> @@ -0,0 +1,228 @@
>>> +/*
>>> + * iommufd container backend
>>> + *
>>> + * Copyright (C) 2023 Intel Corporation.
>>> + * Copyright Red Hat, Inc. 2023
>>> + *
>>> + * Authors: Yi Liu <yi.l.liu@intel.com>
>>> + *          Eric Auger <eric.auger@redhat.com>
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "sysemu/iommufd.h"
>>> +#include "qapi/error.h"
>>> +#include "qapi/qmp/qerror.h"
>>> +#include "qemu/module.h"
>>> +#include "qom/object_interfaces.h"
>>> +#include "qemu/error-report.h"
>>> +#include "monitor/monitor.h"
>>> +#include "trace.h"
>>> +#include <sys/ioctl.h>
>>> +#include <linux/iommufd.h>
>>> +
>>> +static void iommufd_backend_init(Object *obj)
>>> +{
>>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>>> +
>>> +    be->fd = -1;
>>> +    be->users = 0;
>>> +    be->owned = true;
>>> +    qemu_mutex_init(&be->lock);
>>> +}
>>> +
>>> +static void iommufd_backend_finalize(Object *obj)
>>> +{
>>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>>> +
>>> +    if (be->owned) {
>>> +        close(be->fd);
>>> +        be->fd = -1;
>>> +    }
>>> +}
>>> +
>>> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
>>> +{
>>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>>> +    int fd = -1;
>>> +
>>> +    fd = monitor_fd_param(monitor_cur(), str, errp);
>>> +    if (fd == -1) {
>>> +        error_prepend(errp, "Could not parse remote object fd %s:", str);
>>> +        return;
>>> +    }
>>> +    qemu_mutex_lock(&be->lock);
>>> +    be->fd = fd;
>>> +    be->owned = false;
>>> +    qemu_mutex_unlock(&be->lock);
>>> +    trace_iommu_backend_set_fd(be->fd);
>>> +}
>>> +
>>> +static void iommufd_backend_class_init(ObjectClass *oc, void *data)
>>> +{
>>> +    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
>>> +}
>>> +
>>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
>>> +{
>>> +    int fd, ret = 0;
>>> +
>>> +    qemu_mutex_lock(&be->lock);
>>> +    if (be->users == UINT32_MAX) {
>>> +        error_setg(errp, "too many connections");
>>> +        ret = -E2BIG;
>>> +        goto out;
>>> +    }
>>> +    if (be->owned && !be->users) {
>>> +        fd = qemu_open_old("/dev/iommu", O_RDWR);
>>> +        if (fd < 0) {
>>> +            error_setg_errno(errp, errno, "/dev/iommu opening failed");
>>> +            ret = fd;
>>> +            goto out;
>>> +        }
>>> +        be->fd = fd;
>>> +    }
>>> +    be->users++;
>>> +out:
>>> +    trace_iommufd_backend_connect(be->fd, be->owned,
>>> +                                  be->users, ret);
>>> +    qemu_mutex_unlock(&be->lock);
>>> +    return ret;
>>> +}
>>> +
>>> +void iommufd_backend_disconnect(IOMMUFDBackend *be)
>>> +{
>>> +    qemu_mutex_lock(&be->lock);
>>> +    if (!be->users) {
>>> +        goto out;
>>> +    }
>>> +    be->users--;
>>> +    if (!be->users && be->owned) {
>>> +        close(be->fd);
>>> +        be->fd = -1;
>>> +    }
>>> +out:
>>> +    trace_iommufd_backend_disconnect(be->fd, be->users);
>>> +    qemu_mutex_unlock(&be->lock);
>>> +}
>>> +
>>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>>> +                               Error **errp)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_alloc alloc_data  = {
>>> +        .size = sizeof(alloc_data),
>>> +        .flags = 0,
>>> +    };
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
>>> +    if (ret) {
>>> +        error_setg_errno(errp, errno, "Failed to allocate ioas");
>>> +        return ret;
>>> +    }
>>> +
>>> +    *ioas_id = alloc_data.out_ioas_id;
>>> +    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_destroy des = {
>>> +        .size = sizeof(des),
>>> +        .id = id,
>>> +    };
>>> +
>>> +    ret = ioctl(fd, IOMMU_DESTROY, &des);
>>> +    trace_iommufd_backend_free_id(fd, id, ret);
>>> +    if (ret) {
>>> +        error_report("Failed to free id: %u %m", id);
>>> +    }
>>> +}
>>> +
>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> hwaddr iova,
>>> +                            ram_addr_t size, void *vaddr, bool readonly)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_map map = {
>>> +        .size = sizeof(map),
>>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>>> +        .ioas_id = ioas_id,
>>> +        .__reserved = 0,
>>> +        .user_va = (uintptr_t)vaddr,
>>> +        .iova = iova,
>>> +        .length = size,
>>> +    };
>>> +
>>> +    if (!readonly) {
>>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>>> +    }
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>>> +                                  vaddr, readonly, ret);
>>> +    if (ret) {
>>> +        ret = -errno;
>>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>> +                              hwaddr iova, ram_addr_t size)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_unmap unmap = {
>>> +        .size = sizeof(unmap),
>>> +        .ioas_id = ioas_id,
>>> +        .iova = iova,
>>> +        .length = size,
>>> +    };
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
>>> +    /*
>>> +     * IOMMUFD takes mapping as some kind of object, unmapping
>>> +     * nonexistent mapping is treated as deleting a nonexistent
>>> +     * object and return ENOENT. This is different from legacy
>>> +     * backend which allows it. vIOMMU may trigger a lot of
>>> +     * redundant unmapping, to avoid flush the log, treat them
>>> +     * as succeess for IOMMUFD just like legacy backend.
>>> +     */
>>> +    if (ret && errno == ENOENT) {
>>> +        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size,
>> ret);
>>> +        ret = 0;
>>> +    } else {
>>> +        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
>>> +    }
>>> +
>>> +    if (ret) {
>>> +        ret = -errno;
>>> +        error_report("IOMMU_IOAS_UNMAP failed: %m");
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const TypeInfo iommufd_backend_info = {
>>> +    .name = TYPE_IOMMUFD_BACKEND,
>>> +    .parent = TYPE_OBJECT,
>>> +    .instance_size = sizeof(IOMMUFDBackend),
>>> +    .instance_init = iommufd_backend_init,
>>> +    .instance_finalize = iommufd_backend_finalize,
>>> +    .class_size = sizeof(IOMMUFDBackendClass),
>>> +    .class_init = iommufd_backend_class_init,
>>> +    .interfaces = (InterfaceInfo[]) {
>>> +        { TYPE_USER_CREATABLE },
>>> +        { }
>>> +    }
>>> +};
>>> +
>>> +static void register_types(void)
>>> +{
>>> +    type_register_static(&iommufd_backend_info);
>>> +}
>>> +
>>> +type_init(register_types);
>>> diff --git a/backends/Kconfig b/backends/Kconfig
>>> index f35abc1609..2cb23f62fa 100644
>>> --- a/backends/Kconfig
>>> +++ b/backends/Kconfig
>>> @@ -1 +1,5 @@
>>>  source tpm/Kconfig
>>> +
>>> +config IOMMUFD
>>> +    bool
>>> +    depends on VFIO
>> I don't know the state of vDPA/iommufd integration but this extra might
>> be added in short term.
> Thanks for reminder. But I think it make more sense that series relax the check
> itself?
> E.g. depends on VFIO || VHOST_VDPA
yeah we can leave it as it is for now

Eric
>
> Thanks
> Zhenzhong


RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago

>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Sent: Thursday, November 16, 2023 4:33 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>Hi Zhenzhong,
>
>On 11/16/23 05:04, Duan, Zhenzhong wrote:
>> Hi Eric,
>>
>>> -----Original Message-----
>>> From: Eric Auger <eric.auger@redhat.com>
>>> Sent: Wednesday, November 15, 2023 8:53 PM
>>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd
>object
>>>
>>> Hi Zhenzhong,
>>>
>>> On 11/14/23 11:09, Zhenzhong Duan wrote:
>>>> From: Eric Auger <eric.auger@redhat.com>
>>>>
>>>> Introduce an iommufd object which allows the interaction
>>>> with the host /dev/iommu device.
>>>>
>>>> The /dev/iommu can have been already pre-opened outside of qemu,
>>>> in which case the fd can be passed directly along with the
>>>> iommufd object:
>>>>
>>>> This allows the iommufd object to be shared accross several
>>>> subsystems (VFIO, VDPA, ...). For example, libvirt would open
>>>> the /dev/iommu once.
>>>>
>>>> If no fd is passed along with the iommufd object, the /dev/iommu
>>>> is opened by the qemu code.
>>>>
>>>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>>> ---
>>>> v6: remove redundant call, alloc_hwpt, get/put_ioas
>>>>
>>>>  MAINTAINERS              |   7 ++
>>>>  qapi/qom.json            |  19 ++++
>>>>  include/sysemu/iommufd.h |  44 ++++++++
>>>>  backends/iommufd.c       | 228
>+++++++++++++++++++++++++++++++++++++++
>>>>  backends/Kconfig         |   4 +
>>>>  backends/meson.build     |   1 +
>>>>  backends/trace-events    |  10 ++
>>>>  qemu-options.hx          |  12 +++
>>>>  8 files changed, 325 insertions(+)
>>>>  create mode 100644 include/sysemu/iommufd.h
>>>>  create mode 100644 backends/iommufd.c
>>>>
>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>> index ff1238bb98..a4891f7bda 100644
>>>> --- a/MAINTAINERS
>>>> +++ b/MAINTAINERS
>>>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>>>>  F: docs/system/s390x/vfio-ap.rst
>>>>  L: qemu-s390x@nongnu.org
>>>>
>>>> +iommufd
>>>> +M: Yi Liu <yi.l.liu@intel.com>
>>>> +M: Eric Auger <eric.auger@redhat.com>
>>> Zhenzhong, don't you want to be added here?

Sorry, missed this comment.
My pleasure, I'll add myself in v7.

>>>> +S: Supported
>>>> +F: backends/iommufd.c
>>>> +F: include/sysemu/iommufd.h
>>>> +
>>>>  vhost
>>>>  M: Michael S. Tsirkin <mst@redhat.com>
>>>>  S: Supported
>>>> diff --git a/qapi/qom.json b/qapi/qom.json
>>>> index c53ef978ff..1fd8555a75 100644
>>>> --- a/qapi/qom.json
>>>> +++ b/qapi/qom.json
>>>> @@ -794,6 +794,23 @@
>>>>  { 'struct': 'VfioUserServerProperties',
>>>>    'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>>>>
>>>> +##
>>>> +# @IOMMUFDProperties:
>>>> +#
>>>> +# Properties for iommufd objects.
>>>> +#
>>>> +# @fd: file descriptor name previously passed via 'getfd' command,
>>> "previously passed via 'getfd' command", I wonder if this applies here or
>whether
>>> it is copy/paste of
>>> RemoteObjectProperties.fd doc?
>> Maybe copied😊 I thinks this applies here because I use monitor_fd_param to
>get fd.
>> Or I miss anything?
>This is a bit cryptic to me and I don't really understand what it means.
>It does not mean it is not correct but I am curious about explanations
>if anybody has some ;-)

I have a weak understanding on this, may have errors😊
QMP support a command named 'getfd' to send a pre-opened fd with a name,
This fd is then stored in a fd list. Then we can use that name to reference the fd
In ths list.

Thanks
Zhenzhong
Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
On 11/14/23 11:09, Zhenzhong Duan wrote:
> From: Eric Auger <eric.auger@redhat.com>
> 
> Introduce an iommufd object which allows the interaction
> with the host /dev/iommu device.
> 
> The /dev/iommu can have been already pre-opened outside of qemu,
> in which case the fd can be passed directly along with the
> iommufd object:
> 
> This allows the iommufd object to be shared accross several
> subsystems (VFIO, VDPA, ...). For example, libvirt would open
> the /dev/iommu once.
> 
> If no fd is passed along with the iommufd object, the /dev/iommu
> is opened by the qemu code.
> 
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>

I simplified the object declaration in include/sysemu/iommufd.h and
formatted /dev/iommu in qemu-options.hx. No need to resend.

Reviewed-by: Cédric Le Goater <clg@redhat.com>

Thanks,

C.


> ---
> v6: remove redundant call, alloc_hwpt, get/put_ioas
> 
>   MAINTAINERS              |   7 ++
>   qapi/qom.json            |  19 ++++
>   include/sysemu/iommufd.h |  44 ++++++++
>   backends/iommufd.c       | 228 +++++++++++++++++++++++++++++++++++++++
>   backends/Kconfig         |   4 +
>   backends/meson.build     |   1 +
>   backends/trace-events    |  10 ++
>   qemu-options.hx          |  12 +++
>   8 files changed, 325 insertions(+)
>   create mode 100644 include/sysemu/iommufd.h
>   create mode 100644 backends/iommufd.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ff1238bb98..a4891f7bda 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>   F: docs/system/s390x/vfio-ap.rst
>   L: qemu-s390x@nongnu.org
>   
> +iommufd
> +M: Yi Liu <yi.l.liu@intel.com>
> +M: Eric Auger <eric.auger@redhat.com>
> +S: Supported
> +F: backends/iommufd.c
> +F: include/sysemu/iommufd.h
> +
>   vhost
>   M: Michael S. Tsirkin <mst@redhat.com>
>   S: Supported
> diff --git a/qapi/qom.json b/qapi/qom.json
> index c53ef978ff..1fd8555a75 100644
> --- a/qapi/qom.json
> +++ b/qapi/qom.json
> @@ -794,6 +794,23 @@
>   { 'struct': 'VfioUserServerProperties',
>     'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>   
> +##
> +# @IOMMUFDProperties:
> +#
> +# Properties for iommufd objects.
> +#
> +# @fd: file descriptor name previously passed via 'getfd' command,
> +#     which represents a pre-opened /dev/iommu.  This allows the
> +#     iommufd object to be shared accross several subsystems
> +#     (VFIO, VDPA, ...), and the file descriptor to be shared
> +#     with other process, e.g. DPDK.  (default: QEMU opens
> +#     /dev/iommu by itself)
> +#
> +# Since: 8.2
> +##
> +{ 'struct': 'IOMMUFDProperties',
> +  'data': { '*fd': 'str' } }
> +
>   ##
>   # @RngProperties:
>   #
> @@ -934,6 +951,7 @@
>       'input-barrier',
>       { 'name': 'input-linux',
>         'if': 'CONFIG_LINUX' },
> +    'iommufd',
>       'iothread',
>       'main-loop',
>       { 'name': 'memory-backend-epc',
> @@ -1003,6 +1021,7 @@
>         'input-barrier':              'InputBarrierProperties',
>         'input-linux':                { 'type': 'InputLinuxProperties',
>                                         'if': 'CONFIG_LINUX' },
> +      'iommufd':                    'IOMMUFDProperties',
>         'iothread':                   'IothreadProperties',
>         'main-loop':                  'MainLoopProperties',
>         'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
> new file mode 100644
> index 0000000000..9b3a86f57d
> --- /dev/null
> +++ b/include/sysemu/iommufd.h
> @@ -0,0 +1,44 @@
> +#ifndef SYSEMU_IOMMUFD_H
> +#define SYSEMU_IOMMUFD_H
> +
> +#include "qom/object.h"
> +#include "qemu/thread.h"
> +#include "exec/hwaddr.h"
> +#include "exec/cpu-common.h"
> +
> +#define TYPE_IOMMUFD_BACKEND "iommufd"
> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
> +                    IOMMUFD_BACKEND)
> +#define IOMMUFD_BACKEND(obj) \
> +    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \
> +    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), TYPE_IOMMUFD_BACKEND)
> +#define IOMMUFD_BACKEND_CLASS(klass) \
> +    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), TYPE_IOMMUFD_BACKEND)
> +struct IOMMUFDBackendClass {
> +    ObjectClass parent_class;
> +};
> +
> +struct IOMMUFDBackend {
> +    Object parent;
> +
> +    /*< protected >*/
> +    int fd;            /* /dev/iommu file descriptor */
> +    bool owned;        /* is the /dev/iommu opened internally */
> +    QemuMutex lock;
> +    uint32_t users;
> +
> +    /*< public >*/
> +};
> +
> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
> +void iommufd_backend_disconnect(IOMMUFDBackend *be);
> +
> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
> +                               Error **errp);
> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
> +                            ram_addr_t size, void *vaddr, bool readonly);
> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
> +                              hwaddr iova, ram_addr_t size);
> +#endif
> diff --git a/backends/iommufd.c b/backends/iommufd.c
> new file mode 100644
> index 0000000000..ea3e2a8f85
> --- /dev/null
> +++ b/backends/iommufd.c
> @@ -0,0 +1,228 @@
> +/*
> + * iommufd container backend
> + *
> + * Copyright (C) 2023 Intel Corporation.
> + * Copyright Red Hat, Inc. 2023
> + *
> + * Authors: Yi Liu <yi.l.liu@intel.com>
> + *          Eric Auger <eric.auger@redhat.com>
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "sysemu/iommufd.h"
> +#include "qapi/error.h"
> +#include "qapi/qmp/qerror.h"
> +#include "qemu/module.h"
> +#include "qom/object_interfaces.h"
> +#include "qemu/error-report.h"
> +#include "monitor/monitor.h"
> +#include "trace.h"
> +#include <sys/ioctl.h>
> +#include <linux/iommufd.h>
> +
> +static void iommufd_backend_init(Object *obj)
> +{
> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
> +
> +    be->fd = -1;
> +    be->users = 0;
> +    be->owned = true;
> +    qemu_mutex_init(&be->lock);
> +}
> +
> +static void iommufd_backend_finalize(Object *obj)
> +{
> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
> +
> +    if (be->owned) {
> +        close(be->fd);
> +        be->fd = -1;
> +    }
> +}
> +
> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
> +{
> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
> +    int fd = -1;
> +
> +    fd = monitor_fd_param(monitor_cur(), str, errp);
> +    if (fd == -1) {
> +        error_prepend(errp, "Could not parse remote object fd %s:", str);
> +        return;
> +    }
> +    qemu_mutex_lock(&be->lock);
> +    be->fd = fd;
> +    be->owned = false;
> +    qemu_mutex_unlock(&be->lock);
> +    trace_iommu_backend_set_fd(be->fd);
> +}
> +
> +static void iommufd_backend_class_init(ObjectClass *oc, void *data)
> +{
> +    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
> +}
> +
> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
> +{
> +    int fd, ret = 0;
> +
> +    qemu_mutex_lock(&be->lock);
> +    if (be->users == UINT32_MAX) {
> +        error_setg(errp, "too many connections");
> +        ret = -E2BIG;
> +        goto out;
> +    }
> +    if (be->owned && !be->users) {
> +        fd = qemu_open_old("/dev/iommu", O_RDWR);
> +        if (fd < 0) {
> +            error_setg_errno(errp, errno, "/dev/iommu opening failed");
> +            ret = fd;
> +            goto out;
> +        }
> +        be->fd = fd;
> +    }
> +    be->users++;
> +out:
> +    trace_iommufd_backend_connect(be->fd, be->owned,
> +                                  be->users, ret);
> +    qemu_mutex_unlock(&be->lock);
> +    return ret;
> +}
> +
> +void iommufd_backend_disconnect(IOMMUFDBackend *be)
> +{
> +    qemu_mutex_lock(&be->lock);
> +    if (!be->users) {
> +        goto out;
> +    }
> +    be->users--;
> +    if (!be->users && be->owned) {
> +        close(be->fd);
> +        be->fd = -1;
> +    }
> +out:
> +    trace_iommufd_backend_disconnect(be->fd, be->users);
> +    qemu_mutex_unlock(&be->lock);
> +}
> +
> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
> +                               Error **errp)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_alloc alloc_data  = {
> +        .size = sizeof(alloc_data),
> +        .flags = 0,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
> +    if (ret) {
> +        error_setg_errno(errp, errno, "Failed to allocate ioas");
> +        return ret;
> +    }
> +
> +    *ioas_id = alloc_data.out_ioas_id;
> +    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
> +
> +    return ret;
> +}
> +
> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_destroy des = {
> +        .size = sizeof(des),
> +        .id = id,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_DESTROY, &des);
> +    trace_iommufd_backend_free_id(fd, id, ret);
> +    if (ret) {
> +        error_report("Failed to free id: %u %m", id);
> +    }
> +}
> +
> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
> +                            ram_addr_t size, void *vaddr, bool readonly)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_map map = {
> +        .size = sizeof(map),
> +        .flags = IOMMU_IOAS_MAP_READABLE |
> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
> +        .ioas_id = ioas_id,
> +        .__reserved = 0,
> +        .user_va = (uintptr_t)vaddr,
> +        .iova = iova,
> +        .length = size,
> +    };
> +
> +    if (!readonly) {
> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
> +    }
> +
> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
> +                                  vaddr, readonly, ret);
> +    if (ret) {
> +        ret = -errno;
> +        error_report("IOMMU_IOAS_MAP failed: %m");
> +    }
> +    return ret;
> +}
> +
> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
> +                              hwaddr iova, ram_addr_t size)
> +{
> +    int ret, fd = be->fd;
> +    struct iommu_ioas_unmap unmap = {
> +        .size = sizeof(unmap),
> +        .ioas_id = ioas_id,
> +        .iova = iova,
> +        .length = size,
> +    };
> +
> +    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
> +    /*
> +     * IOMMUFD takes mapping as some kind of object, unmapping
> +     * nonexistent mapping is treated as deleting a nonexistent
> +     * object and return ENOENT. This is different from legacy
> +     * backend which allows it. vIOMMU may trigger a lot of
> +     * redundant unmapping, to avoid flush the log, treat them
> +     * as succeess for IOMMUFD just like legacy backend.
> +     */
> +    if (ret && errno == ENOENT) {
> +        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
> +        ret = 0;
> +    } else {
> +        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
> +    }
> +
> +    if (ret) {
> +        ret = -errno;
> +        error_report("IOMMU_IOAS_UNMAP failed: %m");
> +    }
> +    return ret;
> +}
> +
> +static const TypeInfo iommufd_backend_info = {
> +    .name = TYPE_IOMMUFD_BACKEND,
> +    .parent = TYPE_OBJECT,
> +    .instance_size = sizeof(IOMMUFDBackend),
> +    .instance_init = iommufd_backend_init,
> +    .instance_finalize = iommufd_backend_finalize,
> +    .class_size = sizeof(IOMMUFDBackendClass),
> +    .class_init = iommufd_backend_class_init,
> +    .interfaces = (InterfaceInfo[]) {
> +        { TYPE_USER_CREATABLE },
> +        { }
> +    }
> +};
> +
> +static void register_types(void)
> +{
> +    type_register_static(&iommufd_backend_info);
> +}
> +
> +type_init(register_types);
> diff --git a/backends/Kconfig b/backends/Kconfig
> index f35abc1609..2cb23f62fa 100644
> --- a/backends/Kconfig
> +++ b/backends/Kconfig
> @@ -1 +1,5 @@
>   source tpm/Kconfig
> +
> +config IOMMUFD
> +    bool
> +    depends on VFIO
> diff --git a/backends/meson.build b/backends/meson.build
> index 914c7c4afb..9a5cea480d 100644
> --- a/backends/meson.build
> +++ b/backends/meson.build
> @@ -20,6 +20,7 @@ if have_vhost_user
>     system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
>   endif
>   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
> +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
>   if have_vhost_user_crypto
>     system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
>   endif
> diff --git a/backends/trace-events b/backends/trace-events
> index 652eb76a57..d45c6e31a6 100644
> --- a/backends/trace-events
> +++ b/backends/trace-events
> @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
>   dbus_vmstate_post_load(int version_id) "version_id: %d"
>   dbus_vmstate_loading(const char *id) "id: %s"
>   dbus_vmstate_saving(const char *id) "id: %s"
> +
> +# iommufd.c
> +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
> +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
> +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
> +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
> +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
> +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
> +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
> +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 42fd09e4de..70507c0ee6 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -5224,6 +5224,18 @@ SRST
>   
>           The ``share`` boolean option is on by default with memfd.
>   
> +    ``-object iommufd,id=id[,fd=fd]``
> +        Creates an iommufd backend which allows control of DMA mapping
> +        through the /dev/iommu device.
> +
> +        The ``id`` parameter is a unique ID which frontends (such as
> +        vfio-pci of vdpa) will use to connect with the iommufd backend.
> +
> +        The ``fd`` parameter is an optional pre-opened file descriptor
> +        resulting from /dev/iommu opening. Usually the iommufd is shared
> +        across all subsystems, bringing the benefit of centralized
> +        reference counting.
> +
>       ``-object rng-builtin,id=id``
>           Creates a random number generator backend which obtains entropy
>           from QEMU builtin functions. The ``id`` parameter is a unique ID


RE: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Duan, Zhenzhong 1 year ago

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Tuesday, November 14, 2023 9:29 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>On 11/14/23 11:09, Zhenzhong Duan wrote:
>> From: Eric Auger <eric.auger@redhat.com>
>>
>> Introduce an iommufd object which allows the interaction
>> with the host /dev/iommu device.
>>
>> The /dev/iommu can have been already pre-opened outside of qemu,
>> in which case the fd can be passed directly along with the
>> iommufd object:
>>
>> This allows the iommufd object to be shared accross several
>> subsystems (VFIO, VDPA, ...). For example, libvirt would open
>> the /dev/iommu once.
>>
>> If no fd is passed along with the iommufd object, the /dev/iommu
>> is opened by the qemu code.
>>
>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>
>I simplified the object declaration in include/sysemu/iommufd.h and
>formatted /dev/iommu in qemu-options.hx. No need to resend.

Good catch, thanks! Maybe further simplified with below? This is minor.

OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)

Thanks
Zhenzhong

>
>Reviewed-by: Cédric Le Goater <clg@redhat.com>
>
>Thanks,
>
>C.
>
>
>> ---
>> v6: remove redundant call, alloc_hwpt, get/put_ioas
>>
>>   MAINTAINERS              |   7 ++
>>   qapi/qom.json            |  19 ++++
>>   include/sysemu/iommufd.h |  44 ++++++++
>>   backends/iommufd.c       | 228
>+++++++++++++++++++++++++++++++++++++++
>>   backends/Kconfig         |   4 +
>>   backends/meson.build     |   1 +
>>   backends/trace-events    |  10 ++
>>   qemu-options.hx          |  12 +++
>>   8 files changed, 325 insertions(+)
>>   create mode 100644 include/sysemu/iommufd.h
>>   create mode 100644 backends/iommufd.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index ff1238bb98..a4891f7bda 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>>   F: docs/system/s390x/vfio-ap.rst
>>   L: qemu-s390x@nongnu.org
>>
>> +iommufd
>> +M: Yi Liu <yi.l.liu@intel.com>
>> +M: Eric Auger <eric.auger@redhat.com>
>> +S: Supported
>> +F: backends/iommufd.c
>> +F: include/sysemu/iommufd.h
>> +
>>   vhost
>>   M: Michael S. Tsirkin <mst@redhat.com>
>>   S: Supported
>> diff --git a/qapi/qom.json b/qapi/qom.json
>> index c53ef978ff..1fd8555a75 100644
>> --- a/qapi/qom.json
>> +++ b/qapi/qom.json
>> @@ -794,6 +794,23 @@
>>   { 'struct': 'VfioUserServerProperties',
>>     'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>>
>> +##
>> +# @IOMMUFDProperties:
>> +#
>> +# Properties for iommufd objects.
>> +#
>> +# @fd: file descriptor name previously passed via 'getfd' command,
>> +#     which represents a pre-opened /dev/iommu.  This allows the
>> +#     iommufd object to be shared accross several subsystems
>> +#     (VFIO, VDPA, ...), and the file descriptor to be shared
>> +#     with other process, e.g. DPDK.  (default: QEMU opens
>> +#     /dev/iommu by itself)
>> +#
>> +# Since: 8.2
>> +##
>> +{ 'struct': 'IOMMUFDProperties',
>> +  'data': { '*fd': 'str' } }
>> +
>>   ##
>>   # @RngProperties:
>>   #
>> @@ -934,6 +951,7 @@
>>       'input-barrier',
>>       { 'name': 'input-linux',
>>         'if': 'CONFIG_LINUX' },
>> +    'iommufd',
>>       'iothread',
>>       'main-loop',
>>       { 'name': 'memory-backend-epc',
>> @@ -1003,6 +1021,7 @@
>>         'input-barrier':              'InputBarrierProperties',
>>         'input-linux':                { 'type': 'InputLinuxProperties',
>>                                         'if': 'CONFIG_LINUX' },
>> +      'iommufd':                    'IOMMUFDProperties',
>>         'iothread':                   'IothreadProperties',
>>         'main-loop':                  'MainLoopProperties',
>>         'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>> new file mode 100644
>> index 0000000000..9b3a86f57d
>> --- /dev/null
>> +++ b/include/sysemu/iommufd.h
>> @@ -0,0 +1,44 @@
>> +#ifndef SYSEMU_IOMMUFD_H
>> +#define SYSEMU_IOMMUFD_H
>> +
>> +#include "qom/object.h"
>> +#include "qemu/thread.h"
>> +#include "exec/hwaddr.h"
>> +#include "exec/cpu-common.h"
>> +
>> +#define TYPE_IOMMUFD_BACKEND "iommufd"
>> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
>> +                    IOMMUFD_BACKEND)
>> +#define IOMMUFD_BACKEND(obj) \
>> +    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
>> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \
>> +    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj),
>TYPE_IOMMUFD_BACKEND)
>> +#define IOMMUFD_BACKEND_CLASS(klass) \
>> +    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass),
>TYPE_IOMMUFD_BACKEND)
>> +struct IOMMUFDBackendClass {
>> +    ObjectClass parent_class;
>> +};
>> +
>> +struct IOMMUFDBackend {
>> +    Object parent;
>> +
>> +    /*< protected >*/
>> +    int fd;            /* /dev/iommu file descriptor */
>> +    bool owned;        /* is the /dev/iommu opened internally */
>> +    QemuMutex lock;
>> +    uint32_t users;
>> +
>> +    /*< public >*/
>> +};
>> +
>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
>> +void iommufd_backend_disconnect(IOMMUFDBackend *be);
>> +
>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>> +                               Error **errp);
>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>hwaddr iova,
>> +                            ram_addr_t size, void *vaddr, bool readonly);
>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> +                              hwaddr iova, ram_addr_t size);
>> +#endif
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> new file mode 100644
>> index 0000000000..ea3e2a8f85
>> --- /dev/null
>> +++ b/backends/iommufd.c
>> @@ -0,0 +1,228 @@
>> +/*
>> + * iommufd container backend
>> + *
>> + * Copyright (C) 2023 Intel Corporation.
>> + * Copyright Red Hat, Inc. 2023
>> + *
>> + * Authors: Yi Liu <yi.l.liu@intel.com>
>> + *          Eric Auger <eric.auger@redhat.com>
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "sysemu/iommufd.h"
>> +#include "qapi/error.h"
>> +#include "qapi/qmp/qerror.h"
>> +#include "qemu/module.h"
>> +#include "qom/object_interfaces.h"
>> +#include "qemu/error-report.h"
>> +#include "monitor/monitor.h"
>> +#include "trace.h"
>> +#include <sys/ioctl.h>
>> +#include <linux/iommufd.h>
>> +
>> +static void iommufd_backend_init(Object *obj)
>> +{
>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>> +
>> +    be->fd = -1;
>> +    be->users = 0;
>> +    be->owned = true;
>> +    qemu_mutex_init(&be->lock);
>> +}
>> +
>> +static void iommufd_backend_finalize(Object *obj)
>> +{
>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>> +
>> +    if (be->owned) {
>> +        close(be->fd);
>> +        be->fd = -1;
>> +    }
>> +}
>> +
>> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
>> +{
>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>> +    int fd = -1;
>> +
>> +    fd = monitor_fd_param(monitor_cur(), str, errp);
>> +    if (fd == -1) {
>> +        error_prepend(errp, "Could not parse remote object fd %s:", str);
>> +        return;
>> +    }
>> +    qemu_mutex_lock(&be->lock);
>> +    be->fd = fd;
>> +    be->owned = false;
>> +    qemu_mutex_unlock(&be->lock);
>> +    trace_iommu_backend_set_fd(be->fd);
>> +}
>> +
>> +static void iommufd_backend_class_init(ObjectClass *oc, void *data)
>> +{
>> +    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
>> +}
>> +
>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
>> +{
>> +    int fd, ret = 0;
>> +
>> +    qemu_mutex_lock(&be->lock);
>> +    if (be->users == UINT32_MAX) {
>> +        error_setg(errp, "too many connections");
>> +        ret = -E2BIG;
>> +        goto out;
>> +    }
>> +    if (be->owned && !be->users) {
>> +        fd = qemu_open_old("/dev/iommu", O_RDWR);
>> +        if (fd < 0) {
>> +            error_setg_errno(errp, errno, "/dev/iommu opening failed");
>> +            ret = fd;
>> +            goto out;
>> +        }
>> +        be->fd = fd;
>> +    }
>> +    be->users++;
>> +out:
>> +    trace_iommufd_backend_connect(be->fd, be->owned,
>> +                                  be->users, ret);
>> +    qemu_mutex_unlock(&be->lock);
>> +    return ret;
>> +}
>> +
>> +void iommufd_backend_disconnect(IOMMUFDBackend *be)
>> +{
>> +    qemu_mutex_lock(&be->lock);
>> +    if (!be->users) {
>> +        goto out;
>> +    }
>> +    be->users--;
>> +    if (!be->users && be->owned) {
>> +        close(be->fd);
>> +        be->fd = -1;
>> +    }
>> +out:
>> +    trace_iommufd_backend_disconnect(be->fd, be->users);
>> +    qemu_mutex_unlock(&be->lock);
>> +}
>> +
>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>> +                               Error **errp)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_alloc alloc_data  = {
>> +        .size = sizeof(alloc_data),
>> +        .flags = 0,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
>> +    if (ret) {
>> +        error_setg_errno(errp, errno, "Failed to allocate ioas");
>> +        return ret;
>> +    }
>> +
>> +    *ioas_id = alloc_data.out_ioas_id;
>> +    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
>> +
>> +    return ret;
>> +}
>> +
>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_destroy des = {
>> +        .size = sizeof(des),
>> +        .id = id,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_DESTROY, &des);
>> +    trace_iommufd_backend_free_id(fd, id, ret);
>> +    if (ret) {
>> +        error_report("Failed to free id: %u %m", id);
>> +    }
>> +}
>> +
>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>hwaddr iova,
>> +                            ram_addr_t size, void *vaddr, bool readonly)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_map map = {
>> +        .size = sizeof(map),
>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>> +        .ioas_id = ioas_id,
>> +        .__reserved = 0,
>> +        .user_va = (uintptr_t)vaddr,
>> +        .iova = iova,
>> +        .length = size,
>> +    };
>> +
>> +    if (!readonly) {
>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>> +    }
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>> +                                  vaddr, readonly, ret);
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>> +    }
>> +    return ret;
>> +}
>> +
>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> +                              hwaddr iova, ram_addr_t size)
>> +{
>> +    int ret, fd = be->fd;
>> +    struct iommu_ioas_unmap unmap = {
>> +        .size = sizeof(unmap),
>> +        .ioas_id = ioas_id,
>> +        .iova = iova,
>> +        .length = size,
>> +    };
>> +
>> +    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
>> +    /*
>> +     * IOMMUFD takes mapping as some kind of object, unmapping
>> +     * nonexistent mapping is treated as deleting a nonexistent
>> +     * object and return ENOENT. This is different from legacy
>> +     * backend which allows it. vIOMMU may trigger a lot of
>> +     * redundant unmapping, to avoid flush the log, treat them
>> +     * as succeess for IOMMUFD just like legacy backend.
>> +     */
>> +    if (ret && errno == ENOENT) {
>> +        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size,
>ret);
>> +        ret = 0;
>> +    } else {
>> +        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
>> +    }
>> +
>> +    if (ret) {
>> +        ret = -errno;
>> +        error_report("IOMMU_IOAS_UNMAP failed: %m");
>> +    }
>> +    return ret;
>> +}
>> +
>> +static const TypeInfo iommufd_backend_info = {
>> +    .name = TYPE_IOMMUFD_BACKEND,
>> +    .parent = TYPE_OBJECT,
>> +    .instance_size = sizeof(IOMMUFDBackend),
>> +    .instance_init = iommufd_backend_init,
>> +    .instance_finalize = iommufd_backend_finalize,
>> +    .class_size = sizeof(IOMMUFDBackendClass),
>> +    .class_init = iommufd_backend_class_init,
>> +    .interfaces = (InterfaceInfo[]) {
>> +        { TYPE_USER_CREATABLE },
>> +        { }
>> +    }
>> +};
>> +
>> +static void register_types(void)
>> +{
>> +    type_register_static(&iommufd_backend_info);
>> +}
>> +
>> +type_init(register_types);
>> diff --git a/backends/Kconfig b/backends/Kconfig
>> index f35abc1609..2cb23f62fa 100644
>> --- a/backends/Kconfig
>> +++ b/backends/Kconfig
>> @@ -1 +1,5 @@
>>   source tpm/Kconfig
>> +
>> +config IOMMUFD
>> +    bool
>> +    depends on VFIO
>> diff --git a/backends/meson.build b/backends/meson.build
>> index 914c7c4afb..9a5cea480d 100644
>> --- a/backends/meson.build
>> +++ b/backends/meson.build
>> @@ -20,6 +20,7 @@ if have_vhost_user
>>     system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
>>   endif
>>   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-
>vhost.c'))
>> +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
>>   if have_vhost_user_crypto
>>     system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-
>vhost-user.c'))
>>   endif
>> diff --git a/backends/trace-events b/backends/trace-events
>> index 652eb76a57..d45c6e31a6 100644
>> --- a/backends/trace-events
>> +++ b/backends/trace-events
>> @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
>>   dbus_vmstate_post_load(int version_id) "version_id: %d"
>>   dbus_vmstate_loading(const char *id) "id: %s"
>>   dbus_vmstate_saving(const char *id) "id: %s"
>> +
>> +# iommufd.c
>> +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d
>owned=%d users=%d (%d)"
>> +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
>> +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
>> +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova,
>uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d
>iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
>> +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas,
>uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d
>ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
>> +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova,
>uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64"
>size=0x%"PRIx64" (%d)"
>> +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) "
>iommufd=%d ioas=%d (%d)"
>> +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d
>id=%d (%d)"
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 42fd09e4de..70507c0ee6 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -5224,6 +5224,18 @@ SRST
>>
>>           The ``share`` boolean option is on by default with memfd.
>>
>> +    ``-object iommufd,id=id[,fd=fd]``
>> +        Creates an iommufd backend which allows control of DMA mapping
>> +        through the /dev/iommu device.
>> +
>> +        The ``id`` parameter is a unique ID which frontends (such as
>> +        vfio-pci of vdpa) will use to connect with the iommufd backend.
>> +
>> +        The ``fd`` parameter is an optional pre-opened file descriptor
>> +        resulting from /dev/iommu opening. Usually the iommufd is shared
>> +        across all subsystems, bringing the benefit of centralized
>> +        reference counting.
>> +
>>       ``-object rng-builtin,id=id``
>>           Creates a random number generator backend which obtains entropy
>>           from QEMU builtin functions. The ``id`` parameter is a unique ID

Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
Posted by Cédric Le Goater 1 year ago
On 11/15/23 05:06, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Cédric Le Goater <clg@redhat.com>
>> Sent: Tuesday, November 14, 2023 9:29 PM
>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>
>> On 11/14/23 11:09, Zhenzhong Duan wrote:
>>> From: Eric Auger <eric.auger@redhat.com>
>>>
>>> Introduce an iommufd object which allows the interaction
>>> with the host /dev/iommu device.
>>>
>>> The /dev/iommu can have been already pre-opened outside of qemu,
>>> in which case the fd can be passed directly along with the
>>> iommufd object:
>>>
>>> This allows the iommufd object to be shared accross several
>>> subsystems (VFIO, VDPA, ...). For example, libvirt would open
>>> the /dev/iommu once.
>>>
>>> If no fd is passed along with the iommufd object, the /dev/iommu
>>> is opened by the qemu code.
>>>
>>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>
>> I simplified the object declaration in include/sysemu/iommufd.h and
>> formatted /dev/iommu in qemu-options.hx. No need to resend.
> 
> Good catch, thanks! Maybe further simplified with below? This is minor.
> 
> OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)

Indeed. Done.



Thanks,

C.