From: Eric Auger <eric.auger@redhat.com>
Introduce an iommufd object which allows the interaction
with the host /dev/iommu device.
The /dev/iommu can have been already pre-opened outside of qemu,
in which case the fd can be passed directly along with the
iommufd object:
This allows the iommufd object to be shared accross several
subsystems (VFIO, VDPA, ...). For example, libvirt would open
the /dev/iommu once.
If no fd is passed along with the iommufd object, the /dev/iommu
is opened by the qemu code.
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
v6: remove redundant call, alloc_hwpt, get/put_ioas
MAINTAINERS | 7 ++
qapi/qom.json | 19 ++++
include/sysemu/iommufd.h | 44 ++++++++
backends/iommufd.c | 228 +++++++++++++++++++++++++++++++++++++++
backends/Kconfig | 4 +
backends/meson.build | 1 +
backends/trace-events | 10 ++
qemu-options.hx | 12 +++
8 files changed, 325 insertions(+)
create mode 100644 include/sysemu/iommufd.h
create mode 100644 backends/iommufd.c
diff --git a/MAINTAINERS b/MAINTAINERS
index ff1238bb98..a4891f7bda 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
F: docs/system/s390x/vfio-ap.rst
L: qemu-s390x@nongnu.org
+iommufd
+M: Yi Liu <yi.l.liu@intel.com>
+M: Eric Auger <eric.auger@redhat.com>
+S: Supported
+F: backends/iommufd.c
+F: include/sysemu/iommufd.h
+
vhost
M: Michael S. Tsirkin <mst@redhat.com>
S: Supported
diff --git a/qapi/qom.json b/qapi/qom.json
index c53ef978ff..1fd8555a75 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -794,6 +794,23 @@
{ 'struct': 'VfioUserServerProperties',
'data': { 'socket': 'SocketAddress', 'device': 'str' } }
+##
+# @IOMMUFDProperties:
+#
+# Properties for iommufd objects.
+#
+# @fd: file descriptor name previously passed via 'getfd' command,
+# which represents a pre-opened /dev/iommu. This allows the
+# iommufd object to be shared accross several subsystems
+# (VFIO, VDPA, ...), and the file descriptor to be shared
+# with other process, e.g. DPDK. (default: QEMU opens
+# /dev/iommu by itself)
+#
+# Since: 8.2
+##
+{ 'struct': 'IOMMUFDProperties',
+ 'data': { '*fd': 'str' } }
+
##
# @RngProperties:
#
@@ -934,6 +951,7 @@
'input-barrier',
{ 'name': 'input-linux',
'if': 'CONFIG_LINUX' },
+ 'iommufd',
'iothread',
'main-loop',
{ 'name': 'memory-backend-epc',
@@ -1003,6 +1021,7 @@
'input-barrier': 'InputBarrierProperties',
'input-linux': { 'type': 'InputLinuxProperties',
'if': 'CONFIG_LINUX' },
+ 'iommufd': 'IOMMUFDProperties',
'iothread': 'IothreadProperties',
'main-loop': 'MainLoopProperties',
'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties',
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
new file mode 100644
index 0000000000..9b3a86f57d
--- /dev/null
+++ b/include/sysemu/iommufd.h
@@ -0,0 +1,44 @@
+#ifndef SYSEMU_IOMMUFD_H
+#define SYSEMU_IOMMUFD_H
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+#include "exec/hwaddr.h"
+#include "exec/cpu-common.h"
+
+#define TYPE_IOMMUFD_BACKEND "iommufd"
+OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
+ IOMMUFD_BACKEND)
+#define IOMMUFD_BACKEND(obj) \
+ OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
+#define IOMMUFD_BACKEND_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), TYPE_IOMMUFD_BACKEND)
+#define IOMMUFD_BACKEND_CLASS(klass) \
+ OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), TYPE_IOMMUFD_BACKEND)
+struct IOMMUFDBackendClass {
+ ObjectClass parent_class;
+};
+
+struct IOMMUFDBackend {
+ Object parent;
+
+ /*< protected >*/
+ int fd; /* /dev/iommu file descriptor */
+ bool owned; /* is the /dev/iommu opened internally */
+ QemuMutex lock;
+ uint32_t users;
+
+ /*< public >*/
+};
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
+void iommufd_backend_disconnect(IOMMUFDBackend *be);
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+ Error **errp);
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly);
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, ram_addr_t size);
+#endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
new file mode 100644
index 0000000000..ea3e2a8f85
--- /dev/null
+++ b/backends/iommufd.c
@@ -0,0 +1,228 @@
+/*
+ * iommufd container backend
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ * Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/iommufd.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/module.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "monitor/monitor.h"
+#include "trace.h"
+#include <sys/ioctl.h>
+#include <linux/iommufd.h>
+
+static void iommufd_backend_init(Object *obj)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+ be->fd = -1;
+ be->users = 0;
+ be->owned = true;
+ qemu_mutex_init(&be->lock);
+}
+
+static void iommufd_backend_finalize(Object *obj)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+ if (be->owned) {
+ close(be->fd);
+ be->fd = -1;
+ }
+}
+
+static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+ int fd = -1;
+
+ fd = monitor_fd_param(monitor_cur(), str, errp);
+ if (fd == -1) {
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
+ return;
+ }
+ qemu_mutex_lock(&be->lock);
+ be->fd = fd;
+ be->owned = false;
+ qemu_mutex_unlock(&be->lock);
+ trace_iommu_backend_set_fd(be->fd);
+}
+
+static void iommufd_backend_class_init(ObjectClass *oc, void *data)
+{
+ object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
+}
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
+{
+ int fd, ret = 0;
+
+ qemu_mutex_lock(&be->lock);
+ if (be->users == UINT32_MAX) {
+ error_setg(errp, "too many connections");
+ ret = -E2BIG;
+ goto out;
+ }
+ if (be->owned && !be->users) {
+ fd = qemu_open_old("/dev/iommu", O_RDWR);
+ if (fd < 0) {
+ error_setg_errno(errp, errno, "/dev/iommu opening failed");
+ ret = fd;
+ goto out;
+ }
+ be->fd = fd;
+ }
+ be->users++;
+out:
+ trace_iommufd_backend_connect(be->fd, be->owned,
+ be->users, ret);
+ qemu_mutex_unlock(&be->lock);
+ return ret;
+}
+
+void iommufd_backend_disconnect(IOMMUFDBackend *be)
+{
+ qemu_mutex_lock(&be->lock);
+ if (!be->users) {
+ goto out;
+ }
+ be->users--;
+ if (!be->users && be->owned) {
+ close(be->fd);
+ be->fd = -1;
+ }
+out:
+ trace_iommufd_backend_disconnect(be->fd, be->users);
+ qemu_mutex_unlock(&be->lock);
+}
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+ Error **errp)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_alloc alloc_data = {
+ .size = sizeof(alloc_data),
+ .flags = 0,
+ };
+
+ ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
+ if (ret) {
+ error_setg_errno(errp, errno, "Failed to allocate ioas");
+ return ret;
+ }
+
+ *ioas_id = alloc_data.out_ioas_id;
+ trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
+
+ return ret;
+}
+
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
+{
+ int ret, fd = be->fd;
+ struct iommu_destroy des = {
+ .size = sizeof(des),
+ .id = id,
+ };
+
+ ret = ioctl(fd, IOMMU_DESTROY, &des);
+ trace_iommufd_backend_free_id(fd, id, ret);
+ if (ret) {
+ error_report("Failed to free id: %u %m", id);
+ }
+}
+
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_map map = {
+ .size = sizeof(map),
+ .flags = IOMMU_IOAS_MAP_READABLE |
+ IOMMU_IOAS_MAP_FIXED_IOVA,
+ .ioas_id = ioas_id,
+ .__reserved = 0,
+ .user_va = (uintptr_t)vaddr,
+ .iova = iova,
+ .length = size,
+ };
+
+ if (!readonly) {
+ map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
+ }
+
+ ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
+ trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
+ vaddr, readonly, ret);
+ if (ret) {
+ ret = -errno;
+ error_report("IOMMU_IOAS_MAP failed: %m");
+ }
+ return ret;
+}
+
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, ram_addr_t size)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_unmap unmap = {
+ .size = sizeof(unmap),
+ .ioas_id = ioas_id,
+ .iova = iova,
+ .length = size,
+ };
+
+ ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
+ /*
+ * IOMMUFD takes mapping as some kind of object, unmapping
+ * nonexistent mapping is treated as deleting a nonexistent
+ * object and return ENOENT. This is different from legacy
+ * backend which allows it. vIOMMU may trigger a lot of
+ * redundant unmapping, to avoid flush the log, treat them
+ * as succeess for IOMMUFD just like legacy backend.
+ */
+ if (ret && errno == ENOENT) {
+ trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
+ ret = 0;
+ } else {
+ trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
+ }
+
+ if (ret) {
+ ret = -errno;
+ error_report("IOMMU_IOAS_UNMAP failed: %m");
+ }
+ return ret;
+}
+
+static const TypeInfo iommufd_backend_info = {
+ .name = TYPE_IOMMUFD_BACKEND,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(IOMMUFDBackend),
+ .instance_init = iommufd_backend_init,
+ .instance_finalize = iommufd_backend_finalize,
+ .class_size = sizeof(IOMMUFDBackendClass),
+ .class_init = iommufd_backend_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&iommufd_backend_info);
+}
+
+type_init(register_types);
diff --git a/backends/Kconfig b/backends/Kconfig
index f35abc1609..2cb23f62fa 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -1 +1,5 @@
source tpm/Kconfig
+
+config IOMMUFD
+ bool
+ depends on VFIO
diff --git a/backends/meson.build b/backends/meson.build
index 914c7c4afb..9a5cea480d 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -20,6 +20,7 @@ if have_vhost_user
system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
endif
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
+system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
if have_vhost_user_crypto
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
endif
diff --git a/backends/trace-events b/backends/trace-events
index 652eb76a57..d45c6e31a6 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
dbus_vmstate_post_load(int version_id) "version_id: %d"
dbus_vmstate_loading(const char *id) "id: %s"
dbus_vmstate_saving(const char *id) "id: %s"
+
+# iommufd.c
+iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
+iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
+iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
+iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
+iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
+iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
diff --git a/qemu-options.hx b/qemu-options.hx
index 42fd09e4de..70507c0ee6 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5224,6 +5224,18 @@ SRST
The ``share`` boolean option is on by default with memfd.
+ ``-object iommufd,id=id[,fd=fd]``
+ Creates an iommufd backend which allows control of DMA mapping
+ through the /dev/iommu device.
+
+ The ``id`` parameter is a unique ID which frontends (such as
+ vfio-pci of vdpa) will use to connect with the iommufd backend.
+
+ The ``fd`` parameter is an optional pre-opened file descriptor
+ resulting from /dev/iommu opening. Usually the iommufd is shared
+ across all subsystems, bringing the benefit of centralized
+ reference counting.
+
``-object rng-builtin,id=id``
Creates a random number generator backend which obtains entropy
from QEMU builtin functions. The ``id`` parameter is a unique ID
--
2.34.1
Hello, > +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, > + ram_addr_t size, void *vaddr, bool readonly) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_map map = { > + .size = sizeof(map), > + .flags = IOMMU_IOAS_MAP_READABLE | > + IOMMU_IOAS_MAP_FIXED_IOVA, > + .ioas_id = ioas_id, > + .__reserved = 0, > + .user_va = (uintptr_t)vaddr, > + .iova = iova, > + .length = size, > + }; > + > + if (!readonly) { > + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; > + } > + > + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); > + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, > + vaddr, readonly, ret); > + if (ret) { > + ret = -errno; > + error_report("IOMMU_IOAS_MAP failed: %m"); > + } > + return ret; > +} When using a UEFI guest, QEMU reports errors when mapping regions in the top PCI space : iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000001000 size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1) qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument) iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000004000 size=0x4000 addr=0x7fce2c980000 readonly=0 (-1) qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument) This is because IOMMUFD reserved IOVAs areas are : [ fee00000 - feefffff ] [ 8000000000 - ffffffffffffffff ] (39 bits address space) which were allocated when the device was initially attached. The topology is basic. Something is wrong. Thanks, C.
Hi Cédric, >-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Sent: Friday, November 17, 2023 7:10 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >Hello, > >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_map map = { >> + .size = sizeof(map), >> + .flags = IOMMU_IOAS_MAP_READABLE | >> + IOMMU_IOAS_MAP_FIXED_IOVA, >> + .ioas_id = ioas_id, >> + .__reserved = 0, >> + .user_va = (uintptr_t)vaddr, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + if (!readonly) { >> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >> + } >> + >> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >> + vaddr, readonly, ret); >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_MAP failed: %m"); >> + } >> + return ret; >> +} > >When using a UEFI guest, QEMU reports errors when mapping regions >in the top PCI space : > > iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000001000 >size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1) > qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument > qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument) > > iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000004000 >size=0x4000 addr=0x7fce2c980000 readonly=0 (-1) > qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument > qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument) > >This is because IOMMUFD reserved IOVAs areas are : > > [ fee00000 - feefffff ] > [ 8000000000 - ffffffffffffffff ] (39 bits address space) > >which were allocated when the device was initially attached. >The topology is basic. Something is wrong. Thanks for your report. This looks a hardware limit of host IOMMU address width(39) < guest physical address width. A similar issue with a fix submitted below, ccing related people. https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html It looks the fix will not work for hotplug. Or below qemu cmdline may help: "-cpu host,host-phys-bits-limit=39" Thanks Zhenzhong
Hi Cédric, On 11/17/23 12:39, Duan, Zhenzhong wrote: > Hi Cédric, > >> -----Original Message----- >> From: Cédric Le Goater <clg@redhat.com> >> Sent: Friday, November 17, 2023 7:10 PM >> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object >> >> Hello, >> >>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> hwaddr iova, >>> + ram_addr_t size, void *vaddr, bool readonly) >>> +{ >>> + int ret, fd = be->fd; >>> + struct iommu_ioas_map map = { >>> + .size = sizeof(map), >>> + .flags = IOMMU_IOAS_MAP_READABLE | >>> + IOMMU_IOAS_MAP_FIXED_IOVA, >>> + .ioas_id = ioas_id, >>> + .__reserved = 0, >>> + .user_va = (uintptr_t)vaddr, >>> + .iova = iova, >>> + .length = size, >>> + }; >>> + >>> + if (!readonly) { >>> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >>> + } >>> + >>> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >>> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >>> + vaddr, readonly, ret); >>> + if (ret) { >>> + ret = -errno; >>> + error_report("IOMMU_IOAS_MAP failed: %m"); >>> + } >>> + return ret; >>> +} >> When using a UEFI guest, QEMU reports errors when mapping regions >> in the top PCI space : >> >> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000001000 >> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1) >> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument) >> >> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000004000 >> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1) >> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument) >> >> This is because IOMMUFD reserved IOVAs areas are : >> >> [ fee00000 - feefffff ] >> [ 8000000000 - ffffffffffffffff ] (39 bits address space) >> >> which were allocated when the device was initially attached. >> The topology is basic. Something is wrong. > > Thanks for your report. This looks a hardware limit of > host IOMMU address width(39) < guest physical address width. > > A similar issue with a fix submitted below, ccing related people. > https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html > It looks the fix will not work for hotplug. > > Or below qemu cmdline may help: > "-cpu host,host-phys-bits-limit=39" don't you have the same issue with legacy VFIO code, you should? Eric > > Thanks > Zhenzhong >
On 11/17/23 14:29, Eric Auger wrote: > Hi Cédric, > > On 11/17/23 12:39, Duan, Zhenzhong wrote: >> Hi Cédric, >> >>> -----Original Message----- >>> From: Cédric Le Goater <clg@redhat.com> >>> Sent: Friday, November 17, 2023 7:10 PM >>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object >>> >>> Hello, >>> >>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >>> hwaddr iova, >>>> + ram_addr_t size, void *vaddr, bool readonly) >>>> +{ >>>> + int ret, fd = be->fd; >>>> + struct iommu_ioas_map map = { >>>> + .size = sizeof(map), >>>> + .flags = IOMMU_IOAS_MAP_READABLE | >>>> + IOMMU_IOAS_MAP_FIXED_IOVA, >>>> + .ioas_id = ioas_id, >>>> + .__reserved = 0, >>>> + .user_va = (uintptr_t)vaddr, >>>> + .iova = iova, >>>> + .length = size, >>>> + }; >>>> + >>>> + if (!readonly) { >>>> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >>>> + } >>>> + >>>> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >>>> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >>>> + vaddr, readonly, ret); >>>> + if (ret) { >>>> + ret = -errno; >>>> + error_report("IOMMU_IOAS_MAP failed: %m"); >>>> + } >>>> + return ret; >>>> +} >>> When using a UEFI guest, QEMU reports errors when mapping regions >>> in the top PCI space : >>> >>> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000001000 >>> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1) >>> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >>> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >>> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument) >>> >>> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000004000 >>> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1) >>> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >>> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >>> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument) >>> >>> This is because IOMMUFD reserved IOVAs areas are : >>> >>> [ fee00000 - feefffff ] >>> [ 8000000000 - ffffffffffffffff ] (39 bits address space) >>> >>> which were allocated when the device was initially attached. >>> The topology is basic. Something is wrong. >> >> Thanks for your report. This looks a hardware limit of >> host IOMMU address width(39) < guest physical address width. >> >> A similar issue with a fix submitted below, ccing related people. >> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html >> It looks the fix will not work for hotplug. >> >> Or below qemu cmdline may help: >> "-cpu host,host-phys-bits-limit=39" > > don't you have the same issue with legacy VFIO code, you should? I tend to be lazy and use seabios for guests on the command line. I do see the error with legacy VFIO and uefi. However, with the address space size work-around and iommufd, the error is different, an EFAULT now. Some page pinning issue it seems. Thanks, C.
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Sent: Friday, November 17, 2023 9:56 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >On 11/17/23 14:29, Eric Auger wrote: >> Hi Cédric, >> >> On 11/17/23 12:39, Duan, Zhenzhong wrote: >>> Hi Cédric, >>> >>>> -----Original Message----- >>>> From: Cédric Le Goater <clg@redhat.com> >>>> Sent: Friday, November 17, 2023 7:10 PM >>>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd >object >>>> >>>> Hello, >>>> >>>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >>>> hwaddr iova, >>>>> + ram_addr_t size, void *vaddr, bool readonly) >>>>> +{ >>>>> + int ret, fd = be->fd; >>>>> + struct iommu_ioas_map map = { >>>>> + .size = sizeof(map), >>>>> + .flags = IOMMU_IOAS_MAP_READABLE | >>>>> + IOMMU_IOAS_MAP_FIXED_IOVA, >>>>> + .ioas_id = ioas_id, >>>>> + .__reserved = 0, >>>>> + .user_va = (uintptr_t)vaddr, >>>>> + .iova = iova, >>>>> + .length = size, >>>>> + }; >>>>> + >>>>> + if (!readonly) { >>>>> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >>>>> + } >>>>> + >>>>> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >>>>> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >>>>> + vaddr, readonly, ret); >>>>> + if (ret) { >>>>> + ret = -errno; >>>>> + error_report("IOMMU_IOAS_MAP failed: %m"); >>>>> + } >>>>> + return ret; >>>>> +} >>>> When using a UEFI guest, QEMU reports errors when mapping regions >>>> in the top PCI space : >>>> >>>> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000001000 >>>> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1) >>>> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >>>> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >>>> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument) >>>> >>>> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000004000 >>>> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1) >>>> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >>>> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >>>> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument) >>>> >>>> This is because IOMMUFD reserved IOVAs areas are : >>>> >>>> [ fee00000 - feefffff ] >>>> [ 8000000000 - ffffffffffffffff ] (39 bits address space) >>>> >>>> which were allocated when the device was initially attached. >>>> The topology is basic. Something is wrong. >>> >>> Thanks for your report. This looks a hardware limit of >>> host IOMMU address width(39) < guest physical address width. >>> >>> A similar issue with a fix submitted below, ccing related people. >>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html >>> It looks the fix will not work for hotplug. >>> >>> Or below qemu cmdline may help: >>> "-cpu host,host-phys-bits-limit=39" >> >> don't you have the same issue with legacy VFIO code, you should? > >I tend to be lazy and use seabios for guests on the command line. >I do see the error with legacy VFIO and uefi. > >However, with the address space size work-around and iommufd, the >error is different, an EFAULT now. Some page pinning issue it seems. Yes, this reminds me of iommufd not supporting p2p mapping yet. So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-iommufd.rst Thanks Zhenzhong
>>>> A similar issue with a fix submitted below, ccing related people. >>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html >>>> It looks the fix will not work for hotplug. >>>> >>>> Or below qemu cmdline may help: >>>> "-cpu host,host-phys-bits-limit=39" >>> >>> don't you have the same issue with legacy VFIO code, you should? >> >> I tend to be lazy and use seabios for guests on the command line. >> I do see the error with legacy VFIO and uefi. >> >> However, with the address space size work-around and iommufd, the >> error is different, an EFAULT now. Some page pinning issue it seems. > > Yes, this reminds me of iommufd not supporting p2p mapping yet. OK. Should we transform this error in a warning ? The code needs at least a comment. > So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-iommufd.rst Yes. It would be good to have a list of gaps and effects in the documentation. See Jason's presentation at LPC. https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC2023_iommufd.pdf Thanks, C.
Hi Cédric,
>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Sent: Monday, November 20, 2023 4:25 PM
>Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>
>>>>> A similar issue with a fix submitted below, ccing related people.
>>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html
>>>>> It looks the fix will not work for hotplug.
>>>>>
>>>>> Or below qemu cmdline may help:
>>>>> "-cpu host,host-phys-bits-limit=39"
>>>>
>>>> don't you have the same issue with legacy VFIO code, you should?
>>>
>>> I tend to be lazy and use seabios for guests on the command line.
>>> I do see the error with legacy VFIO and uefi.
>>>
>>> However, with the address space size work-around and iommufd, the
>>> error is different, an EFAULT now. Some page pinning issue it seems.
>>
>> Yes, this reminds me of iommufd not supporting p2p mapping yet.
>
>OK. Should we transform this error in a warning ? The code needs
>at least a comment.
Make sense, though I'm not clear if there is other corner case return EFAULT.
I plan below change in v7:
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 53fdac4cc0..ba58a0eb0d 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -178,7 +178,13 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
vaddr, readonly, ret);
if (ret) {
ret = -errno;
- error_report("IOMMU_IOAS_MAP failed: %m");
+
+ /* TODO: Not support mapping hardware PCI BAR region for now. */
+ if (errno == EFAULT) {
+ warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
+ } else {
+ error_report("IOMMU_IOAS_MAP failed: %m");
+ }
}
return ret;
}
I failed to change vfio_container_dma_map print as warning because for legacy container, it's real errro.
So print after fix:
qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
>
>> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio-
>iommufd.rst
>
>Yes. It would be good to have a list of gaps and effects in the
>documentation. See Jason's presentation at LPC.
>
>
>https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC202
>3_iommufd.pdf
I see, PCI Peer to Peer and POWER/SPAPR are related to qemu iommufd implementation.
For POWER/SPAPR, we have "Supported platform" section.
Below are other gaps I can think of for now:
Gaps:
1. dirty page sync, WIP (Joao)
2. p2p dma not supported yet.
3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's a mdev from a fd.
Thanks
Zhenzhong
Hello Zhenzhong On 11/20/23 11:07, Duan, Zhenzhong wrote: > Hi Cédric, > >> -----Original Message----- >> From: Cédric Le Goater <clg@redhat.com> >> Sent: Monday, November 20, 2023 4:25 PM >> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object >> >>>>>> A similar issue with a fix submitted below, ccing related people. >>>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html >>>>>> It looks the fix will not work for hotplug. >>>>>> >>>>>> Or below qemu cmdline may help: >>>>>> "-cpu host,host-phys-bits-limit=39" >>>>> >>>>> don't you have the same issue with legacy VFIO code, you should? >>>> >>>> I tend to be lazy and use seabios for guests on the command line. >>>> I do see the error with legacy VFIO and uefi. >>>> >>>> However, with the address space size work-around and iommufd, the >>>> error is different, an EFAULT now. Some page pinning issue it seems. >>> >>> Yes, this reminds me of iommufd not supporting p2p mapping yet. >> >> OK. Should we transform this error in a warning ? The code needs >> at least a comment. > > Make sense, though I'm not clear if there is other corner case return EFAULT. yep. That's the problem. > I plan below change in v7: > > diff --git a/backends/iommufd.c b/backends/iommufd.c > index 53fdac4cc0..ba58a0eb0d 100644 > --- a/backends/iommufd.c > +++ b/backends/iommufd.c > @@ -178,7 +178,13 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, > vaddr, readonly, ret); > if (ret) { > ret = -errno; > - error_report("IOMMU_IOAS_MAP failed: %m"); > + > + /* TODO: Not support mapping hardware PCI BAR region for now. */ > + if (errno == EFAULT) { > + warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?"); > + } else { > + error_report("IOMMU_IOAS_MAP failed: %m"); > + } > } > return ret; > } > > I failed to change vfio_container_dma_map print as warning because for legacy container, it's real errro. > So print after fix: > > qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR? > qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address) I am OK with that. Let's see what the others have to say. >> >>> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio- >> iommufd.rst >> >> Yes. It would be good to have a list of gaps and effects in the >> documentation. See Jason's presentation at LPC. >> >> >> https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC202 >> 3_iommufd.pdf > > I see, PCI Peer to Peer and POWER/SPAPR are related to qemu iommufd implementation. > For POWER/SPAPR, we have "Supported platform" section. yes. > Below are other gaps I can think of for now: > > Gaps: > 1. dirty page sync, WIP (Joao) > 2. p2p dma not supported yet. > 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's a mdev from a fd. Call the section Caveats maybe? Thanks, C.
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Sent: Tuesday, November 21, 2023 1:09 AM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >Hello Zhenzhong > >On 11/20/23 11:07, Duan, Zhenzhong wrote: >> Hi Cédric, >> >>> -----Original Message----- >>> From: Cédric Le Goater <clg@redhat.com> >>> Sent: Monday, November 20, 2023 4:25 PM >>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd >object >>> >>>>>>> A similar issue with a fix submitted below, ccing related people. >>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html >>>>>>> It looks the fix will not work for hotplug. >>>>>>> >>>>>>> Or below qemu cmdline may help: >>>>>>> "-cpu host,host-phys-bits-limit=39" >>>>>> >>>>>> don't you have the same issue with legacy VFIO code, you should? >>>>> >>>>> I tend to be lazy and use seabios for guests on the command line. >>>>> I do see the error with legacy VFIO and uefi. >>>>> >>>>> However, with the address space size work-around and iommufd, the >>>>> error is different, an EFAULT now. Some page pinning issue it seems. >>>> >>>> Yes, this reminds me of iommufd not supporting p2p mapping yet. >>> >>> OK. Should we transform this error in a warning ? The code needs >>> at least a comment. >> >> Make sense, though I'm not clear if there is other corner case return EFAULT. > >yep. That's the problem. > >> I plan below change in v7: >> >> diff --git a/backends/iommufd.c b/backends/iommufd.c >> index 53fdac4cc0..ba58a0eb0d 100644 >> --- a/backends/iommufd.c >> +++ b/backends/iommufd.c >> @@ -178,7 +178,13 @@ int iommufd_backend_map_dma(IOMMUFDBackend >*be, uint32_t ioas_id, hwaddr iova, >> vaddr, readonly, ret); >> if (ret) { >> ret = -errno; >> - error_report("IOMMU_IOAS_MAP failed: %m"); >> + >> + /* TODO: Not support mapping hardware PCI BAR region for now. */ >> + if (errno == EFAULT) { >> + warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?"); >> + } else { >> + error_report("IOMMU_IOAS_MAP failed: %m"); >> + } >> } >> return ret; >> } >> >> I failed to change vfio_container_dma_map print as warning because for legacy >container, it's real errro. >> So print after fix: >> >> qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI >BAR? >> qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, >0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address) > >I am OK with that. Let's see what the others have to say. > >>> >>>> So EFAULT is expected. Maybe I should add a comment in docs/devel/vfio- >>> iommufd.rst >>> >>> Yes. It would be good to have a list of gaps and effects in the >>> documentation. See Jason's presentation at LPC. >>> >>> >>> >https://lpc.events/event/17/contributions/1418/attachments/1297/2607/LPC202 >>> 3_iommufd.pdf >> >> I see, PCI Peer to Peer and POWER/SPAPR are related to qemu iommufd >implementation. >> For POWER/SPAPR, we have "Supported platform" section. > >yes. > >> Below are other gaps I can think of for now: >> >> Gaps: >> 1. dirty page sync, WIP (Joao) >> 2. p2p dma not supported yet. >> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's >a mdev from a fd. > >Call the section Caveats maybe? Got it. Thanks Zhenzhong
Hello Zhenzhong, >>> Below are other gaps I can think of for now: >>> >>> Gaps: >>> 1. dirty page sync, WIP (Joao) >>> 2. p2p dma not supported yet. >>> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know it's >> a mdev from a fd. >> >> Call the section Caveats maybe? > > Got it. It looks like v7 should be ready by rc2 (next week). I would then merge in vfio-next and wait a week before sending a QEMU-9.0 PR. Thanks, C.
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Sent: Tuesday, November 21, 2023 4:06 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >Hello Zhenzhong, > >>>> Below are other gaps I can think of for now: >>>> >>>> Gaps: >>>> 1. dirty page sync, WIP (Joao) >>>> 2. p2p dma not supported yet. >>>> 3. fd passing with mdev not support ram discard(vfio-pci) as no way to know >it's >>> a mdev from a fd. >>> >>> Call the section Caveats maybe? >> >> Got it. > >It looks like v7 should be ready by rc2 (next week). I would then merge >in vfio-next and wait a week before sending a QEMU-9.0 PR. Got it, I'll send out soon. Thanks Zhenzhong
On 11/17/23 12:39, Duan, Zhenzhong wrote: > Hi Cédric, > >> -----Original Message----- >> From: Cédric Le Goater <clg@redhat.com> >> Sent: Friday, November 17, 2023 7:10 PM >> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object >> >> Hello, >> >>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> hwaddr iova, >>> + ram_addr_t size, void *vaddr, bool readonly) >>> +{ >>> + int ret, fd = be->fd; >>> + struct iommu_ioas_map map = { >>> + .size = sizeof(map), >>> + .flags = IOMMU_IOAS_MAP_READABLE | >>> + IOMMU_IOAS_MAP_FIXED_IOVA, >>> + .ioas_id = ioas_id, >>> + .__reserved = 0, >>> + .user_va = (uintptr_t)vaddr, >>> + .iova = iova, >>> + .length = size, >>> + }; >>> + >>> + if (!readonly) { >>> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >>> + } >>> + >>> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >>> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >>> + vaddr, readonly, ret); >>> + if (ret) { >>> + ret = -errno; >>> + error_report("IOMMU_IOAS_MAP failed: %m"); >>> + } >>> + return ret; >>> +} >> >> When using a UEFI guest, QEMU reports errors when mapping regions >> in the top PCI space : >> >> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000001000 >> size=0x3000 addr=0x7fce2c28b000 readonly=0 (-1) >> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >> 0x380000001000, 0x3000, 0x7fce2c28b000) = -22 (Invalid argument) >> >> iommufd_backend_map_dma iommufd=10 ioas=2 iova=0x380000004000 >> size=0x4000 addr=0x7fce2c980000 readonly=0 (-1) >> qemu-system-x86_64: IOMMU_IOAS_MAP failed: Invalid argument >> qemu-system-x86_64: vfio_container_dma_map(0x55a21b03a150, >> 0x380000004000, 0x4000, 0x7fce2c980000) = -22 (Invalid argument) >> >> This is because IOMMUFD reserved IOVAs areas are : >> >> [ fee00000 - feefffff ] >> [ 8000000000 - ffffffffffffffff ] (39 bits address space) >> >> which were allocated when the device was initially attached. >> The topology is basic. Something is wrong. > > Thanks for your report. This looks a hardware limit of > host IOMMU address width(39) < guest physical address width. > > A similar issue with a fix submitted below, ccing related people. > https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg02937.html > It looks the fix will not work for hotplug. > > Or below qemu cmdline may help: > "-cpu host,host-phys-bits-limit=39" Not that much. The IOMMU_IOAS_MAP failure becomes a "Bad address". Thanks, C.
Hi Zhenzhong, On 11/14/23 11:09, Zhenzhong Duan wrote: > From: Eric Auger <eric.auger@redhat.com> > > Introduce an iommufd object which allows the interaction > with the host /dev/iommu device. > > The /dev/iommu can have been already pre-opened outside of qemu, > in which case the fd can be passed directly along with the > iommufd object: > > This allows the iommufd object to be shared accross several > subsystems (VFIO, VDPA, ...). For example, libvirt would open > the /dev/iommu once. > > If no fd is passed along with the iommufd object, the /dev/iommu > is opened by the qemu code. > > Suggested-by: Alex Williamson <alex.williamson@redhat.com> > Signed-off-by: Eric Auger <eric.auger@redhat.com> > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > --- > v6: remove redundant call, alloc_hwpt, get/put_ioas > > MAINTAINERS | 7 ++ > qapi/qom.json | 19 ++++ > include/sysemu/iommufd.h | 44 ++++++++ > backends/iommufd.c | 228 +++++++++++++++++++++++++++++++++++++++ > backends/Kconfig | 4 + > backends/meson.build | 1 + > backends/trace-events | 10 ++ > qemu-options.hx | 12 +++ > 8 files changed, 325 insertions(+) > create mode 100644 include/sysemu/iommufd.h > create mode 100644 backends/iommufd.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index ff1238bb98..a4891f7bda 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c > F: docs/system/s390x/vfio-ap.rst > L: qemu-s390x@nongnu.org > > +iommufd > +M: Yi Liu <yi.l.liu@intel.com> > +M: Eric Auger <eric.auger@redhat.com> Zhenzhong, don't you want to be added here? > +S: Supported > +F: backends/iommufd.c > +F: include/sysemu/iommufd.h > + > vhost > M: Michael S. Tsirkin <mst@redhat.com> > S: Supported > diff --git a/qapi/qom.json b/qapi/qom.json > index c53ef978ff..1fd8555a75 100644 > --- a/qapi/qom.json > +++ b/qapi/qom.json > @@ -794,6 +794,23 @@ > { 'struct': 'VfioUserServerProperties', > 'data': { 'socket': 'SocketAddress', 'device': 'str' } } > > +## > +# @IOMMUFDProperties: > +# > +# Properties for iommufd objects. > +# > +# @fd: file descriptor name previously passed via 'getfd' command, "previously passed via 'getfd' command", I wonder if this applies here or whether it is copy/paste of RemoteObjectProperties.fd doc? > +# which represents a pre-opened /dev/iommu. This allows the > +# iommufd object to be shared accross several subsystems > +# (VFIO, VDPA, ...), and the file descriptor to be shared > +# with other process, e.g. DPDK. (default: QEMU opens > +# /dev/iommu by itself) > +# > +# Since: 8.2 > +## > +{ 'struct': 'IOMMUFDProperties', > + 'data': { '*fd': 'str' } } > + > ## > # @RngProperties: > # > @@ -934,6 +951,7 @@ > 'input-barrier', > { 'name': 'input-linux', > 'if': 'CONFIG_LINUX' }, > + 'iommufd', > 'iothread', > 'main-loop', > { 'name': 'memory-backend-epc', > @@ -1003,6 +1021,7 @@ > 'input-barrier': 'InputBarrierProperties', > 'input-linux': { 'type': 'InputLinuxProperties', > 'if': 'CONFIG_LINUX' }, > + 'iommufd': 'IOMMUFDProperties', > 'iothread': 'IothreadProperties', > 'main-loop': 'MainLoopProperties', > 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', > diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h > new file mode 100644 > index 0000000000..9b3a86f57d > --- /dev/null > +++ b/include/sysemu/iommufd.h > @@ -0,0 +1,44 @@ > +#ifndef SYSEMU_IOMMUFD_H > +#define SYSEMU_IOMMUFD_H > + > +#include "qom/object.h" > +#include "qemu/thread.h" > +#include "exec/hwaddr.h" > +#include "exec/cpu-common.h" > + > +#define TYPE_IOMMUFD_BACKEND "iommufd" > +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, > + IOMMUFD_BACKEND) > +#define IOMMUFD_BACKEND(obj) \ > + OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND) > +#define IOMMUFD_BACKEND_GET_CLASS(obj) \ > + OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), TYPE_IOMMUFD_BACKEND) > +#define IOMMUFD_BACKEND_CLASS(klass) \ > + OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), TYPE_IOMMUFD_BACKEND) > +struct IOMMUFDBackendClass { > + ObjectClass parent_class; > +}; > + > +struct IOMMUFDBackend { > + Object parent; > + > + /*< protected >*/ > + int fd; /* /dev/iommu file descriptor */ > + bool owned; /* is the /dev/iommu opened internally */ > + QemuMutex lock; > + uint32_t users; > + > + /*< public >*/ > +}; > + > +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); > +void iommufd_backend_disconnect(IOMMUFDBackend *be); > + > +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, > + Error **errp); > +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); > +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, > + ram_addr_t size, void *vaddr, bool readonly); > +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, > + hwaddr iova, ram_addr_t size); > +#endif > diff --git a/backends/iommufd.c b/backends/iommufd.c > new file mode 100644 > index 0000000000..ea3e2a8f85 > --- /dev/null > +++ b/backends/iommufd.c > @@ -0,0 +1,228 @@ > +/* > + * iommufd container backend > + * > + * Copyright (C) 2023 Intel Corporation. > + * Copyright Red Hat, Inc. 2023 > + * > + * Authors: Yi Liu <yi.l.liu@intel.com> > + * Eric Auger <eric.auger@redhat.com> > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#include "qemu/osdep.h" > +#include "sysemu/iommufd.h" > +#include "qapi/error.h" > +#include "qapi/qmp/qerror.h" > +#include "qemu/module.h" > +#include "qom/object_interfaces.h" > +#include "qemu/error-report.h" > +#include "monitor/monitor.h" > +#include "trace.h" > +#include <sys/ioctl.h> > +#include <linux/iommufd.h> > + > +static void iommufd_backend_init(Object *obj) > +{ > + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); > + > + be->fd = -1; > + be->users = 0; > + be->owned = true; > + qemu_mutex_init(&be->lock); > +} > + > +static void iommufd_backend_finalize(Object *obj) > +{ > + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); > + > + if (be->owned) { > + close(be->fd); > + be->fd = -1; > + } > +} > + > +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) > +{ > + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); > + int fd = -1; > + > + fd = monitor_fd_param(monitor_cur(), str, errp); > + if (fd == -1) { > + error_prepend(errp, "Could not parse remote object fd %s:", str); > + return; > + } > + qemu_mutex_lock(&be->lock); > + be->fd = fd; > + be->owned = false; > + qemu_mutex_unlock(&be->lock); > + trace_iommu_backend_set_fd(be->fd); > +} > + > +static void iommufd_backend_class_init(ObjectClass *oc, void *data) > +{ > + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); > +} > + > +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) > +{ > + int fd, ret = 0; > + > + qemu_mutex_lock(&be->lock); > + if (be->users == UINT32_MAX) { > + error_setg(errp, "too many connections"); > + ret = -E2BIG; > + goto out; > + } > + if (be->owned && !be->users) { > + fd = qemu_open_old("/dev/iommu", O_RDWR); > + if (fd < 0) { > + error_setg_errno(errp, errno, "/dev/iommu opening failed"); > + ret = fd; > + goto out; > + } > + be->fd = fd; > + } > + be->users++; > +out: > + trace_iommufd_backend_connect(be->fd, be->owned, > + be->users, ret); > + qemu_mutex_unlock(&be->lock); > + return ret; > +} > + > +void iommufd_backend_disconnect(IOMMUFDBackend *be) > +{ > + qemu_mutex_lock(&be->lock); > + if (!be->users) { > + goto out; > + } > + be->users--; > + if (!be->users && be->owned) { > + close(be->fd); > + be->fd = -1; > + } > +out: > + trace_iommufd_backend_disconnect(be->fd, be->users); > + qemu_mutex_unlock(&be->lock); > +} > + > +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, > + Error **errp) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_alloc alloc_data = { > + .size = sizeof(alloc_data), > + .flags = 0, > + }; > + > + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); > + if (ret) { > + error_setg_errno(errp, errno, "Failed to allocate ioas"); > + return ret; > + } > + > + *ioas_id = alloc_data.out_ioas_id; > + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); > + > + return ret; > +} > + > +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) > +{ > + int ret, fd = be->fd; > + struct iommu_destroy des = { > + .size = sizeof(des), > + .id = id, > + }; > + > + ret = ioctl(fd, IOMMU_DESTROY, &des); > + trace_iommufd_backend_free_id(fd, id, ret); > + if (ret) { > + error_report("Failed to free id: %u %m", id); > + } > +} > + > +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, > + ram_addr_t size, void *vaddr, bool readonly) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_map map = { > + .size = sizeof(map), > + .flags = IOMMU_IOAS_MAP_READABLE | > + IOMMU_IOAS_MAP_FIXED_IOVA, > + .ioas_id = ioas_id, > + .__reserved = 0, > + .user_va = (uintptr_t)vaddr, > + .iova = iova, > + .length = size, > + }; > + > + if (!readonly) { > + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; > + } > + > + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); > + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, > + vaddr, readonly, ret); > + if (ret) { > + ret = -errno; > + error_report("IOMMU_IOAS_MAP failed: %m"); > + } > + return ret; > +} > + > +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, > + hwaddr iova, ram_addr_t size) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_unmap unmap = { > + .size = sizeof(unmap), > + .ioas_id = ioas_id, > + .iova = iova, > + .length = size, > + }; > + > + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); > + /* > + * IOMMUFD takes mapping as some kind of object, unmapping > + * nonexistent mapping is treated as deleting a nonexistent > + * object and return ENOENT. This is different from legacy > + * backend which allows it. vIOMMU may trigger a lot of > + * redundant unmapping, to avoid flush the log, treat them > + * as succeess for IOMMUFD just like legacy backend. > + */ > + if (ret && errno == ENOENT) { > + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret); > + ret = 0; > + } else { > + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); > + } > + > + if (ret) { > + ret = -errno; > + error_report("IOMMU_IOAS_UNMAP failed: %m"); > + } > + return ret; > +} > + > +static const TypeInfo iommufd_backend_info = { > + .name = TYPE_IOMMUFD_BACKEND, > + .parent = TYPE_OBJECT, > + .instance_size = sizeof(IOMMUFDBackend), > + .instance_init = iommufd_backend_init, > + .instance_finalize = iommufd_backend_finalize, > + .class_size = sizeof(IOMMUFDBackendClass), > + .class_init = iommufd_backend_class_init, > + .interfaces = (InterfaceInfo[]) { > + { TYPE_USER_CREATABLE }, > + { } > + } > +}; > + > +static void register_types(void) > +{ > + type_register_static(&iommufd_backend_info); > +} > + > +type_init(register_types); > diff --git a/backends/Kconfig b/backends/Kconfig > index f35abc1609..2cb23f62fa 100644 > --- a/backends/Kconfig > +++ b/backends/Kconfig > @@ -1 +1,5 @@ > source tpm/Kconfig > + > +config IOMMUFD > + bool > + depends on VFIO I don't know the state of vDPA/iommufd integration but this extra might be added in short term. > diff --git a/backends/meson.build b/backends/meson.build > index 914c7c4afb..9a5cea480d 100644 > --- a/backends/meson.build > +++ b/backends/meson.build > @@ -20,6 +20,7 @@ if have_vhost_user > system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) > endif > system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) > +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) > if have_vhost_user_crypto > system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) > endif > diff --git a/backends/trace-events b/backends/trace-events > index 652eb76a57..d45c6e31a6 100644 > --- a/backends/trace-events > +++ b/backends/trace-events > @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void) > dbus_vmstate_post_load(int version_id) "version_id: %d" > dbus_vmstate_loading(const char *id) "id: %s" > dbus_vmstate_saving(const char *id) "id: %s" > + > +# iommufd.c > +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" > +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" > +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" > +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" > +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" > +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" > +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" > +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" > diff --git a/qemu-options.hx b/qemu-options.hx > index 42fd09e4de..70507c0ee6 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -5224,6 +5224,18 @@ SRST > > The ``share`` boolean option is on by default with memfd. > > + ``-object iommufd,id=id[,fd=fd]`` > + Creates an iommufd backend which allows control of DMA mapping > + through the /dev/iommu device. > + > + The ``id`` parameter is a unique ID which frontends (such as > + vfio-pci of vdpa) will use to connect with the iommufd backend. > + > + The ``fd`` parameter is an optional pre-opened file descriptor > + resulting from /dev/iommu opening. Usually the iommufd is shared > + across all subsystems, bringing the benefit of centralized > + reference counting. > + > ``-object rng-builtin,id=id`` > Creates a random number generator backend which obtains entropy > from QEMU builtin functions. The ``id`` parameter is a unique ID
Hi Eric, >-----Original Message----- >From: Eric Auger <eric.auger@redhat.com> >Sent: Wednesday, November 15, 2023 8:53 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >Hi Zhenzhong, > >On 11/14/23 11:09, Zhenzhong Duan wrote: >> From: Eric Auger <eric.auger@redhat.com> >> >> Introduce an iommufd object which allows the interaction >> with the host /dev/iommu device. >> >> The /dev/iommu can have been already pre-opened outside of qemu, >> in which case the fd can be passed directly along with the >> iommufd object: >> >> This allows the iommufd object to be shared accross several >> subsystems (VFIO, VDPA, ...). For example, libvirt would open >> the /dev/iommu once. >> >> If no fd is passed along with the iommufd object, the /dev/iommu >> is opened by the qemu code. >> >> Suggested-by: Alex Williamson <alex.williamson@redhat.com> >> Signed-off-by: Eric Auger <eric.auger@redhat.com> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> --- >> v6: remove redundant call, alloc_hwpt, get/put_ioas >> >> MAINTAINERS | 7 ++ >> qapi/qom.json | 19 ++++ >> include/sysemu/iommufd.h | 44 ++++++++ >> backends/iommufd.c | 228 +++++++++++++++++++++++++++++++++++++++ >> backends/Kconfig | 4 + >> backends/meson.build | 1 + >> backends/trace-events | 10 ++ >> qemu-options.hx | 12 +++ >> 8 files changed, 325 insertions(+) >> create mode 100644 include/sysemu/iommufd.h >> create mode 100644 backends/iommufd.c >> >> diff --git a/MAINTAINERS b/MAINTAINERS >> index ff1238bb98..a4891f7bda 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c >> F: docs/system/s390x/vfio-ap.rst >> L: qemu-s390x@nongnu.org >> >> +iommufd >> +M: Yi Liu <yi.l.liu@intel.com> >> +M: Eric Auger <eric.auger@redhat.com> >Zhenzhong, don't you want to be added here? >> +S: Supported >> +F: backends/iommufd.c >> +F: include/sysemu/iommufd.h >> + >> vhost >> M: Michael S. Tsirkin <mst@redhat.com> >> S: Supported >> diff --git a/qapi/qom.json b/qapi/qom.json >> index c53ef978ff..1fd8555a75 100644 >> --- a/qapi/qom.json >> +++ b/qapi/qom.json >> @@ -794,6 +794,23 @@ >> { 'struct': 'VfioUserServerProperties', >> 'data': { 'socket': 'SocketAddress', 'device': 'str' } } >> >> +## >> +# @IOMMUFDProperties: >> +# >> +# Properties for iommufd objects. >> +# >> +# @fd: file descriptor name previously passed via 'getfd' command, > >"previously passed via 'getfd' command", I wonder if this applies here or whether >it is copy/paste of >RemoteObjectProperties.fd doc? Maybe copied😊 I thinks this applies here because I use monitor_fd_param to get fd. Or I miss anything? > >> +# which represents a pre-opened /dev/iommu. This allows the >> +# iommufd object to be shared accross several subsystems >> +# (VFIO, VDPA, ...), and the file descriptor to be shared >> +# with other process, e.g. DPDK. (default: QEMU opens >> +# /dev/iommu by itself) >> +# >> +# Since: 8.2 >> +## >> +{ 'struct': 'IOMMUFDProperties', >> + 'data': { '*fd': 'str' } } >> + >> ## >> # @RngProperties: >> # >> @@ -934,6 +951,7 @@ >> 'input-barrier', >> { 'name': 'input-linux', >> 'if': 'CONFIG_LINUX' }, >> + 'iommufd', >> 'iothread', >> 'main-loop', >> { 'name': 'memory-backend-epc', >> @@ -1003,6 +1021,7 @@ >> 'input-barrier': 'InputBarrierProperties', >> 'input-linux': { 'type': 'InputLinuxProperties', >> 'if': 'CONFIG_LINUX' }, >> + 'iommufd': 'IOMMUFDProperties', >> 'iothread': 'IothreadProperties', >> 'main-loop': 'MainLoopProperties', >> 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', >> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h >> new file mode 100644 >> index 0000000000..9b3a86f57d >> --- /dev/null >> +++ b/include/sysemu/iommufd.h >> @@ -0,0 +1,44 @@ >> +#ifndef SYSEMU_IOMMUFD_H >> +#define SYSEMU_IOMMUFD_H >> + >> +#include "qom/object.h" >> +#include "qemu/thread.h" >> +#include "exec/hwaddr.h" >> +#include "exec/cpu-common.h" >> + >> +#define TYPE_IOMMUFD_BACKEND "iommufd" >> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, >> + IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND(obj) \ >> + OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \ >> + OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), >TYPE_IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND_CLASS(klass) \ >> + OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), >TYPE_IOMMUFD_BACKEND) >> +struct IOMMUFDBackendClass { >> + ObjectClass parent_class; >> +}; >> + >> +struct IOMMUFDBackend { >> + Object parent; >> + >> + /*< protected >*/ >> + int fd; /* /dev/iommu file descriptor */ >> + bool owned; /* is the /dev/iommu opened internally */ >> + QemuMutex lock; >> + uint32_t users; >> + >> + /*< public >*/ >> +}; >> + >> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); >> +void iommufd_backend_disconnect(IOMMUFDBackend *be); >> + >> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >> + Error **errp); >> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly); >> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> + hwaddr iova, ram_addr_t size); >> +#endif >> diff --git a/backends/iommufd.c b/backends/iommufd.c >> new file mode 100644 >> index 0000000000..ea3e2a8f85 >> --- /dev/null >> +++ b/backends/iommufd.c >> @@ -0,0 +1,228 @@ >> +/* >> + * iommufd container backend >> + * >> + * Copyright (C) 2023 Intel Corporation. >> + * Copyright Red Hat, Inc. 2023 >> + * >> + * Authors: Yi Liu <yi.l.liu@intel.com> >> + * Eric Auger <eric.auger@redhat.com> >> + * >> + * SPDX-License-Identifier: GPL-2.0-or-later >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "sysemu/iommufd.h" >> +#include "qapi/error.h" >> +#include "qapi/qmp/qerror.h" >> +#include "qemu/module.h" >> +#include "qom/object_interfaces.h" >> +#include "qemu/error-report.h" >> +#include "monitor/monitor.h" >> +#include "trace.h" >> +#include <sys/ioctl.h> >> +#include <linux/iommufd.h> >> + >> +static void iommufd_backend_init(Object *obj) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + >> + be->fd = -1; >> + be->users = 0; >> + be->owned = true; >> + qemu_mutex_init(&be->lock); >> +} >> + >> +static void iommufd_backend_finalize(Object *obj) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + >> + if (be->owned) { >> + close(be->fd); >> + be->fd = -1; >> + } >> +} >> + >> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + int fd = -1; >> + >> + fd = monitor_fd_param(monitor_cur(), str, errp); >> + if (fd == -1) { >> + error_prepend(errp, "Could not parse remote object fd %s:", str); >> + return; >> + } >> + qemu_mutex_lock(&be->lock); >> + be->fd = fd; >> + be->owned = false; >> + qemu_mutex_unlock(&be->lock); >> + trace_iommu_backend_set_fd(be->fd); >> +} >> + >> +static void iommufd_backend_class_init(ObjectClass *oc, void *data) >> +{ >> + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); >> +} >> + >> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) >> +{ >> + int fd, ret = 0; >> + >> + qemu_mutex_lock(&be->lock); >> + if (be->users == UINT32_MAX) { >> + error_setg(errp, "too many connections"); >> + ret = -E2BIG; >> + goto out; >> + } >> + if (be->owned && !be->users) { >> + fd = qemu_open_old("/dev/iommu", O_RDWR); >> + if (fd < 0) { >> + error_setg_errno(errp, errno, "/dev/iommu opening failed"); >> + ret = fd; >> + goto out; >> + } >> + be->fd = fd; >> + } >> + be->users++; >> +out: >> + trace_iommufd_backend_connect(be->fd, be->owned, >> + be->users, ret); >> + qemu_mutex_unlock(&be->lock); >> + return ret; >> +} >> + >> +void iommufd_backend_disconnect(IOMMUFDBackend *be) >> +{ >> + qemu_mutex_lock(&be->lock); >> + if (!be->users) { >> + goto out; >> + } >> + be->users--; >> + if (!be->users && be->owned) { >> + close(be->fd); >> + be->fd = -1; >> + } >> +out: >> + trace_iommufd_backend_disconnect(be->fd, be->users); >> + qemu_mutex_unlock(&be->lock); >> +} >> + >> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >> + Error **errp) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_alloc alloc_data = { >> + .size = sizeof(alloc_data), >> + .flags = 0, >> + }; >> + >> + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); >> + if (ret) { >> + error_setg_errno(errp, errno, "Failed to allocate ioas"); >> + return ret; >> + } >> + >> + *ioas_id = alloc_data.out_ioas_id; >> + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); >> + >> + return ret; >> +} >> + >> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_destroy des = { >> + .size = sizeof(des), >> + .id = id, >> + }; >> + >> + ret = ioctl(fd, IOMMU_DESTROY, &des); >> + trace_iommufd_backend_free_id(fd, id, ret); >> + if (ret) { >> + error_report("Failed to free id: %u %m", id); >> + } >> +} >> + >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_map map = { >> + .size = sizeof(map), >> + .flags = IOMMU_IOAS_MAP_READABLE | >> + IOMMU_IOAS_MAP_FIXED_IOVA, >> + .ioas_id = ioas_id, >> + .__reserved = 0, >> + .user_va = (uintptr_t)vaddr, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + if (!readonly) { >> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >> + } >> + >> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >> + vaddr, readonly, ret); >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_MAP failed: %m"); >> + } >> + return ret; >> +} >> + >> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> + hwaddr iova, ram_addr_t size) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_unmap unmap = { >> + .size = sizeof(unmap), >> + .ioas_id = ioas_id, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); >> + /* >> + * IOMMUFD takes mapping as some kind of object, unmapping >> + * nonexistent mapping is treated as deleting a nonexistent >> + * object and return ENOENT. This is different from legacy >> + * backend which allows it. vIOMMU may trigger a lot of >> + * redundant unmapping, to avoid flush the log, treat them >> + * as succeess for IOMMUFD just like legacy backend. >> + */ >> + if (ret && errno == ENOENT) { >> + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, >ret); >> + ret = 0; >> + } else { >> + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); >> + } >> + >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_UNMAP failed: %m"); >> + } >> + return ret; >> +} >> + >> +static const TypeInfo iommufd_backend_info = { >> + .name = TYPE_IOMMUFD_BACKEND, >> + .parent = TYPE_OBJECT, >> + .instance_size = sizeof(IOMMUFDBackend), >> + .instance_init = iommufd_backend_init, >> + .instance_finalize = iommufd_backend_finalize, >> + .class_size = sizeof(IOMMUFDBackendClass), >> + .class_init = iommufd_backend_class_init, >> + .interfaces = (InterfaceInfo[]) { >> + { TYPE_USER_CREATABLE }, >> + { } >> + } >> +}; >> + >> +static void register_types(void) >> +{ >> + type_register_static(&iommufd_backend_info); >> +} >> + >> +type_init(register_types); >> diff --git a/backends/Kconfig b/backends/Kconfig >> index f35abc1609..2cb23f62fa 100644 >> --- a/backends/Kconfig >> +++ b/backends/Kconfig >> @@ -1 +1,5 @@ >> source tpm/Kconfig >> + >> +config IOMMUFD >> + bool >> + depends on VFIO >I don't know the state of vDPA/iommufd integration but this extra might >be added in short term. Thanks for reminder. But I think it make more sense that series relax the check itself? E.g. depends on VFIO || VHOST_VDPA Thanks Zhenzhong
Hi Zhenzhong, On 11/16/23 05:04, Duan, Zhenzhong wrote: > Hi Eric, > >> -----Original Message----- >> From: Eric Auger <eric.auger@redhat.com> >> Sent: Wednesday, November 15, 2023 8:53 PM >> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object >> >> Hi Zhenzhong, >> >> On 11/14/23 11:09, Zhenzhong Duan wrote: >>> From: Eric Auger <eric.auger@redhat.com> >>> >>> Introduce an iommufd object which allows the interaction >>> with the host /dev/iommu device. >>> >>> The /dev/iommu can have been already pre-opened outside of qemu, >>> in which case the fd can be passed directly along with the >>> iommufd object: >>> >>> This allows the iommufd object to be shared accross several >>> subsystems (VFIO, VDPA, ...). For example, libvirt would open >>> the /dev/iommu once. >>> >>> If no fd is passed along with the iommufd object, the /dev/iommu >>> is opened by the qemu code. >>> >>> Suggested-by: Alex Williamson <alex.williamson@redhat.com> >>> Signed-off-by: Eric Auger <eric.auger@redhat.com> >>> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>> --- >>> v6: remove redundant call, alloc_hwpt, get/put_ioas >>> >>> MAINTAINERS | 7 ++ >>> qapi/qom.json | 19 ++++ >>> include/sysemu/iommufd.h | 44 ++++++++ >>> backends/iommufd.c | 228 +++++++++++++++++++++++++++++++++++++++ >>> backends/Kconfig | 4 + >>> backends/meson.build | 1 + >>> backends/trace-events | 10 ++ >>> qemu-options.hx | 12 +++ >>> 8 files changed, 325 insertions(+) >>> create mode 100644 include/sysemu/iommufd.h >>> create mode 100644 backends/iommufd.c >>> >>> diff --git a/MAINTAINERS b/MAINTAINERS >>> index ff1238bb98..a4891f7bda 100644 >>> --- a/MAINTAINERS >>> +++ b/MAINTAINERS >>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c >>> F: docs/system/s390x/vfio-ap.rst >>> L: qemu-s390x@nongnu.org >>> >>> +iommufd >>> +M: Yi Liu <yi.l.liu@intel.com> >>> +M: Eric Auger <eric.auger@redhat.com> >> Zhenzhong, don't you want to be added here? >>> +S: Supported >>> +F: backends/iommufd.c >>> +F: include/sysemu/iommufd.h >>> + >>> vhost >>> M: Michael S. Tsirkin <mst@redhat.com> >>> S: Supported >>> diff --git a/qapi/qom.json b/qapi/qom.json >>> index c53ef978ff..1fd8555a75 100644 >>> --- a/qapi/qom.json >>> +++ b/qapi/qom.json >>> @@ -794,6 +794,23 @@ >>> { 'struct': 'VfioUserServerProperties', >>> 'data': { 'socket': 'SocketAddress', 'device': 'str' } } >>> >>> +## >>> +# @IOMMUFDProperties: >>> +# >>> +# Properties for iommufd objects. >>> +# >>> +# @fd: file descriptor name previously passed via 'getfd' command, >> "previously passed via 'getfd' command", I wonder if this applies here or whether >> it is copy/paste of >> RemoteObjectProperties.fd doc? > Maybe copied😊 I thinks this applies here because I use monitor_fd_param to get fd. > Or I miss anything? This is a bit cryptic to me and I don't really understand what it means. It does not mean it is not correct but I am curious about explanations if anybody has some ;-) > >>> +# which represents a pre-opened /dev/iommu. This allows the >>> +# iommufd object to be shared accross several subsystems >>> +# (VFIO, VDPA, ...), and the file descriptor to be shared >>> +# with other process, e.g. DPDK. (default: QEMU opens >>> +# /dev/iommu by itself) >>> +# >>> +# Since: 8.2 >>> +## >>> +{ 'struct': 'IOMMUFDProperties', >>> + 'data': { '*fd': 'str' } } >>> + >>> ## >>> # @RngProperties: >>> # >>> @@ -934,6 +951,7 @@ >>> 'input-barrier', >>> { 'name': 'input-linux', >>> 'if': 'CONFIG_LINUX' }, >>> + 'iommufd', >>> 'iothread', >>> 'main-loop', >>> { 'name': 'memory-backend-epc', >>> @@ -1003,6 +1021,7 @@ >>> 'input-barrier': 'InputBarrierProperties', >>> 'input-linux': { 'type': 'InputLinuxProperties', >>> 'if': 'CONFIG_LINUX' }, >>> + 'iommufd': 'IOMMUFDProperties', >>> 'iothread': 'IothreadProperties', >>> 'main-loop': 'MainLoopProperties', >>> 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', >>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h >>> new file mode 100644 >>> index 0000000000..9b3a86f57d >>> --- /dev/null >>> +++ b/include/sysemu/iommufd.h >>> @@ -0,0 +1,44 @@ >>> +#ifndef SYSEMU_IOMMUFD_H >>> +#define SYSEMU_IOMMUFD_H >>> + >>> +#include "qom/object.h" >>> +#include "qemu/thread.h" >>> +#include "exec/hwaddr.h" >>> +#include "exec/cpu-common.h" >>> + >>> +#define TYPE_IOMMUFD_BACKEND "iommufd" >>> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, >>> + IOMMUFD_BACKEND) >>> +#define IOMMUFD_BACKEND(obj) \ >>> + OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND) >>> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \ >>> + OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), >> TYPE_IOMMUFD_BACKEND) >>> +#define IOMMUFD_BACKEND_CLASS(klass) \ >>> + OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), >> TYPE_IOMMUFD_BACKEND) >>> +struct IOMMUFDBackendClass { >>> + ObjectClass parent_class; >>> +}; >>> + >>> +struct IOMMUFDBackend { >>> + Object parent; >>> + >>> + /*< protected >*/ >>> + int fd; /* /dev/iommu file descriptor */ >>> + bool owned; /* is the /dev/iommu opened internally */ >>> + QemuMutex lock; >>> + uint32_t users; >>> + >>> + /*< public >*/ >>> +}; >>> + >>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); >>> +void iommufd_backend_disconnect(IOMMUFDBackend *be); >>> + >>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >>> + Error **errp); >>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); >>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> hwaddr iova, >>> + ram_addr_t size, void *vaddr, bool readonly); >>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >>> + hwaddr iova, ram_addr_t size); >>> +#endif >>> diff --git a/backends/iommufd.c b/backends/iommufd.c >>> new file mode 100644 >>> index 0000000000..ea3e2a8f85 >>> --- /dev/null >>> +++ b/backends/iommufd.c >>> @@ -0,0 +1,228 @@ >>> +/* >>> + * iommufd container backend >>> + * >>> + * Copyright (C) 2023 Intel Corporation. >>> + * Copyright Red Hat, Inc. 2023 >>> + * >>> + * Authors: Yi Liu <yi.l.liu@intel.com> >>> + * Eric Auger <eric.auger@redhat.com> >>> + * >>> + * SPDX-License-Identifier: GPL-2.0-or-later >>> + */ >>> + >>> +#include "qemu/osdep.h" >>> +#include "sysemu/iommufd.h" >>> +#include "qapi/error.h" >>> +#include "qapi/qmp/qerror.h" >>> +#include "qemu/module.h" >>> +#include "qom/object_interfaces.h" >>> +#include "qemu/error-report.h" >>> +#include "monitor/monitor.h" >>> +#include "trace.h" >>> +#include <sys/ioctl.h> >>> +#include <linux/iommufd.h> >>> + >>> +static void iommufd_backend_init(Object *obj) >>> +{ >>> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >>> + >>> + be->fd = -1; >>> + be->users = 0; >>> + be->owned = true; >>> + qemu_mutex_init(&be->lock); >>> +} >>> + >>> +static void iommufd_backend_finalize(Object *obj) >>> +{ >>> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >>> + >>> + if (be->owned) { >>> + close(be->fd); >>> + be->fd = -1; >>> + } >>> +} >>> + >>> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) >>> +{ >>> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >>> + int fd = -1; >>> + >>> + fd = monitor_fd_param(monitor_cur(), str, errp); >>> + if (fd == -1) { >>> + error_prepend(errp, "Could not parse remote object fd %s:", str); >>> + return; >>> + } >>> + qemu_mutex_lock(&be->lock); >>> + be->fd = fd; >>> + be->owned = false; >>> + qemu_mutex_unlock(&be->lock); >>> + trace_iommu_backend_set_fd(be->fd); >>> +} >>> + >>> +static void iommufd_backend_class_init(ObjectClass *oc, void *data) >>> +{ >>> + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); >>> +} >>> + >>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) >>> +{ >>> + int fd, ret = 0; >>> + >>> + qemu_mutex_lock(&be->lock); >>> + if (be->users == UINT32_MAX) { >>> + error_setg(errp, "too many connections"); >>> + ret = -E2BIG; >>> + goto out; >>> + } >>> + if (be->owned && !be->users) { >>> + fd = qemu_open_old("/dev/iommu", O_RDWR); >>> + if (fd < 0) { >>> + error_setg_errno(errp, errno, "/dev/iommu opening failed"); >>> + ret = fd; >>> + goto out; >>> + } >>> + be->fd = fd; >>> + } >>> + be->users++; >>> +out: >>> + trace_iommufd_backend_connect(be->fd, be->owned, >>> + be->users, ret); >>> + qemu_mutex_unlock(&be->lock); >>> + return ret; >>> +} >>> + >>> +void iommufd_backend_disconnect(IOMMUFDBackend *be) >>> +{ >>> + qemu_mutex_lock(&be->lock); >>> + if (!be->users) { >>> + goto out; >>> + } >>> + be->users--; >>> + if (!be->users && be->owned) { >>> + close(be->fd); >>> + be->fd = -1; >>> + } >>> +out: >>> + trace_iommufd_backend_disconnect(be->fd, be->users); >>> + qemu_mutex_unlock(&be->lock); >>> +} >>> + >>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >>> + Error **errp) >>> +{ >>> + int ret, fd = be->fd; >>> + struct iommu_ioas_alloc alloc_data = { >>> + .size = sizeof(alloc_data), >>> + .flags = 0, >>> + }; >>> + >>> + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); >>> + if (ret) { >>> + error_setg_errno(errp, errno, "Failed to allocate ioas"); >>> + return ret; >>> + } >>> + >>> + *ioas_id = alloc_data.out_ioas_id; >>> + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); >>> + >>> + return ret; >>> +} >>> + >>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) >>> +{ >>> + int ret, fd = be->fd; >>> + struct iommu_destroy des = { >>> + .size = sizeof(des), >>> + .id = id, >>> + }; >>> + >>> + ret = ioctl(fd, IOMMU_DESTROY, &des); >>> + trace_iommufd_backend_free_id(fd, id, ret); >>> + if (ret) { >>> + error_report("Failed to free id: %u %m", id); >>> + } >>> +} >>> + >>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> hwaddr iova, >>> + ram_addr_t size, void *vaddr, bool readonly) >>> +{ >>> + int ret, fd = be->fd; >>> + struct iommu_ioas_map map = { >>> + .size = sizeof(map), >>> + .flags = IOMMU_IOAS_MAP_READABLE | >>> + IOMMU_IOAS_MAP_FIXED_IOVA, >>> + .ioas_id = ioas_id, >>> + .__reserved = 0, >>> + .user_va = (uintptr_t)vaddr, >>> + .iova = iova, >>> + .length = size, >>> + }; >>> + >>> + if (!readonly) { >>> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >>> + } >>> + >>> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >>> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >>> + vaddr, readonly, ret); >>> + if (ret) { >>> + ret = -errno; >>> + error_report("IOMMU_IOAS_MAP failed: %m"); >>> + } >>> + return ret; >>> +} >>> + >>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >>> + hwaddr iova, ram_addr_t size) >>> +{ >>> + int ret, fd = be->fd; >>> + struct iommu_ioas_unmap unmap = { >>> + .size = sizeof(unmap), >>> + .ioas_id = ioas_id, >>> + .iova = iova, >>> + .length = size, >>> + }; >>> + >>> + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); >>> + /* >>> + * IOMMUFD takes mapping as some kind of object, unmapping >>> + * nonexistent mapping is treated as deleting a nonexistent >>> + * object and return ENOENT. This is different from legacy >>> + * backend which allows it. vIOMMU may trigger a lot of >>> + * redundant unmapping, to avoid flush the log, treat them >>> + * as succeess for IOMMUFD just like legacy backend. >>> + */ >>> + if (ret && errno == ENOENT) { >>> + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, >> ret); >>> + ret = 0; >>> + } else { >>> + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); >>> + } >>> + >>> + if (ret) { >>> + ret = -errno; >>> + error_report("IOMMU_IOAS_UNMAP failed: %m"); >>> + } >>> + return ret; >>> +} >>> + >>> +static const TypeInfo iommufd_backend_info = { >>> + .name = TYPE_IOMMUFD_BACKEND, >>> + .parent = TYPE_OBJECT, >>> + .instance_size = sizeof(IOMMUFDBackend), >>> + .instance_init = iommufd_backend_init, >>> + .instance_finalize = iommufd_backend_finalize, >>> + .class_size = sizeof(IOMMUFDBackendClass), >>> + .class_init = iommufd_backend_class_init, >>> + .interfaces = (InterfaceInfo[]) { >>> + { TYPE_USER_CREATABLE }, >>> + { } >>> + } >>> +}; >>> + >>> +static void register_types(void) >>> +{ >>> + type_register_static(&iommufd_backend_info); >>> +} >>> + >>> +type_init(register_types); >>> diff --git a/backends/Kconfig b/backends/Kconfig >>> index f35abc1609..2cb23f62fa 100644 >>> --- a/backends/Kconfig >>> +++ b/backends/Kconfig >>> @@ -1 +1,5 @@ >>> source tpm/Kconfig >>> + >>> +config IOMMUFD >>> + bool >>> + depends on VFIO >> I don't know the state of vDPA/iommufd integration but this extra might >> be added in short term. > Thanks for reminder. But I think it make more sense that series relax the check > itself? > E.g. depends on VFIO || VHOST_VDPA yeah we can leave it as it is for now Eric > > Thanks > Zhenzhong
>-----Original Message----- >From: Eric Auger <eric.auger@redhat.com> >Sent: Thursday, November 16, 2023 4:33 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >Hi Zhenzhong, > >On 11/16/23 05:04, Duan, Zhenzhong wrote: >> Hi Eric, >> >>> -----Original Message----- >>> From: Eric Auger <eric.auger@redhat.com> >>> Sent: Wednesday, November 15, 2023 8:53 PM >>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd >object >>> >>> Hi Zhenzhong, >>> >>> On 11/14/23 11:09, Zhenzhong Duan wrote: >>>> From: Eric Auger <eric.auger@redhat.com> >>>> >>>> Introduce an iommufd object which allows the interaction >>>> with the host /dev/iommu device. >>>> >>>> The /dev/iommu can have been already pre-opened outside of qemu, >>>> in which case the fd can be passed directly along with the >>>> iommufd object: >>>> >>>> This allows the iommufd object to be shared accross several >>>> subsystems (VFIO, VDPA, ...). For example, libvirt would open >>>> the /dev/iommu once. >>>> >>>> If no fd is passed along with the iommufd object, the /dev/iommu >>>> is opened by the qemu code. >>>> >>>> Suggested-by: Alex Williamson <alex.williamson@redhat.com> >>>> Signed-off-by: Eric Auger <eric.auger@redhat.com> >>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>>> --- >>>> v6: remove redundant call, alloc_hwpt, get/put_ioas >>>> >>>> MAINTAINERS | 7 ++ >>>> qapi/qom.json | 19 ++++ >>>> include/sysemu/iommufd.h | 44 ++++++++ >>>> backends/iommufd.c | 228 >+++++++++++++++++++++++++++++++++++++++ >>>> backends/Kconfig | 4 + >>>> backends/meson.build | 1 + >>>> backends/trace-events | 10 ++ >>>> qemu-options.hx | 12 +++ >>>> 8 files changed, 325 insertions(+) >>>> create mode 100644 include/sysemu/iommufd.h >>>> create mode 100644 backends/iommufd.c >>>> >>>> diff --git a/MAINTAINERS b/MAINTAINERS >>>> index ff1238bb98..a4891f7bda 100644 >>>> --- a/MAINTAINERS >>>> +++ b/MAINTAINERS >>>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c >>>> F: docs/system/s390x/vfio-ap.rst >>>> L: qemu-s390x@nongnu.org >>>> >>>> +iommufd >>>> +M: Yi Liu <yi.l.liu@intel.com> >>>> +M: Eric Auger <eric.auger@redhat.com> >>> Zhenzhong, don't you want to be added here? Sorry, missed this comment. My pleasure, I'll add myself in v7. >>>> +S: Supported >>>> +F: backends/iommufd.c >>>> +F: include/sysemu/iommufd.h >>>> + >>>> vhost >>>> M: Michael S. Tsirkin <mst@redhat.com> >>>> S: Supported >>>> diff --git a/qapi/qom.json b/qapi/qom.json >>>> index c53ef978ff..1fd8555a75 100644 >>>> --- a/qapi/qom.json >>>> +++ b/qapi/qom.json >>>> @@ -794,6 +794,23 @@ >>>> { 'struct': 'VfioUserServerProperties', >>>> 'data': { 'socket': 'SocketAddress', 'device': 'str' } } >>>> >>>> +## >>>> +# @IOMMUFDProperties: >>>> +# >>>> +# Properties for iommufd objects. >>>> +# >>>> +# @fd: file descriptor name previously passed via 'getfd' command, >>> "previously passed via 'getfd' command", I wonder if this applies here or >whether >>> it is copy/paste of >>> RemoteObjectProperties.fd doc? >> Maybe copied😊 I thinks this applies here because I use monitor_fd_param to >get fd. >> Or I miss anything? >This is a bit cryptic to me and I don't really understand what it means. >It does not mean it is not correct but I am curious about explanations >if anybody has some ;-) I have a weak understanding on this, may have errors😊 QMP support a command named 'getfd' to send a pre-opened fd with a name, This fd is then stored in a fd list. Then we can use that name to reference the fd In ths list. Thanks Zhenzhong
On 11/14/23 11:09, Zhenzhong Duan wrote: > From: Eric Auger <eric.auger@redhat.com> > > Introduce an iommufd object which allows the interaction > with the host /dev/iommu device. > > The /dev/iommu can have been already pre-opened outside of qemu, > in which case the fd can be passed directly along with the > iommufd object: > > This allows the iommufd object to be shared accross several > subsystems (VFIO, VDPA, ...). For example, libvirt would open > the /dev/iommu once. > > If no fd is passed along with the iommufd object, the /dev/iommu > is opened by the qemu code. > > Suggested-by: Alex Williamson <alex.williamson@redhat.com> > Signed-off-by: Eric Auger <eric.auger@redhat.com> > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> I simplified the object declaration in include/sysemu/iommufd.h and formatted /dev/iommu in qemu-options.hx. No need to resend. Reviewed-by: Cédric Le Goater <clg@redhat.com> Thanks, C. > --- > v6: remove redundant call, alloc_hwpt, get/put_ioas > > MAINTAINERS | 7 ++ > qapi/qom.json | 19 ++++ > include/sysemu/iommufd.h | 44 ++++++++ > backends/iommufd.c | 228 +++++++++++++++++++++++++++++++++++++++ > backends/Kconfig | 4 + > backends/meson.build | 1 + > backends/trace-events | 10 ++ > qemu-options.hx | 12 +++ > 8 files changed, 325 insertions(+) > create mode 100644 include/sysemu/iommufd.h > create mode 100644 backends/iommufd.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index ff1238bb98..a4891f7bda 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c > F: docs/system/s390x/vfio-ap.rst > L: qemu-s390x@nongnu.org > > +iommufd > +M: Yi Liu <yi.l.liu@intel.com> > +M: Eric Auger <eric.auger@redhat.com> > +S: Supported > +F: backends/iommufd.c > +F: include/sysemu/iommufd.h > + > vhost > M: Michael S. Tsirkin <mst@redhat.com> > S: Supported > diff --git a/qapi/qom.json b/qapi/qom.json > index c53ef978ff..1fd8555a75 100644 > --- a/qapi/qom.json > +++ b/qapi/qom.json > @@ -794,6 +794,23 @@ > { 'struct': 'VfioUserServerProperties', > 'data': { 'socket': 'SocketAddress', 'device': 'str' } } > > +## > +# @IOMMUFDProperties: > +# > +# Properties for iommufd objects. > +# > +# @fd: file descriptor name previously passed via 'getfd' command, > +# which represents a pre-opened /dev/iommu. This allows the > +# iommufd object to be shared accross several subsystems > +# (VFIO, VDPA, ...), and the file descriptor to be shared > +# with other process, e.g. DPDK. (default: QEMU opens > +# /dev/iommu by itself) > +# > +# Since: 8.2 > +## > +{ 'struct': 'IOMMUFDProperties', > + 'data': { '*fd': 'str' } } > + > ## > # @RngProperties: > # > @@ -934,6 +951,7 @@ > 'input-barrier', > { 'name': 'input-linux', > 'if': 'CONFIG_LINUX' }, > + 'iommufd', > 'iothread', > 'main-loop', > { 'name': 'memory-backend-epc', > @@ -1003,6 +1021,7 @@ > 'input-barrier': 'InputBarrierProperties', > 'input-linux': { 'type': 'InputLinuxProperties', > 'if': 'CONFIG_LINUX' }, > + 'iommufd': 'IOMMUFDProperties', > 'iothread': 'IothreadProperties', > 'main-loop': 'MainLoopProperties', > 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', > diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h > new file mode 100644 > index 0000000000..9b3a86f57d > --- /dev/null > +++ b/include/sysemu/iommufd.h > @@ -0,0 +1,44 @@ > +#ifndef SYSEMU_IOMMUFD_H > +#define SYSEMU_IOMMUFD_H > + > +#include "qom/object.h" > +#include "qemu/thread.h" > +#include "exec/hwaddr.h" > +#include "exec/cpu-common.h" > + > +#define TYPE_IOMMUFD_BACKEND "iommufd" > +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, > + IOMMUFD_BACKEND) > +#define IOMMUFD_BACKEND(obj) \ > + OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND) > +#define IOMMUFD_BACKEND_GET_CLASS(obj) \ > + OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), TYPE_IOMMUFD_BACKEND) > +#define IOMMUFD_BACKEND_CLASS(klass) \ > + OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), TYPE_IOMMUFD_BACKEND) > +struct IOMMUFDBackendClass { > + ObjectClass parent_class; > +}; > + > +struct IOMMUFDBackend { > + Object parent; > + > + /*< protected >*/ > + int fd; /* /dev/iommu file descriptor */ > + bool owned; /* is the /dev/iommu opened internally */ > + QemuMutex lock; > + uint32_t users; > + > + /*< public >*/ > +}; > + > +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); > +void iommufd_backend_disconnect(IOMMUFDBackend *be); > + > +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, > + Error **errp); > +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); > +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, > + ram_addr_t size, void *vaddr, bool readonly); > +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, > + hwaddr iova, ram_addr_t size); > +#endif > diff --git a/backends/iommufd.c b/backends/iommufd.c > new file mode 100644 > index 0000000000..ea3e2a8f85 > --- /dev/null > +++ b/backends/iommufd.c > @@ -0,0 +1,228 @@ > +/* > + * iommufd container backend > + * > + * Copyright (C) 2023 Intel Corporation. > + * Copyright Red Hat, Inc. 2023 > + * > + * Authors: Yi Liu <yi.l.liu@intel.com> > + * Eric Auger <eric.auger@redhat.com> > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#include "qemu/osdep.h" > +#include "sysemu/iommufd.h" > +#include "qapi/error.h" > +#include "qapi/qmp/qerror.h" > +#include "qemu/module.h" > +#include "qom/object_interfaces.h" > +#include "qemu/error-report.h" > +#include "monitor/monitor.h" > +#include "trace.h" > +#include <sys/ioctl.h> > +#include <linux/iommufd.h> > + > +static void iommufd_backend_init(Object *obj) > +{ > + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); > + > + be->fd = -1; > + be->users = 0; > + be->owned = true; > + qemu_mutex_init(&be->lock); > +} > + > +static void iommufd_backend_finalize(Object *obj) > +{ > + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); > + > + if (be->owned) { > + close(be->fd); > + be->fd = -1; > + } > +} > + > +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) > +{ > + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); > + int fd = -1; > + > + fd = monitor_fd_param(monitor_cur(), str, errp); > + if (fd == -1) { > + error_prepend(errp, "Could not parse remote object fd %s:", str); > + return; > + } > + qemu_mutex_lock(&be->lock); > + be->fd = fd; > + be->owned = false; > + qemu_mutex_unlock(&be->lock); > + trace_iommu_backend_set_fd(be->fd); > +} > + > +static void iommufd_backend_class_init(ObjectClass *oc, void *data) > +{ > + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); > +} > + > +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) > +{ > + int fd, ret = 0; > + > + qemu_mutex_lock(&be->lock); > + if (be->users == UINT32_MAX) { > + error_setg(errp, "too many connections"); > + ret = -E2BIG; > + goto out; > + } > + if (be->owned && !be->users) { > + fd = qemu_open_old("/dev/iommu", O_RDWR); > + if (fd < 0) { > + error_setg_errno(errp, errno, "/dev/iommu opening failed"); > + ret = fd; > + goto out; > + } > + be->fd = fd; > + } > + be->users++; > +out: > + trace_iommufd_backend_connect(be->fd, be->owned, > + be->users, ret); > + qemu_mutex_unlock(&be->lock); > + return ret; > +} > + > +void iommufd_backend_disconnect(IOMMUFDBackend *be) > +{ > + qemu_mutex_lock(&be->lock); > + if (!be->users) { > + goto out; > + } > + be->users--; > + if (!be->users && be->owned) { > + close(be->fd); > + be->fd = -1; > + } > +out: > + trace_iommufd_backend_disconnect(be->fd, be->users); > + qemu_mutex_unlock(&be->lock); > +} > + > +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, > + Error **errp) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_alloc alloc_data = { > + .size = sizeof(alloc_data), > + .flags = 0, > + }; > + > + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); > + if (ret) { > + error_setg_errno(errp, errno, "Failed to allocate ioas"); > + return ret; > + } > + > + *ioas_id = alloc_data.out_ioas_id; > + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); > + > + return ret; > +} > + > +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) > +{ > + int ret, fd = be->fd; > + struct iommu_destroy des = { > + .size = sizeof(des), > + .id = id, > + }; > + > + ret = ioctl(fd, IOMMU_DESTROY, &des); > + trace_iommufd_backend_free_id(fd, id, ret); > + if (ret) { > + error_report("Failed to free id: %u %m", id); > + } > +} > + > +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, > + ram_addr_t size, void *vaddr, bool readonly) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_map map = { > + .size = sizeof(map), > + .flags = IOMMU_IOAS_MAP_READABLE | > + IOMMU_IOAS_MAP_FIXED_IOVA, > + .ioas_id = ioas_id, > + .__reserved = 0, > + .user_va = (uintptr_t)vaddr, > + .iova = iova, > + .length = size, > + }; > + > + if (!readonly) { > + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; > + } > + > + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); > + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, > + vaddr, readonly, ret); > + if (ret) { > + ret = -errno; > + error_report("IOMMU_IOAS_MAP failed: %m"); > + } > + return ret; > +} > + > +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, > + hwaddr iova, ram_addr_t size) > +{ > + int ret, fd = be->fd; > + struct iommu_ioas_unmap unmap = { > + .size = sizeof(unmap), > + .ioas_id = ioas_id, > + .iova = iova, > + .length = size, > + }; > + > + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); > + /* > + * IOMMUFD takes mapping as some kind of object, unmapping > + * nonexistent mapping is treated as deleting a nonexistent > + * object and return ENOENT. This is different from legacy > + * backend which allows it. vIOMMU may trigger a lot of > + * redundant unmapping, to avoid flush the log, treat them > + * as succeess for IOMMUFD just like legacy backend. > + */ > + if (ret && errno == ENOENT) { > + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret); > + ret = 0; > + } else { > + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); > + } > + > + if (ret) { > + ret = -errno; > + error_report("IOMMU_IOAS_UNMAP failed: %m"); > + } > + return ret; > +} > + > +static const TypeInfo iommufd_backend_info = { > + .name = TYPE_IOMMUFD_BACKEND, > + .parent = TYPE_OBJECT, > + .instance_size = sizeof(IOMMUFDBackend), > + .instance_init = iommufd_backend_init, > + .instance_finalize = iommufd_backend_finalize, > + .class_size = sizeof(IOMMUFDBackendClass), > + .class_init = iommufd_backend_class_init, > + .interfaces = (InterfaceInfo[]) { > + { TYPE_USER_CREATABLE }, > + { } > + } > +}; > + > +static void register_types(void) > +{ > + type_register_static(&iommufd_backend_info); > +} > + > +type_init(register_types); > diff --git a/backends/Kconfig b/backends/Kconfig > index f35abc1609..2cb23f62fa 100644 > --- a/backends/Kconfig > +++ b/backends/Kconfig > @@ -1 +1,5 @@ > source tpm/Kconfig > + > +config IOMMUFD > + bool > + depends on VFIO > diff --git a/backends/meson.build b/backends/meson.build > index 914c7c4afb..9a5cea480d 100644 > --- a/backends/meson.build > +++ b/backends/meson.build > @@ -20,6 +20,7 @@ if have_vhost_user > system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) > endif > system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) > +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) > if have_vhost_user_crypto > system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) > endif > diff --git a/backends/trace-events b/backends/trace-events > index 652eb76a57..d45c6e31a6 100644 > --- a/backends/trace-events > +++ b/backends/trace-events > @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void) > dbus_vmstate_post_load(int version_id) "version_id: %d" > dbus_vmstate_loading(const char *id) "id: %s" > dbus_vmstate_saving(const char *id) "id: %s" > + > +# iommufd.c > +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" > +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" > +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" > +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" > +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" > +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" > +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" > +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" > diff --git a/qemu-options.hx b/qemu-options.hx > index 42fd09e4de..70507c0ee6 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -5224,6 +5224,18 @@ SRST > > The ``share`` boolean option is on by default with memfd. > > + ``-object iommufd,id=id[,fd=fd]`` > + Creates an iommufd backend which allows control of DMA mapping > + through the /dev/iommu device. > + > + The ``id`` parameter is a unique ID which frontends (such as > + vfio-pci of vdpa) will use to connect with the iommufd backend. > + > + The ``fd`` parameter is an optional pre-opened file descriptor > + resulting from /dev/iommu opening. Usually the iommufd is shared > + across all subsystems, bringing the benefit of centralized > + reference counting. > + > ``-object rng-builtin,id=id`` > Creates a random number generator backend which obtains entropy > from QEMU builtin functions. The ``id`` parameter is a unique ID
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Sent: Tuesday, November 14, 2023 9:29 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >On 11/14/23 11:09, Zhenzhong Duan wrote: >> From: Eric Auger <eric.auger@redhat.com> >> >> Introduce an iommufd object which allows the interaction >> with the host /dev/iommu device. >> >> The /dev/iommu can have been already pre-opened outside of qemu, >> in which case the fd can be passed directly along with the >> iommufd object: >> >> This allows the iommufd object to be shared accross several >> subsystems (VFIO, VDPA, ...). For example, libvirt would open >> the /dev/iommu once. >> >> If no fd is passed along with the iommufd object, the /dev/iommu >> is opened by the qemu code. >> >> Suggested-by: Alex Williamson <alex.williamson@redhat.com> >> Signed-off-by: Eric Auger <eric.auger@redhat.com> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > >I simplified the object declaration in include/sysemu/iommufd.h and >formatted /dev/iommu in qemu-options.hx. No need to resend. Good catch, thanks! Maybe further simplified with below? This is minor. OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) Thanks Zhenzhong > >Reviewed-by: Cédric Le Goater <clg@redhat.com> > >Thanks, > >C. > > >> --- >> v6: remove redundant call, alloc_hwpt, get/put_ioas >> >> MAINTAINERS | 7 ++ >> qapi/qom.json | 19 ++++ >> include/sysemu/iommufd.h | 44 ++++++++ >> backends/iommufd.c | 228 >+++++++++++++++++++++++++++++++++++++++ >> backends/Kconfig | 4 + >> backends/meson.build | 1 + >> backends/trace-events | 10 ++ >> qemu-options.hx | 12 +++ >> 8 files changed, 325 insertions(+) >> create mode 100644 include/sysemu/iommufd.h >> create mode 100644 backends/iommufd.c >> >> diff --git a/MAINTAINERS b/MAINTAINERS >> index ff1238bb98..a4891f7bda 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c >> F: docs/system/s390x/vfio-ap.rst >> L: qemu-s390x@nongnu.org >> >> +iommufd >> +M: Yi Liu <yi.l.liu@intel.com> >> +M: Eric Auger <eric.auger@redhat.com> >> +S: Supported >> +F: backends/iommufd.c >> +F: include/sysemu/iommufd.h >> + >> vhost >> M: Michael S. Tsirkin <mst@redhat.com> >> S: Supported >> diff --git a/qapi/qom.json b/qapi/qom.json >> index c53ef978ff..1fd8555a75 100644 >> --- a/qapi/qom.json >> +++ b/qapi/qom.json >> @@ -794,6 +794,23 @@ >> { 'struct': 'VfioUserServerProperties', >> 'data': { 'socket': 'SocketAddress', 'device': 'str' } } >> >> +## >> +# @IOMMUFDProperties: >> +# >> +# Properties for iommufd objects. >> +# >> +# @fd: file descriptor name previously passed via 'getfd' command, >> +# which represents a pre-opened /dev/iommu. This allows the >> +# iommufd object to be shared accross several subsystems >> +# (VFIO, VDPA, ...), and the file descriptor to be shared >> +# with other process, e.g. DPDK. (default: QEMU opens >> +# /dev/iommu by itself) >> +# >> +# Since: 8.2 >> +## >> +{ 'struct': 'IOMMUFDProperties', >> + 'data': { '*fd': 'str' } } >> + >> ## >> # @RngProperties: >> # >> @@ -934,6 +951,7 @@ >> 'input-barrier', >> { 'name': 'input-linux', >> 'if': 'CONFIG_LINUX' }, >> + 'iommufd', >> 'iothread', >> 'main-loop', >> { 'name': 'memory-backend-epc', >> @@ -1003,6 +1021,7 @@ >> 'input-barrier': 'InputBarrierProperties', >> 'input-linux': { 'type': 'InputLinuxProperties', >> 'if': 'CONFIG_LINUX' }, >> + 'iommufd': 'IOMMUFDProperties', >> 'iothread': 'IothreadProperties', >> 'main-loop': 'MainLoopProperties', >> 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', >> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h >> new file mode 100644 >> index 0000000000..9b3a86f57d >> --- /dev/null >> +++ b/include/sysemu/iommufd.h >> @@ -0,0 +1,44 @@ >> +#ifndef SYSEMU_IOMMUFD_H >> +#define SYSEMU_IOMMUFD_H >> + >> +#include "qom/object.h" >> +#include "qemu/thread.h" >> +#include "exec/hwaddr.h" >> +#include "exec/cpu-common.h" >> + >> +#define TYPE_IOMMUFD_BACKEND "iommufd" >> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, >> + IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND(obj) \ >> + OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \ >> + OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), >TYPE_IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND_CLASS(klass) \ >> + OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), >TYPE_IOMMUFD_BACKEND) >> +struct IOMMUFDBackendClass { >> + ObjectClass parent_class; >> +}; >> + >> +struct IOMMUFDBackend { >> + Object parent; >> + >> + /*< protected >*/ >> + int fd; /* /dev/iommu file descriptor */ >> + bool owned; /* is the /dev/iommu opened internally */ >> + QemuMutex lock; >> + uint32_t users; >> + >> + /*< public >*/ >> +}; >> + >> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); >> +void iommufd_backend_disconnect(IOMMUFDBackend *be); >> + >> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >> + Error **errp); >> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly); >> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> + hwaddr iova, ram_addr_t size); >> +#endif >> diff --git a/backends/iommufd.c b/backends/iommufd.c >> new file mode 100644 >> index 0000000000..ea3e2a8f85 >> --- /dev/null >> +++ b/backends/iommufd.c >> @@ -0,0 +1,228 @@ >> +/* >> + * iommufd container backend >> + * >> + * Copyright (C) 2023 Intel Corporation. >> + * Copyright Red Hat, Inc. 2023 >> + * >> + * Authors: Yi Liu <yi.l.liu@intel.com> >> + * Eric Auger <eric.auger@redhat.com> >> + * >> + * SPDX-License-Identifier: GPL-2.0-or-later >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "sysemu/iommufd.h" >> +#include "qapi/error.h" >> +#include "qapi/qmp/qerror.h" >> +#include "qemu/module.h" >> +#include "qom/object_interfaces.h" >> +#include "qemu/error-report.h" >> +#include "monitor/monitor.h" >> +#include "trace.h" >> +#include <sys/ioctl.h> >> +#include <linux/iommufd.h> >> + >> +static void iommufd_backend_init(Object *obj) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + >> + be->fd = -1; >> + be->users = 0; >> + be->owned = true; >> + qemu_mutex_init(&be->lock); >> +} >> + >> +static void iommufd_backend_finalize(Object *obj) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + >> + if (be->owned) { >> + close(be->fd); >> + be->fd = -1; >> + } >> +} >> + >> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + int fd = -1; >> + >> + fd = monitor_fd_param(monitor_cur(), str, errp); >> + if (fd == -1) { >> + error_prepend(errp, "Could not parse remote object fd %s:", str); >> + return; >> + } >> + qemu_mutex_lock(&be->lock); >> + be->fd = fd; >> + be->owned = false; >> + qemu_mutex_unlock(&be->lock); >> + trace_iommu_backend_set_fd(be->fd); >> +} >> + >> +static void iommufd_backend_class_init(ObjectClass *oc, void *data) >> +{ >> + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); >> +} >> + >> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) >> +{ >> + int fd, ret = 0; >> + >> + qemu_mutex_lock(&be->lock); >> + if (be->users == UINT32_MAX) { >> + error_setg(errp, "too many connections"); >> + ret = -E2BIG; >> + goto out; >> + } >> + if (be->owned && !be->users) { >> + fd = qemu_open_old("/dev/iommu", O_RDWR); >> + if (fd < 0) { >> + error_setg_errno(errp, errno, "/dev/iommu opening failed"); >> + ret = fd; >> + goto out; >> + } >> + be->fd = fd; >> + } >> + be->users++; >> +out: >> + trace_iommufd_backend_connect(be->fd, be->owned, >> + be->users, ret); >> + qemu_mutex_unlock(&be->lock); >> + return ret; >> +} >> + >> +void iommufd_backend_disconnect(IOMMUFDBackend *be) >> +{ >> + qemu_mutex_lock(&be->lock); >> + if (!be->users) { >> + goto out; >> + } >> + be->users--; >> + if (!be->users && be->owned) { >> + close(be->fd); >> + be->fd = -1; >> + } >> +out: >> + trace_iommufd_backend_disconnect(be->fd, be->users); >> + qemu_mutex_unlock(&be->lock); >> +} >> + >> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >> + Error **errp) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_alloc alloc_data = { >> + .size = sizeof(alloc_data), >> + .flags = 0, >> + }; >> + >> + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); >> + if (ret) { >> + error_setg_errno(errp, errno, "Failed to allocate ioas"); >> + return ret; >> + } >> + >> + *ioas_id = alloc_data.out_ioas_id; >> + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); >> + >> + return ret; >> +} >> + >> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_destroy des = { >> + .size = sizeof(des), >> + .id = id, >> + }; >> + >> + ret = ioctl(fd, IOMMU_DESTROY, &des); >> + trace_iommufd_backend_free_id(fd, id, ret); >> + if (ret) { >> + error_report("Failed to free id: %u %m", id); >> + } >> +} >> + >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_map map = { >> + .size = sizeof(map), >> + .flags = IOMMU_IOAS_MAP_READABLE | >> + IOMMU_IOAS_MAP_FIXED_IOVA, >> + .ioas_id = ioas_id, >> + .__reserved = 0, >> + .user_va = (uintptr_t)vaddr, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + if (!readonly) { >> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >> + } >> + >> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >> + vaddr, readonly, ret); >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_MAP failed: %m"); >> + } >> + return ret; >> +} >> + >> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> + hwaddr iova, ram_addr_t size) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_unmap unmap = { >> + .size = sizeof(unmap), >> + .ioas_id = ioas_id, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); >> + /* >> + * IOMMUFD takes mapping as some kind of object, unmapping >> + * nonexistent mapping is treated as deleting a nonexistent >> + * object and return ENOENT. This is different from legacy >> + * backend which allows it. vIOMMU may trigger a lot of >> + * redundant unmapping, to avoid flush the log, treat them >> + * as succeess for IOMMUFD just like legacy backend. >> + */ >> + if (ret && errno == ENOENT) { >> + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, >ret); >> + ret = 0; >> + } else { >> + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); >> + } >> + >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_UNMAP failed: %m"); >> + } >> + return ret; >> +} >> + >> +static const TypeInfo iommufd_backend_info = { >> + .name = TYPE_IOMMUFD_BACKEND, >> + .parent = TYPE_OBJECT, >> + .instance_size = sizeof(IOMMUFDBackend), >> + .instance_init = iommufd_backend_init, >> + .instance_finalize = iommufd_backend_finalize, >> + .class_size = sizeof(IOMMUFDBackendClass), >> + .class_init = iommufd_backend_class_init, >> + .interfaces = (InterfaceInfo[]) { >> + { TYPE_USER_CREATABLE }, >> + { } >> + } >> +}; >> + >> +static void register_types(void) >> +{ >> + type_register_static(&iommufd_backend_info); >> +} >> + >> +type_init(register_types); >> diff --git a/backends/Kconfig b/backends/Kconfig >> index f35abc1609..2cb23f62fa 100644 >> --- a/backends/Kconfig >> +++ b/backends/Kconfig >> @@ -1 +1,5 @@ >> source tpm/Kconfig >> + >> +config IOMMUFD >> + bool >> + depends on VFIO >> diff --git a/backends/meson.build b/backends/meson.build >> index 914c7c4afb..9a5cea480d 100644 >> --- a/backends/meson.build >> +++ b/backends/meson.build >> @@ -20,6 +20,7 @@ if have_vhost_user >> system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) >> endif >> system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev- >vhost.c')) >> +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) >> if have_vhost_user_crypto >> system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev- >vhost-user.c')) >> endif >> diff --git a/backends/trace-events b/backends/trace-events >> index 652eb76a57..d45c6e31a6 100644 >> --- a/backends/trace-events >> +++ b/backends/trace-events >> @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void) >> dbus_vmstate_post_load(int version_id) "version_id: %d" >> dbus_vmstate_loading(const char *id) "id: %s" >> dbus_vmstate_saving(const char *id) "id: %s" >> + >> +# iommufd.c >> +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d >owned=%d users=%d (%d)" >> +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" >> +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" >> +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, >uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d >iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" >> +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, >uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d >ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" >> +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, >uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" >size=0x%"PRIx64" (%d)" >> +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " >iommufd=%d ioas=%d (%d)" >> +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d >id=%d (%d)" >> diff --git a/qemu-options.hx b/qemu-options.hx >> index 42fd09e4de..70507c0ee6 100644 >> --- a/qemu-options.hx >> +++ b/qemu-options.hx >> @@ -5224,6 +5224,18 @@ SRST >> >> The ``share`` boolean option is on by default with memfd. >> >> + ``-object iommufd,id=id[,fd=fd]`` >> + Creates an iommufd backend which allows control of DMA mapping >> + through the /dev/iommu device. >> + >> + The ``id`` parameter is a unique ID which frontends (such as >> + vfio-pci of vdpa) will use to connect with the iommufd backend. >> + >> + The ``fd`` parameter is an optional pre-opened file descriptor >> + resulting from /dev/iommu opening. Usually the iommufd is shared >> + across all subsystems, bringing the benefit of centralized >> + reference counting. >> + >> ``-object rng-builtin,id=id`` >> Creates a random number generator backend which obtains entropy >> from QEMU builtin functions. The ``id`` parameter is a unique ID
On 11/15/23 05:06, Duan, Zhenzhong wrote: > > >> -----Original Message----- >> From: Cédric Le Goater <clg@redhat.com> >> Sent: Tuesday, November 14, 2023 9:29 PM >> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object >> >> On 11/14/23 11:09, Zhenzhong Duan wrote: >>> From: Eric Auger <eric.auger@redhat.com> >>> >>> Introduce an iommufd object which allows the interaction >>> with the host /dev/iommu device. >>> >>> The /dev/iommu can have been already pre-opened outside of qemu, >>> in which case the fd can be passed directly along with the >>> iommufd object: >>> >>> This allows the iommufd object to be shared accross several >>> subsystems (VFIO, VDPA, ...). For example, libvirt would open >>> the /dev/iommu once. >>> >>> If no fd is passed along with the iommufd object, the /dev/iommu >>> is opened by the qemu code. >>> >>> Suggested-by: Alex Williamson <alex.williamson@redhat.com> >>> Signed-off-by: Eric Auger <eric.auger@redhat.com> >>> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> >> I simplified the object declaration in include/sysemu/iommufd.h and >> formatted /dev/iommu in qemu-options.hx. No need to resend. > > Good catch, thanks! Maybe further simplified with below? This is minor. > > OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) Indeed. Done. Thanks, C.
© 2016 - 2024 Red Hat, Inc.