[PATCH 2/4] qemu: open iommufd FDs from libvirt backend

Nathan Chen via Devel posted 4 patches 2 weeks, 3 days ago
[PATCH 2/4] qemu: open iommufd FDs from libvirt backend
Posted by Nathan Chen via Devel 2 weeks, 3 days ago
Open iommufd FDs from libvirt backend without
exposing these FDs to XML users, i.e. one per
domain for /dev/iommu and one per iommufd
hostdev for /dev/vfio/devices/vfioX, and pass
the FD to qemu command line.

Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
 src/qemu/qemu_command.c |  43 +++++++-
 src/qemu/qemu_command.h |   3 +-
 src/qemu/qemu_domain.c  |   8 ++
 src/qemu/qemu_domain.h  |   7 ++
 src/qemu/qemu_hotplug.c |   2 +-
 src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 289 insertions(+), 6 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 8fd7527645..740a6970f2 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
 
 virJSONValue *
 qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev)
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm)
 {
     g_autoptr(virJSONValue) props = NULL;
     virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
     const char *iommufdId = NULL;
     /* 'ramfb' property must be omitted unless it's to be enabled */
     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+    bool useIommufd = false;
+    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+        pcisrc->driver.iommufd) {
+        useIommufd = true;
+    }
 
     /* caller has to assign proper passthrough driver name */
     switch (pcisrc->driver.name) {
@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                               NULL) < 0)
         return NULL;
 
+    if (useIommufd && priv) {
+        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                                      pcisrc->addr.domain, pcisrc->addr.bus,
+                                                      pcisrc->addr.slot, pcisrc->addr.function);
+
+        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+        if (virJSONValueObjectAdd(&props,
+                                  "S:fd", g_strdup_printf("%d", vfiofd),
+                                  NULL) < 0)
+            return NULL;
+    }
+
     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
         return NULL;
 
@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
 static int
 qemuBuildHostdevCommandLine(virCommand *cmd,
                             const virDomainDef *def,
-                            virQEMUCaps *qemuCaps)
+                            virQEMUCaps *qemuCaps,
+                            virDomainObj *vm)
 {
     size_t i;
     g_autoptr(virJSONValue) props = NULL;
     int iommufd = 0;
     const char * iommufdId = "iommufd0";
+    qemuDomainObjPrivate *priv = vm->privateData;
 
     for (i = 0; i < def->nhostdevs; i++) {
         virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
 
             if (subsys->u.pci.driver.iommufd && iommufd == 0) {
                 iommufd = 1;
+                virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
                 if (qemuMonitorCreateObjectProps(&props, "iommufd",
                                                  iommufdId,
+                                                 "S:fd", g_strdup_printf("%d", priv->iommufd),
                                                  NULL) < 0)
                     return -1;
 
@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
                 return -1;
 
-            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+            if (subsys->u.pci.driver.iommufd) {
+                virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci;
+                g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                                              pcisrc->addr.domain, pcisrc->addr.bus,
+                                                              pcisrc->addr.slot, pcisrc->addr.function);
+
+                int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+
+                virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+            }
+
+            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
                 return -1;
 
             if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0)
@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
     if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
         return NULL;
 
-    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
         return NULL;
 
     if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
 /* Current, best practice */
 virJSONValue *
 qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev);
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm);
 
 virJSONValue *
 qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a42721efad..86640aa3e3 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
 
     virChrdevFree(priv->devs);
 
+    if (priv->iommufd >= 0) {
+        virEventRemoveHandle(priv->iommufd);
+        priv->iommufd = -1;
+    }
+
     if (priv->pidMonitored >= 0) {
         virEventRemoveHandle(priv->pidMonitored);
         priv->pidMonitored = -1;
@@ -1974,6 +1979,7 @@ qemuDomainObjPrivateFree(void *data)
 
     g_clear_pointer(&priv->blockjobs, g_hash_table_unref);
     g_clear_pointer(&priv->fds, g_hash_table_unref);
+    g_clear_pointer(&priv->vfioDeviceFds, g_hash_table_unref);
 
     /* This should never be non-NULL if we get here, but just in case... */
     if (priv->eventThread) {
@@ -2002,7 +2008,9 @@ qemuDomainObjPrivateAlloc(void *opaque)
 
     priv->blockjobs = virHashNew(virObjectUnref);
     priv->fds = virHashNew(g_object_unref);
+    priv->vfioDeviceFds = g_hash_table_new(g_str_hash, g_str_equal);
 
+    priv->iommufd = -1;
     priv->pidMonitored = -1;
 
     /* agent commands block by default, user can choose different behavior */
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 3396f929fd..d6214df783 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -264,6 +264,10 @@ struct _qemuDomainObjPrivate {
     /* named file descriptor groups associated with the VM */
     GHashTable *fds;
 
+    int iommufd;
+
+    GHashTable *vfioDeviceFds;
+
     char *memoryBackingDir;
 };
 
@@ -1174,3 +1178,6 @@ qemuDomainCheckCPU(virArch arch,
 bool
 qemuDomainMachineSupportsFloppy(const char *machine,
                                 virQEMUCaps *qemuCaps);
+
+int qemuProcessOpenVfioFds(virDomainObj *vm);
+void qemuProcessCloseVfioFds(virDomainObj *vm);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index fb426deb1a..661e9008f7 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -1630,7 +1630,7 @@ qemuDomainAttachHostPCIDevice(virQEMUDriver *driver,
         goto error;
     }
 
-    if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev)))
+    if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev, vm)))
         goto error;
 
     qemuDomainObjEnterMonitor(vm);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..cecfed94a7 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
 #include <unistd.h>
 #include <signal.h>
 #include <sys/stat.h>
+#include <dirent.h>
 #if WITH_SYS_SYSCALL_H
 # include <sys/syscall.h>
 #endif
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
         goto cleanup;
 
+    if (qemuProcessOpenVfioFds(vm) < 0)
+        goto cleanup;
+
     if (!(cmd = qemuBuildCommandLine(vm,
                                      incoming ? "defer" : NULL,
                                      vmop,
@@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
     virObjectUnlock(vm);
 }
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object
+ * @iommuFd: returned file descriptor
+ *
+ * Opens /dev/iommu file descriptor for the VM.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+    int fd = -1;
+
+    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
+
+    if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
+        if (errno == ENOENT) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("IOMMU FD support requires /dev/iommu device"));
+        } else {
+            virReportSystemError(errno, "%s",
+                                 _("cannot open /dev/iommu"));
+        }
+        return -1;
+    }
+
+    *iommuFd = fd;
+    VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
+    return 0;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition
+ * @vfioPath: returned VFIO device path
+ *
+ * Constructs the VFIO device path for a PCI hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
+                             char **vfioPath)
+{
+    virPCIDeviceAddress *addr;
+    g_autofree char *sysfsPath = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    int ret = -1;
+
+    if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+        hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("VFIO FD only supported for PCI hostdevs"));
+        return -1;
+    }
+
+    addr = &hostdev->source.subsys.u.pci.addr;
+
+    /* Build sysfs path: /sys/bus/pci/devices/DDDD:BB:DD.F/vfio-dev/ */
+    sysfsPath = g_strdup_printf("/sys/bus/pci/devices/"
+                                "%04x:%02x:%02x.%d/vfio-dev/",
+                                addr->domain, addr->bus,
+                                addr->slot, addr->function);
+
+    if (virDirOpen(&dir, sysfsPath) < 0) {
+        virReportSystemError(errno,
+                             _("cannot open VFIO sysfs directory %1$s"),
+                             sysfsPath);
+        return -1;
+    }
+
+    /* Find the vfio device name in the directory */
+    while (virDirRead(dir, &entry, sysfsPath) > 0) {
+        if (STRPREFIX(entry->d_name, "vfio")) {
+            *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+            ret = 0;
+            break;
+        }
+    }
+
+    if (ret < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("cannot find VFIO device for PCI device %1$04x:%2$02x:%3$02x.%4$d"),
+                       addr->domain, addr->bus, addr->slot, addr->function);
+    }
+
+    virDirClose(dir);
+    return ret;
+}
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition
+ * @vfioFd: returned file descriptor
+ *
+ * Opens the VFIO device file descriptor for a hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+                            int *vfioFd)
+{
+    g_autofree char *vfioPath = NULL;
+    int fd = -1;
+
+    if (qemuProcessGetVfioDevicePath(hostdev, &vfioPath) < 0)
+        return -1;
+
+    VIR_DEBUG("Opening VFIO device %s", vfioPath);
+
+    if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) {
+        if (errno == ENOENT) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"),
+                           vfioPath);
+        } else {
+            virReportSystemError(errno,
+                                 _("cannot open VFIO device %1$s"), vfioPath);
+        }
+        return -1;
+    }
+
+    *vfioFd = fd;
+    VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, vfioPath);
+    return 0;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * Opens all necessary VFIO file descriptors for the domain.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+    bool needsIommuFd = false;
+    size_t i;
+
+    /* Check if we have any hostdevs that need VFIO FDs */
+    for (i = 0; i < vm->def->nhostdevs; i++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+        int vfioFd = -1;
+        g_autofree char *fdname = NULL;
+
+        if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
+            hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+
+            /* Check if this hostdev uses VFIO with IOMMU FD */
+            if (hostdev->source.subsys.u.pci.driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+                hostdev->source.subsys.u.pci.driver.iommufd) {
+
+                needsIommuFd = true;
+
+                /* Open VFIO device FD */
+                if (qemuProcessOpenVfioDeviceFd(hostdev, &vfioFd) < 0)
+                    goto error;
+
+                /* Store the FD */
+                fdname = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                         hostdev->source.subsys.u.pci.addr.domain,
+                                         hostdev->source.subsys.u.pci.addr.bus,
+                                         hostdev->source.subsys.u.pci.addr.slot,
+                                         hostdev->source.subsys.u.pci.addr.function);
+
+                g_hash_table_insert(priv->vfioDeviceFds, g_steal_pointer(&fdname), GINT_TO_POINTER(vfioFd));
+
+                VIR_DEBUG("Stored VFIO FD for device %s", fdname);
+            }
+        }
+    }
+
+    /* Open IOMMU FD if needed */
+    if (needsIommuFd) {
+        int iommuFd = -1;
+
+        if (qemuProcessOpenIommuFd(vm, &iommuFd) < 0)
+            goto error;
+
+        priv->iommufd = iommuFd;
+
+        VIR_DEBUG("Stored IOMMU FD");
+    }
+
+    return 0;
+
+ error:
+    qemuProcessCloseVfioFds(vm);
+    return -1;
+}
+
+/**
+ * qemuProcessCloseVfioFds:
+ * @vm: domain object
+ *
+ * Closes all VFIO file descriptors for the domain.
+ */
+void
+qemuProcessCloseVfioFds(virDomainObj *vm)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+    GHashTableIter iter;
+    gpointer key, value;
+
+    /* Close all VFIO device FDs */
+    if (priv->vfioDeviceFds) {
+        g_hash_table_iter_init(&iter, priv->vfioDeviceFds);
+        while (g_hash_table_iter_next(&iter, &key, &value)) {
+            int fd = GPOINTER_TO_INT(value);
+            VIR_DEBUG("Closing VFIO device FD %d for %s", fd, (char*)key);
+            VIR_FORCE_CLOSE(fd);
+        }
+        g_hash_table_remove_all(priv->vfioDeviceFds);
+    }
+
+    /* Close IOMMU FD */
+    if (priv->iommufd >= 0) {
+        VIR_DEBUG("Closing IOMMU FD %d", priv->iommufd);
+        VIR_FORCE_CLOSE(priv->iommufd);
+    }
+}
-- 
2.43.0
Re: [PATCH 2/4] qemu: open iommufd FDs from libvirt backend
Posted by Ján Tomko via Devel 2 weeks ago
On a Monday in 2025, Nathan Chen via Devel wrote:
>Open iommufd FDs from libvirt backend without
>exposing these FDs to XML users, i.e. one per
>domain for /dev/iommu and one per iommufd
>hostdev for /dev/vfio/devices/vfioX, and pass
>the FD to qemu command line.
>

The part formatting the object and the part formatting the device
should be split.

>Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>---
> src/qemu/qemu_command.c |  43 +++++++-
> src/qemu/qemu_command.h |   3 +-
> src/qemu/qemu_domain.c  |   8 ++
> src/qemu/qemu_domain.h  |   7 ++
> src/qemu/qemu_hotplug.c |   2 +-
> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 289 insertions(+), 6 deletions(-)
>
>diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>index 8fd7527645..740a6970f2 100644
>--- a/src/qemu/qemu_command.c
>+++ b/src/qemu/qemu_command.c
>@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>
> virJSONValue *
> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>-                            virDomainHostdevDef *dev)
>+                            virDomainHostdevDef *dev,
>+                            virDomainObj *vm)

Hmm, perhaps exposing the iommufd object in the XML would save us from
having to pass this.

> {
>     g_autoptr(virJSONValue) props = NULL;
>     virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>     const char *iommufdId = NULL;
>     /* 'ramfb' property must be omitted unless it's to be enabled */
>     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>+    bool useIommufd = false;
>+    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>+
>+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>+        pcisrc->driver.iommufd) {
>+        useIommufd = true;
>+    }
>
>     /* caller has to assign proper passthrough driver name */
>     switch (pcisrc->driver.name) {
>@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>                               NULL) < 0)
>         return NULL;
>
>+    if (useIommufd && priv) {
>+        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
>+                                                      pcisrc->addr.domain, pcisrc->addr.bus,
>+                                                      pcisrc->addr.slot, pcisrc->addr.function);
>+

There's no need to duplicate the list of hostdevs which use iommufd in a
per-domain hash table.

For storing per-device file descriptors, we have per-device private
data.

>+        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>+        if (virJSONValueObjectAdd(&props,
>+                                  "S:fd", g_strdup_printf("%d", vfiofd),
>+                                  NULL) < 0)
>+            return NULL;
>+    }
>+
>     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>         return NULL;
>
>@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
> static int
> qemuBuildHostdevCommandLine(virCommand *cmd,
>                             const virDomainDef *def,
>-                            virQEMUCaps *qemuCaps)
>+                            virQEMUCaps *qemuCaps,
>+                            virDomainObj *vm)
> {
>     size_t i;
>     g_autoptr(virJSONValue) props = NULL;
>     int iommufd = 0;
>     const char * iommufdId = "iommufd0";
>+    qemuDomainObjPrivate *priv = vm->privateData;
>
>     for (i = 0; i < def->nhostdevs; i++) {
>         virDomainHostdevDef *hostdev = def->hostdevs[i];
>@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>
>             if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>                 iommufd = 1;
>+                virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>                 if (qemuMonitorCreateObjectProps(&props, "iommufd",
>                                                  iommufdId,
>+                                                 "S:fd", g_strdup_printf("%d", priv->iommufd),
>                                                  NULL) < 0)
>                     return -1;
>
>@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
>                 return -1;
>
>-            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>+            if (subsys->u.pci.driver.iommufd) {
>+                virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci;
>+                g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
>+                                                              pcisrc->addr.domain, pcisrc->addr.bus,
>+                                                              pcisrc->addr.slot, pcisrc->addr.function);
>+
>+                int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>+
>+                virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);

This would become just:

qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock->privateData;

virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);

>+            }
>+
>+            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
>                 return -1;
>
>             if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0)
>@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
>     if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
>         return NULL;
>
>-    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
>+    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
>         return NULL;
>
>     if (migrateURI)
>diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
>index ad068f1f16..380aac261f 100644
>--- a/src/qemu/qemu_command.h
>+++ b/src/qemu/qemu_command.h
>@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
> /* Current, best practice */
> virJSONValue *
> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>-                            virDomainHostdevDef *dev);
>+                            virDomainHostdevDef *dev,
>+                            virDomainObj *vm);
>
> virJSONValue *
> qemuBuildRNGDevProps(const virDomainDef *def,
>diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>index a42721efad..86640aa3e3 100644
>--- a/src/qemu/qemu_domain.c
>+++ b/src/qemu/qemu_domain.c
>@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
>
>     virChrdevFree(priv->devs);
>
>+    if (priv->iommufd >= 0) {
>+        virEventRemoveHandle(priv->iommufd);

There is no handle to remove (and none is needed). So no need for the
condition either.

>+        priv->iommufd = -1;
>+    }
>+
>     if (priv->pidMonitored >= 0) {
>         virEventRemoveHandle(priv->pidMonitored);
>         priv->pidMonitored = -1;
>diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>index 45fc32a663..cecfed94a7 100644
>--- a/src/qemu/qemu_process.c
>+++ b/src/qemu/qemu_process.c
>@@ -25,6 +25,7 @@
> #include <unistd.h>
> #include <signal.h>
> #include <sys/stat.h>
>+#include <dirent.h>

We should not need this in qemu_process.

> #if WITH_SYS_SYSCALL_H
> # include <sys/syscall.h>
> #endif
>@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
>     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
>         goto cleanup;
>
>+    if (qemuProcessOpenVfioFds(vm) < 0)
>+        goto cleanup;
>+
>     if (!(cmd = qemuBuildCommandLine(vm,
>                                      incoming ? "defer" : NULL,
>                                      vmop,
>@@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
>     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
>     virObjectUnlock(vm);
> }
>+
>+/**
>+ * qemuProcessOpenIommuFd:
>+ * @vm: domain object
>+ * @iommuFd: returned file descriptor
>+ *
>+ * Opens /dev/iommu file descriptor for the VM.
>+ *
>+ * Returns: 0 on success, -1 on failure
>+ */
>+static int
>+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
>+{
>+    int fd = -1;
>+
>+    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
>+
>+    if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
>+        if (errno == ENOENT) {
>+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
>+                           _("IOMMU FD support requires /dev/iommu device"));
>+        } else {
>+            virReportSystemError(errno, "%s",
>+                                 _("cannot open /dev/iommu"));
>+        }
>+        return -1;
>+    }
>+
>+    *iommuFd = fd;
>+    VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
>+    return 0;
>+}
>+
>+/**
>+ * qemuProcessGetVfioDevicePath:
>+ * @hostdev: host device definition
>+ * @vfioPath: returned VFIO device path
>+ *
>+ * Constructs the VFIO device path for a PCI hostdev.
>+ *
>+ * Returns: 0 on success, -1 on failure
>+ */
>+static int
>+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,

No need to pass the whole hostdev here. Then this function can live in
virpci.c

>+                             char **vfioPath)
>+{
>+    virPCIDeviceAddress *addr;
>+    g_autofree char *sysfsPath = NULL;
>+    DIR *dir = NULL;
>+    struct dirent *entry = NULL;
>+    int ret = -1;
>+

Jano
Re: [PATCH 2/4] qemu: open iommufd FDs from libvirt backend
Posted by Nathan Chen via Devel 2 weeks ago

On 11/6/2025 11:19 AM, Ján Tomko wrote:
>> Open iommufd FDs from libvirt backend without
>> exposing these FDs to XML users, i.e. one per
>> domain for /dev/iommu and one per iommufd
>> hostdev for /dev/vfio/devices/vfioX, and pass
>> the FD to qemu command line.
>>
> 
> The part formatting the object and the part formatting the device
> should be split.
> 

Sounds good, I will split it into two commits. >> Signed-off-by: Nathan 
Chen <nathanc@nvidia.com>
>> ---
>> src/qemu/qemu_command.c |  43 +++++++-
>> src/qemu/qemu_command.h |   3 +-
>> src/qemu/qemu_domain.c  |   8 ++
>> src/qemu/qemu_domain.h  |   7 ++
>> src/qemu/qemu_hotplug.c |   2 +-
>> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>> 6 files changed, 289 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>> index 8fd7527645..740a6970f2 100644
>> --- a/src/qemu/qemu_command.c
>> +++ b/src/qemu/qemu_command.c
>> @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> -                            virDomainHostdevDef *dev)
>> +                            virDomainHostdevDef *dev,
>> +                            virDomainObj *vm)
> 
> Hmm, perhaps exposing the iommufd object in the XML would save us from
> having to pass this.
> 

We are passing virDomainObj to this function in order to retrieve the FD 
number, would you mind clarifying how we could avoid passing this by 
exposing the iommufd object in the XML? It is my understanding that 
exposing the iommufd object ID would still mean we need pass the 
virDomainObj.

>> {
>>     g_autoptr(virJSONValue) props = NULL;
>>     virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>> @@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef 
>> *def,
>>     const char *iommufdId = NULL;
>>     /* 'ramfb' property must be omitted unless it's to be enabled */
>>     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>> +    bool useIommufd = false;
>> +    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>> +
>> +    if (pcisrc->driver.name == 
>> VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>> +        pcisrc->driver.iommufd) {
>> +        useIommufd = true;
>> +    }
>>
>>     /* caller has to assign proper passthrough driver name */
>>     switch (pcisrc->driver.name) {
>> @@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef 
>> *def,
>>                               NULL) < 0)
>>         return NULL;
>>
>> +    if (useIommufd && priv) {
>> +        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x: 
>> %02x:%02x.%d",
>> +                                                      pcisrc- 
>> >addr.domain, pcisrc->addr.bus,
>> +                                                      pcisrc- 
>> >addr.slot, pcisrc->addr.function);
>> +
> 
> There's no need to duplicate the list of hostdevs which use iommufd in a
> per-domain hash table.
> 
> For storing per-device file descriptors, we have per-device private
> data.
> 
>> +        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv- 
>> >vfioDeviceFds, vfioFdName));
>> +        if (virJSONValueObjectAdd(&props,
>> +                                  "S:fd", g_strdup_printf("%d", vfiofd),
>> +                                  NULL) < 0)
>> +            return NULL;
>> +    }
>> +
>>     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>>         return NULL;
>>
>> @@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
>> static int
>> qemuBuildHostdevCommandLine(virCommand *cmd,
>>                             const virDomainDef *def,
>> -                            virQEMUCaps *qemuCaps)
>> +                            virQEMUCaps *qemuCaps,
>> +                            virDomainObj *vm)
>> {
>>     size_t i;
>>     g_autoptr(virJSONValue) props = NULL;
>>     int iommufd = 0;
>>     const char * iommufdId = "iommufd0";
>> +    qemuDomainObjPrivate *priv = vm->privateData;
>>
>>     for (i = 0; i < def->nhostdevs; i++) {
>>         virDomainHostdevDef *hostdev = def->hostdevs[i];
>> @@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>
>>             if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>>                 iommufd = 1;
>> +                virCommandPassFD(cmd, priv->iommufd, 
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>                 if (qemuMonitorCreateObjectProps(&props, "iommufd",
>>                                                  iommufdId,
>> +                                                 "S:fd", 
>> g_strdup_printf("%d", priv->iommufd),
>>                                                  NULL) < 0)
>>                     return -1;
>>
>> @@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, 
>> qemuCaps) < 0)
>>                 return -1;
>>
>> -            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>> +            if (subsys->u.pci.driver.iommufd) {
>> +                virDomainHostdevSubsysPCI *pcisrc = &hostdev- 
>> >source.subsys.u.pci;
>> +                g_autofree char *vfioFdName = g_strdup_printf("vfio- 
>> %04x:%02x:%02x.%d",
>> +                                                              pcisrc- 
>> >addr.domain, pcisrc->addr.bus,
>> +                                                              pcisrc- 
>> >addr.slot, pcisrc->addr.function);
>> +
>> +                int vfiofd = 
>> GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>> +
>> +                virCommandPassFD(cmd, vfiofd, 
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
> 
> This would become just:
> 
> qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock- 
>  >privateData;
> 
> virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
> 

I will proceed with implementing a qemuDomainHostdevPrivate struct and 
look into the existing implementation for qemuDomainDiskPrivate for 
reference. I was not able to find a private data attribute in the 
_virDomainHostdevDef struct definition, but I do see "virObject 
*privateData;" under the _virDomainDiskDef struct definition - are you 
aware of any reason behind this, or has it just never been needed for 
the hostdev struct?

>> +            }
>> +
>> +            if (!(devprops = qemuBuildPCIHostdevDevProps(def, 
>> hostdev, vm)))
>>                 return -1;
>>
>>             if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, 
>> qemuCaps) < 0)
>> @@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
>>     if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
>>         return NULL;
>>
>> -    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
>> +    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
>>         return NULL;
>>
>>     if (migrateURI)
>> diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
>> index ad068f1f16..380aac261f 100644
>> --- a/src/qemu/qemu_command.h
>> +++ b/src/qemu/qemu_command.h
>> @@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
>> /* Current, best practice */
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> -                            virDomainHostdevDef *dev);
>> +                            virDomainHostdevDef *dev,
>> +                            virDomainObj *vm);
>>
>> virJSONValue *
>> qemuBuildRNGDevProps(const virDomainDef *def,
>> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>> index a42721efad..86640aa3e3 100644
>> --- a/src/qemu/qemu_domain.c
>> +++ b/src/qemu/qemu_domain.c
>> @@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
>>
>>     virChrdevFree(priv->devs);
>>
>> +    if (priv->iommufd >= 0) {
>> +        virEventRemoveHandle(priv->iommufd);
> 
> There is no handle to remove (and none is needed). So no need for the
> condition either.
> 

Thanks for catching this, agreed that the other logic to close iommufd 
file descriptor is sufficient and this is not needed.

>> +        priv->iommufd = -1;
>> +    }
>> +
>>     if (priv->pidMonitored >= 0) {
>>         virEventRemoveHandle(priv->pidMonitored);
>>         priv->pidMonitored = -1;
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 45fc32a663..cecfed94a7 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -25,6 +25,7 @@
>> #include <unistd.h>
>> #include <signal.h>
>> #include <sys/stat.h>
>> +#include <dirent.h>
> 
> We should not need this in qemu_process.
> 

This must have been left over from a previous implementation attempt. I 
will remove this in the next revision. Thanks

>> #if WITH_SYS_SYSCALL_H
>> # include <sys/syscall.h>
>> #endif
>> @@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
>>     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
>>         goto cleanup;
>>
>> +    if (qemuProcessOpenVfioFds(vm) < 0)
>> +        goto cleanup;
>> +
>>     if (!(cmd = qemuBuildCommandLine(vm,
>>                                      incoming ? "defer" : NULL,
>>                                      vmop,
>> @@ -10267,3 +10271,231 @@ 
>> qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
>>     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, 
>> nbdkit);
>>     virObjectUnlock(vm);
>> }
>> +
>> +/**
>> + * qemuProcessOpenIommuFd:
>> + * @vm: domain object
>> + * @iommuFd: returned file descriptor
>> + *
>> + * Opens /dev/iommu file descriptor for the VM.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
>> +{
>> +    int fd = -1;
>> +
>> +    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
>> +
>> +    if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
>> +        if (errno == ENOENT) {
>> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
>> +                           _("IOMMU FD support requires /dev/iommu 
>> device"));
>> +        } else {
>> +            virReportSystemError(errno, "%s",
>> +                                 _("cannot open /dev/iommu"));
>> +        }
>> +        return -1;
>> +    }
>> +
>> +    *iommuFd = fd;
>> +    VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
>> +    return 0;
>> +}
>> +
>> +/**
>> + * qemuProcessGetVfioDevicePath:
>> + * @hostdev: host device definition
>> + * @vfioPath: returned VFIO device path
>> + *
>> + * Constructs the VFIO device path for a PCI hostdev.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
> 
> No need to pass the whole hostdev here. Then this function can live in
> virpci.c

Sounds good, I will move this to virpci.c and just pass in the device 
address.

-Nathan
Re: [PATCH 2/4] qemu: open iommufd FDs from libvirt backend
Posted by Ján Tomko via Devel 1 week, 6 days ago
On a Thursday in 2025, Nathan Chen wrote:
>
>
>On 11/6/2025 11:19 AM, Ján Tomko wrote:
>>>Open iommufd FDs from libvirt backend without
>>>exposing these FDs to XML users, i.e. one per
>>>domain for /dev/iommu and one per iommufd
>>>hostdev for /dev/vfio/devices/vfioX, and pass
>>>the FD to qemu command line.
>>>
>>
>>The part formatting the object and the part formatting the device
>>should be split.
>>
>
>Sounds good, I will split it into two commits. >> Signed-off-by: 
>Nathan Chen <nathanc@nvidia.com>
>>>---
>>>src/qemu/qemu_command.c |  43 +++++++-
>>>src/qemu/qemu_command.h |   3 +-
>>>src/qemu/qemu_domain.c  |   8 ++
>>>src/qemu/qemu_domain.h  |   7 ++
>>>src/qemu/qemu_hotplug.c |   2 +-
>>>src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>>>6 files changed, 289 insertions(+), 6 deletions(-)
>>>
>>>diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>>>index 8fd7527645..740a6970f2 100644
>>>--- a/src/qemu/qemu_command.c
>>>+++ b/src/qemu/qemu_command.c
>>>@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>>
>>>virJSONValue *
>>>qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>>>-                            virDomainHostdevDef *dev)
>>>+                            virDomainHostdevDef *dev,
>>>+                            virDomainObj *vm)
>>
>>Hmm, perhaps exposing the iommufd object in the XML would save us from
>>having to pass this.
>>
>
>We are passing virDomainObj to this function in order to retrieve the 
>FD number, would you mind clarifying how we could avoid passing this 
>by exposing the iommufd object in the XML? It is my understanding that 
>exposing the iommufd object ID would still mean we need pass the 
>virDomainObj.
>

If it was a separate device, it would have its own data type and own
formatting function unrelated to hostdevs.

>>>{
>>>    g_autoptr(virJSONValue) props = NULL;
>>>    virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>>>@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const 
>>>virDomainDef *def,
>>>    const char *iommufdId = NULL;
>>>    /* 'ramfb' property must be omitted unless it's to be enabled */
>>>    bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>>>+    bool useIommufd = false;
>>>+    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>>>+
>>>+    if (pcisrc->driver.name == 
>>>VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>>>+        pcisrc->driver.iommufd) {
>>>+        useIommufd = true;
>>>+    }
>>>
>>>    /* caller has to assign proper passthrough driver name */
>>>    switch (pcisrc->driver.name) {
>>>@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const 
>>>virDomainDef *def,
>>>                              NULL) < 0)
>>>        return NULL;
>>>
>>>+    if (useIommufd && priv) {
>>>+        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x: 
>>>%02x:%02x.%d",
>>>+                                                      pcisrc- 
>>>>addr.domain, pcisrc->addr.bus,
>>>+                                                      pcisrc- 
>>>>addr.slot, pcisrc->addr.function);
>>>+
>>
>>There's no need to duplicate the list of hostdevs which use iommufd in a
>>per-domain hash table.
>>
>>For storing per-device file descriptors, we have per-device private
>>data.
>>
>>>+        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv- 
>>>>vfioDeviceFds, vfioFdName));
>>>+        if (virJSONValueObjectAdd(&props,
>>>+                                  "S:fd", g_strdup_printf("%d", vfiofd),
>>>+                                  NULL) < 0)
>>>+            return NULL;
>>>+    }
>>>+
>>>    if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>>>        return NULL;
>>>
>>>@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
>>>static int
>>>qemuBuildHostdevCommandLine(virCommand *cmd,
>>>                            const virDomainDef *def,
>>>-                            virQEMUCaps *qemuCaps)
>>>+                            virQEMUCaps *qemuCaps,
>>>+                            virDomainObj *vm)
>>>{
>>>    size_t i;
>>>    g_autoptr(virJSONValue) props = NULL;
>>>    int iommufd = 0;
>>>    const char * iommufdId = "iommufd0";
>>>+    qemuDomainObjPrivate *priv = vm->privateData;
>>>
>>>    for (i = 0; i < def->nhostdevs; i++) {
>>>        virDomainHostdevDef *hostdev = def->hostdevs[i];
>>>@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>>
>>>            if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>>>                iommufd = 1;
>>>+                virCommandPassFD(cmd, priv->iommufd, 
>>>VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>>                if (qemuMonitorCreateObjectProps(&props, "iommufd",
>>>                                                 iommufdId,
>>>+                                                 "S:fd", 
>>>g_strdup_printf("%d", priv->iommufd),
>>>                                                 NULL) < 0)
>>>                    return -1;
>>>
>>>@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>>            if (qemuCommandAddExtDevice(cmd, hostdev->info, def, 
>>>qemuCaps) < 0)
>>>                return -1;
>>>
>>>-            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>>>+            if (subsys->u.pci.driver.iommufd) {
>>>+                virDomainHostdevSubsysPCI *pcisrc = &hostdev- 
>>>>source.subsys.u.pci;
>>>+                g_autofree char *vfioFdName = 
>>>g_strdup_printf("vfio- %04x:%02x:%02x.%d",
>>>+                                                              
>>>pcisrc- >addr.domain, pcisrc->addr.bus,
>>>+                                                              
>>>pcisrc- >addr.slot, pcisrc->addr.function);
>>>+
>>>+                int vfiofd = 
>>>GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, 
>>>vfioFdName));
>>>+
>>>+                virCommandPassFD(cmd, vfiofd, 
>>>VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>
>>This would become just:
>>
>>qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock-  
>>>privateData;
>>
>>virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>
>
>I will proceed with implementing a qemuDomainHostdevPrivate struct and 
>look into the existing implementation for qemuDomainDiskPrivate for 
>reference. I was not able to find a private data attribute in the 
>_virDomainHostdevDef struct definition, but I do see "virObject 
>*privateData;" under the _virDomainDiskDef struct definition - are you 
>aware of any reason behind this, or has it just never been needed for 
>the hostdev struct?
>

It was added for disks at the time when it was needed.

Jano

>>>+            }
>>>+
>>>+            if (!(devprops = qemuBuildPCIHostdevDevProps(def, 
>>>hostdev, vm)))
>>>                return -1;
>>>
Re: [PATCH 2/4] qemu: open iommufd FDs from libvirt backend
Posted by Nathan Chen via Devel 1 week, 6 days ago

On 11/7/2025 4:40 AM, Ján Tomko wrote:
> On a Thursday in 2025, Nathan Chen wrote:
>>
>>
>> On 11/6/2025 11:19 AM, Ján Tomko wrote:
>>>> Open iommufd FDs from libvirt backend without
>>>> exposing these FDs to XML users, i.e. one per
>>>> domain for /dev/iommu and one per iommufd
>>>> hostdev for /dev/vfio/devices/vfioX, and pass
>>>> the FD to qemu command line.
>>>>
>>>
>>> The part formatting the object and the part formatting the device
>>> should be split.
>>>
>>
>> Sounds good, I will split it into two commits. >> Signed-off-by: 
>> Nathan Chen <nathanc@nvidia.com>
>>>> ---
>>>> src/qemu/qemu_command.c |  43 +++++++-
>>>> src/qemu/qemu_command.h |   3 +-
>>>> src/qemu/qemu_domain.c  |   8 ++
>>>> src/qemu/qemu_domain.h  |   7 ++
>>>> src/qemu/qemu_hotplug.c |   2 +-
>>>> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>>>> 6 files changed, 289 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>>>> index 8fd7527645..740a6970f2 100644
>>>> --- a/src/qemu/qemu_command.c
>>>> +++ b/src/qemu/qemu_command.c
>>>> @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>>>
>>>> virJSONValue *
>>>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>>>> -                            virDomainHostdevDef *dev)
>>>> +                            virDomainHostdevDef *dev,
>>>> +                            virDomainObj *vm)
>>>
>>> Hmm, perhaps exposing the iommufd object in the XML would save us from
>>> having to pass this.
>>>
>>
>> We are passing virDomainObj to this function in order to retrieve the 
>> FD number, would you mind clarifying how we could avoid passing this 
>> by exposing the iommufd object in the XML? It is my understanding that 
>> exposing the iommufd object ID would still mean we need pass the 
>> virDomainObj.
>>
> 
> If it was a separate device, it would have its own data type and own
> formatting function unrelated to hostdevs.

That makes sense, like passing a new virDomainIommufdDef struct pointer 
to a new qemuBuildIommufdDevProps() function. Previously we implemented 
a virDomainIommufdDef struct as a member under virDomainIOMMUDef [0] to 
store the iommufd ID and FD number. But we changed the implementation 
adn XML representation to only be a bool member associated with 
hostdevs. What are your thoughts on either of the following paths forward?

1. Re-implementing the virDomainIommufdDef struct as a 
virDomainHostdevDef member, storing the FD numbers as part of a private 
data struct member in virDomainIommufdDef
2. Storing the FD numbers as part of a private data struct member in 
virDomainHostdevDef, and avoiding implementing a separate 
virDomainIommufdDef struct.

Option 2 seems the most straightforward t ome, still allowing us to 
avoid passing the virDomainObj to qemuBuildPCIHostdevDevPRops(), but 
please let me know what you think.

[0] 
https://lists.libvirt.org/archives/list/devel@lists.libvirt.org/thread/EASBQHPCLPK5G3PF3DEU57G6CI4GSC74/#JTIDPC3CIVB3OGEVDMR4H63OPBLKAO3N

Thanks,
Nathan
Re: [PATCH 2/4] qemu: open iommufd FDs from libvirt backend
Posted by Nathan Chen via Devel 2 weeks ago

On 11/6/2025 11:19 AM, Ján Tomko wrote:
>> Open iommufd FDs from libvirt backend without
>> exposing these FDs to XML users, i.e. one per
>> domain for /dev/iommu and one per iommufd
>> hostdev for /dev/vfio/devices/vfioX, and pass
>> the FD to qemu command line.
>>
> 
> The part formatting the object and the part formatting the device
> should be split.
> 

Sounds good, I will split it into two commits.

>> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>> ---
>> src/qemu/qemu_command.c |  43 +++++++-
>> src/qemu/qemu_command.h |   3 +-
>> src/qemu/qemu_domain.c  |   8 ++
>> src/qemu/qemu_domain.h  |   7 ++
>> src/qemu/qemu_hotplug.c |   2 +-
>> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>> 6 files changed, 289 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>> index 8fd7527645..740a6970f2 100644
>> --- a/src/qemu/qemu_command.c
>> +++ b/src/qemu/qemu_command.c
>> @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> -                            virDomainHostdevDef *dev)
>> +                            virDomainHostdevDef *dev,
>> +                            virDomainObj *vm)
> 
> Hmm, perhaps exposing the iommufd object in the XML would save us from
> having to pass this.

We are passing virDomainObj to this function in order to retrieve the 
you referring to exposing the

> 
>> {
>>     g_autoptr(virJSONValue) props = NULL;
>>     virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>> @@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef 
>> *def,
>>     const char *iommufdId = NULL;
>>     /* 'ramfb' property must be omitted unless it's to be enabled */
>>     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>> +    bool useIommufd = false;
>> +    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>> +
>> +    if (pcisrc->driver.name == 
>> VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>> +        pcisrc->driver.iommufd) {
>> +        useIommufd = true;
>> +    }
>>
>>     /* caller has to assign proper passthrough driver name */
>>     switch (pcisrc->driver.name) {
>> @@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef 
>> *def,
>>                               NULL) < 0)
>>         return NULL;
>>
>> +    if (useIommufd && priv) {
>> +        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x: 
>> %02x:%02x.%d",
>> +                                                      pcisrc- 
>> >addr.domain, pcisrc->addr.bus,
>> +                                                      pcisrc- 
>> >addr.slot, pcisrc->addr.function);
>> +
> 
> There's no need to duplicate the list of hostdevs which use iommufd in a
> per-domain hash table.
> 
> For storing per-device file descriptors, we have per-device private
> data.
> 
>> +        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv- 
>> >vfioDeviceFds, vfioFdName));
>> +        if (virJSONValueObjectAdd(&props,
>> +                                  "S:fd", g_strdup_printf("%d", vfiofd),
>> +                                  NULL) < 0)
>> +            return NULL;
>> +    }
>> +
>>     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>>         return NULL;
>>
>> @@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
>> static int
>> qemuBuildHostdevCommandLine(virCommand *cmd,
>>                             const virDomainDef *def,
>> -                            virQEMUCaps *qemuCaps)
>> +                            virQEMUCaps *qemuCaps,
>> +                            virDomainObj *vm)
>> {
>>     size_t i;
>>     g_autoptr(virJSONValue) props = NULL;
>>     int iommufd = 0;
>>     const char * iommufdId = "iommufd0";
>> +    qemuDomainObjPrivate *priv = vm->privateData;
>>
>>     for (i = 0; i < def->nhostdevs; i++) {
>>         virDomainHostdevDef *hostdev = def->hostdevs[i];
>> @@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>
>>             if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>>                 iommufd = 1;
>> +                virCommandPassFD(cmd, priv->iommufd, 
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>                 if (qemuMonitorCreateObjectProps(&props, "iommufd",
>>                                                  iommufdId,
>> +                                                 "S:fd", 
>> g_strdup_printf("%d", priv->iommufd),
>>                                                  NULL) < 0)
>>                     return -1;
>>
>> @@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, 
>> qemuCaps) < 0)
>>                 return -1;
>>
>> -            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>> +            if (subsys->u.pci.driver.iommufd) {
>> +                virDomainHostdevSubsysPCI *pcisrc = &hostdev- 
>> >source.subsys.u.pci;
>> +                g_autofree char *vfioFdName = g_strdup_printf("vfio- 
>> %04x:%02x:%02x.%d",
>> +                                                              pcisrc- 
>> >addr.domain, pcisrc->addr.bus,
>> +                                                              pcisrc- 
>> >addr.slot, pcisrc->addr.function);
>> +
>> +                int vfiofd = 
>> GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>> +
>> +                virCommandPassFD(cmd, vfiofd, 
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
> 
> This would become just:
> 
> qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock- 
>  >privateData;
> 
> virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
> 
>> +            }
>> +
>> +            if (!(devprops = qemuBuildPCIHostdevDevProps(def, 
>> hostdev, vm)))
>>                 return -1;
>>
>>             if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, 
>> qemuCaps) < 0)
>> @@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
>>     if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
>>         return NULL;
>>
>> -    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
>> +    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
>>         return NULL;
>>
>>     if (migrateURI)
>> diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
>> index ad068f1f16..380aac261f 100644
>> --- a/src/qemu/qemu_command.h
>> +++ b/src/qemu/qemu_command.h
>> @@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
>> /* Current, best practice */
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> -                            virDomainHostdevDef *dev);
>> +                            virDomainHostdevDef *dev,
>> +                            virDomainObj *vm);
>>
>> virJSONValue *
>> qemuBuildRNGDevProps(const virDomainDef *def,
>> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>> index a42721efad..86640aa3e3 100644
>> --- a/src/qemu/qemu_domain.c
>> +++ b/src/qemu/qemu_domain.c
>> @@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
>>
>>     virChrdevFree(priv->devs);
>>
>> +    if (priv->iommufd >= 0) {
>> +        virEventRemoveHandle(priv->iommufd);
> 
> There is no handle to remove (and none is needed). So no need for the
> condition either.
> 
>> +        priv->iommufd = -1;
>> +    }
>> +
>>     if (priv->pidMonitored >= 0) {
>>         virEventRemoveHandle(priv->pidMonitored);
>>         priv->pidMonitored = -1;
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 45fc32a663..cecfed94a7 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -25,6 +25,7 @@
>> #include <unistd.h>
>> #include <signal.h>
>> #include <sys/stat.h>
>> +#include <dirent.h>
> 
> We should not need this in qemu_process.
> 
>> #if WITH_SYS_SYSCALL_H
>> # include <sys/syscall.h>
>> #endif
>> @@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
>>     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
>>         goto cleanup;
>>
>> +    if (qemuProcessOpenVfioFds(vm) < 0)
>> +        goto cleanup;
>> +
>>     if (!(cmd = qemuBuildCommandLine(vm,
>>                                      incoming ? "defer" : NULL,
>>                                      vmop,
>> @@ -10267,3 +10271,231 @@ 
>> qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
>>     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, 
>> nbdkit);
>>     virObjectUnlock(vm);
>> }
>> +
>> +/**
>> + * qemuProcessOpenIommuFd:
>> + * @vm: domain object
>> + * @iommuFd: returned file descriptor
>> + *
>> + * Opens /dev/iommu file descriptor for the VM.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
>> +{
>> +    int fd = -1;
>> +
>> +    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
>> +
>> +    if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
>> +        if (errno == ENOENT) {
>> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
>> +                           _("IOMMU FD support requires /dev/iommu 
>> device"));
>> +        } else {
>> +            virReportSystemError(errno, "%s",
>> +                                 _("cannot open /dev/iommu"));
>> +        }
>> +        return -1;
>> +    }
>> +
>> +    *iommuFd = fd;
>> +    VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
>> +    return 0;
>> +}
>> +
>> +/**
>> + * qemuProcessGetVfioDevicePath:
>> + * @hostdev: host device definition
>> + * @vfioPath: returned VFIO device path
>> + *
>> + * Constructs the VFIO device path for a PCI hostdev.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
> 
> No need to pass the whole hostdev here. Then this function can live in
> virpci.c
> 
>> +                             char **vfioPath)
>> +{
>> +    virPCIDeviceAddress *addr;
>> +    g_autofree char *sysfsPath = NULL;
>> +    DIR *dir = NULL;
>> +    struct dirent *entry = NULL;
>> +    int ret = -1;
>> +