Open iommufd FDs from libvirt backend without
exposing these FDs to XML users, i.e. one per
domain for /dev/iommu and one per iommufd
hostdev for /dev/vfio/devices/vfioX, and pass
the FD to qemu command line.
Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
src/qemu/qemu_command.c | 43 +++++++-
src/qemu/qemu_command.h | 3 +-
src/qemu/qemu_domain.c | 8 ++
src/qemu/qemu_domain.h | 7 ++
src/qemu/qemu_hotplug.c | 2 +-
src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 289 insertions(+), 6 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 8fd7527645..740a6970f2 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
- virDomainHostdevDef *dev)
+ virDomainHostdevDef *dev,
+ virDomainObj *vm)
{
g_autoptr(virJSONValue) props = NULL;
virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
const char *iommufdId = NULL;
/* 'ramfb' property must be omitted unless it's to be enabled */
bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+ bool useIommufd = false;
+ qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+ if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+ pcisrc->driver.iommufd) {
+ useIommufd = true;
+ }
/* caller has to assign proper passthrough driver name */
switch (pcisrc->driver.name) {
@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
NULL) < 0)
return NULL;
+ if (useIommufd && priv) {
+ g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+ pcisrc->addr.domain, pcisrc->addr.bus,
+ pcisrc->addr.slot, pcisrc->addr.function);
+
+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+ if (virJSONValueObjectAdd(&props,
+ "S:fd", g_strdup_printf("%d", vfiofd),
+ NULL) < 0)
+ return NULL;
+ }
+
if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
return NULL;
@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
static int
qemuBuildHostdevCommandLine(virCommand *cmd,
const virDomainDef *def,
- virQEMUCaps *qemuCaps)
+ virQEMUCaps *qemuCaps,
+ virDomainObj *vm)
{
size_t i;
g_autoptr(virJSONValue) props = NULL;
int iommufd = 0;
const char * iommufdId = "iommufd0";
+ qemuDomainObjPrivate *priv = vm->privateData;
for (i = 0; i < def->nhostdevs; i++) {
virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (subsys->u.pci.driver.iommufd && iommufd == 0) {
iommufd = 1;
+ virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
if (qemuMonitorCreateObjectProps(&props, "iommufd",
iommufdId,
+ "S:fd", g_strdup_printf("%d", priv->iommufd),
NULL) < 0)
return -1;
@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
return -1;
- if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+ if (subsys->u.pci.driver.iommufd) {
+ virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci;
+ g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+ pcisrc->addr.domain, pcisrc->addr.bus,
+ pcisrc->addr.slot, pcisrc->addr.function);
+
+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+
+ virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+ }
+
+ if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
return -1;
if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0)
@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
return NULL;
- if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+ if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
return NULL;
if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
/* Current, best practice */
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
- virDomainHostdevDef *dev);
+ virDomainHostdevDef *dev,
+ virDomainObj *vm);
virJSONValue *
qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a42721efad..86640aa3e3 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
virChrdevFree(priv->devs);
+ if (priv->iommufd >= 0) {
+ virEventRemoveHandle(priv->iommufd);
+ priv->iommufd = -1;
+ }
+
if (priv->pidMonitored >= 0) {
virEventRemoveHandle(priv->pidMonitored);
priv->pidMonitored = -1;
@@ -1974,6 +1979,7 @@ qemuDomainObjPrivateFree(void *data)
g_clear_pointer(&priv->blockjobs, g_hash_table_unref);
g_clear_pointer(&priv->fds, g_hash_table_unref);
+ g_clear_pointer(&priv->vfioDeviceFds, g_hash_table_unref);
/* This should never be non-NULL if we get here, but just in case... */
if (priv->eventThread) {
@@ -2002,7 +2008,9 @@ qemuDomainObjPrivateAlloc(void *opaque)
priv->blockjobs = virHashNew(virObjectUnref);
priv->fds = virHashNew(g_object_unref);
+ priv->vfioDeviceFds = g_hash_table_new(g_str_hash, g_str_equal);
+ priv->iommufd = -1;
priv->pidMonitored = -1;
/* agent commands block by default, user can choose different behavior */
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 3396f929fd..d6214df783 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -264,6 +264,10 @@ struct _qemuDomainObjPrivate {
/* named file descriptor groups associated with the VM */
GHashTable *fds;
+ int iommufd;
+
+ GHashTable *vfioDeviceFds;
+
char *memoryBackingDir;
};
@@ -1174,3 +1178,6 @@ qemuDomainCheckCPU(virArch arch,
bool
qemuDomainMachineSupportsFloppy(const char *machine,
virQEMUCaps *qemuCaps);
+
+int qemuProcessOpenVfioFds(virDomainObj *vm);
+void qemuProcessCloseVfioFds(virDomainObj *vm);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index fb426deb1a..661e9008f7 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -1630,7 +1630,7 @@ qemuDomainAttachHostPCIDevice(virQEMUDriver *driver,
goto error;
}
- if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev)))
+ if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev, vm)))
goto error;
qemuDomainObjEnterMonitor(vm);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..cecfed94a7 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
+#include <dirent.h>
#if WITH_SYS_SYSCALL_H
# include <sys/syscall.h>
#endif
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
goto cleanup;
+ if (qemuProcessOpenVfioFds(vm) < 0)
+ goto cleanup;
+
if (!(cmd = qemuBuildCommandLine(vm,
incoming ? "defer" : NULL,
vmop,
@@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
virObjectUnlock(vm);
}
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object
+ * @iommuFd: returned file descriptor
+ *
+ * Opens /dev/iommu file descriptor for the VM.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+ int fd = -1;
+
+ VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
+
+ if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
+ if (errno == ENOENT) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("IOMMU FD support requires /dev/iommu device"));
+ } else {
+ virReportSystemError(errno, "%s",
+ _("cannot open /dev/iommu"));
+ }
+ return -1;
+ }
+
+ *iommuFd = fd;
+ VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
+ return 0;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition
+ * @vfioPath: returned VFIO device path
+ *
+ * Constructs the VFIO device path for a PCI hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
+ char **vfioPath)
+{
+ virPCIDeviceAddress *addr;
+ g_autofree char *sysfsPath = NULL;
+ DIR *dir = NULL;
+ struct dirent *entry = NULL;
+ int ret = -1;
+
+ if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+ hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("VFIO FD only supported for PCI hostdevs"));
+ return -1;
+ }
+
+ addr = &hostdev->source.subsys.u.pci.addr;
+
+ /* Build sysfs path: /sys/bus/pci/devices/DDDD:BB:DD.F/vfio-dev/ */
+ sysfsPath = g_strdup_printf("/sys/bus/pci/devices/"
+ "%04x:%02x:%02x.%d/vfio-dev/",
+ addr->domain, addr->bus,
+ addr->slot, addr->function);
+
+ if (virDirOpen(&dir, sysfsPath) < 0) {
+ virReportSystemError(errno,
+ _("cannot open VFIO sysfs directory %1$s"),
+ sysfsPath);
+ return -1;
+ }
+
+ /* Find the vfio device name in the directory */
+ while (virDirRead(dir, &entry, sysfsPath) > 0) {
+ if (STRPREFIX(entry->d_name, "vfio")) {
+ *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR,
+ _("cannot find VFIO device for PCI device %1$04x:%2$02x:%3$02x.%4$d"),
+ addr->domain, addr->bus, addr->slot, addr->function);
+ }
+
+ virDirClose(dir);
+ return ret;
+}
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition
+ * @vfioFd: returned file descriptor
+ *
+ * Opens the VFIO device file descriptor for a hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+ int *vfioFd)
+{
+ g_autofree char *vfioPath = NULL;
+ int fd = -1;
+
+ if (qemuProcessGetVfioDevicePath(hostdev, &vfioPath) < 0)
+ return -1;
+
+ VIR_DEBUG("Opening VFIO device %s", vfioPath);
+
+ if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) {
+ if (errno == ENOENT) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"),
+ vfioPath);
+ } else {
+ virReportSystemError(errno,
+ _("cannot open VFIO device %1$s"), vfioPath);
+ }
+ return -1;
+ }
+
+ *vfioFd = fd;
+ VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, vfioPath);
+ return 0;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * Opens all necessary VFIO file descriptors for the domain.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+ qemuDomainObjPrivate *priv = vm->privateData;
+ bool needsIommuFd = false;
+ size_t i;
+
+ /* Check if we have any hostdevs that need VFIO FDs */
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+ int vfioFd = -1;
+ g_autofree char *fdname = NULL;
+
+ if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
+ hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+
+ /* Check if this hostdev uses VFIO with IOMMU FD */
+ if (hostdev->source.subsys.u.pci.driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+ hostdev->source.subsys.u.pci.driver.iommufd) {
+
+ needsIommuFd = true;
+
+ /* Open VFIO device FD */
+ if (qemuProcessOpenVfioDeviceFd(hostdev, &vfioFd) < 0)
+ goto error;
+
+ /* Store the FD */
+ fdname = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+ hostdev->source.subsys.u.pci.addr.domain,
+ hostdev->source.subsys.u.pci.addr.bus,
+ hostdev->source.subsys.u.pci.addr.slot,
+ hostdev->source.subsys.u.pci.addr.function);
+
+ g_hash_table_insert(priv->vfioDeviceFds, g_steal_pointer(&fdname), GINT_TO_POINTER(vfioFd));
+
+ VIR_DEBUG("Stored VFIO FD for device %s", fdname);
+ }
+ }
+ }
+
+ /* Open IOMMU FD if needed */
+ if (needsIommuFd) {
+ int iommuFd = -1;
+
+ if (qemuProcessOpenIommuFd(vm, &iommuFd) < 0)
+ goto error;
+
+ priv->iommufd = iommuFd;
+
+ VIR_DEBUG("Stored IOMMU FD");
+ }
+
+ return 0;
+
+ error:
+ qemuProcessCloseVfioFds(vm);
+ return -1;
+}
+
+/**
+ * qemuProcessCloseVfioFds:
+ * @vm: domain object
+ *
+ * Closes all VFIO file descriptors for the domain.
+ */
+void
+qemuProcessCloseVfioFds(virDomainObj *vm)
+{
+ qemuDomainObjPrivate *priv = vm->privateData;
+ GHashTableIter iter;
+ gpointer key, value;
+
+ /* Close all VFIO device FDs */
+ if (priv->vfioDeviceFds) {
+ g_hash_table_iter_init(&iter, priv->vfioDeviceFds);
+ while (g_hash_table_iter_next(&iter, &key, &value)) {
+ int fd = GPOINTER_TO_INT(value);
+ VIR_DEBUG("Closing VFIO device FD %d for %s", fd, (char*)key);
+ VIR_FORCE_CLOSE(fd);
+ }
+ g_hash_table_remove_all(priv->vfioDeviceFds);
+ }
+
+ /* Close IOMMU FD */
+ if (priv->iommufd >= 0) {
+ VIR_DEBUG("Closing IOMMU FD %d", priv->iommufd);
+ VIR_FORCE_CLOSE(priv->iommufd);
+ }
+}
--
2.43.0
On a Monday in 2025, Nathan Chen via Devel wrote:
>Open iommufd FDs from libvirt backend without
>exposing these FDs to XML users, i.e. one per
>domain for /dev/iommu and one per iommufd
>hostdev for /dev/vfio/devices/vfioX, and pass
>the FD to qemu command line.
>
The part formatting the object and the part formatting the device
should be split.
>Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>---
> src/qemu/qemu_command.c | 43 +++++++-
> src/qemu/qemu_command.h | 3 +-
> src/qemu/qemu_domain.c | 8 ++
> src/qemu/qemu_domain.h | 7 ++
> src/qemu/qemu_hotplug.c | 2 +-
> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 289 insertions(+), 6 deletions(-)
>
>diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>index 8fd7527645..740a6970f2 100644
>--- a/src/qemu/qemu_command.c
>+++ b/src/qemu/qemu_command.c
>@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>
> virJSONValue *
> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>- virDomainHostdevDef *dev)
>+ virDomainHostdevDef *dev,
>+ virDomainObj *vm)
Hmm, perhaps exposing the iommufd object in the XML would save us from
having to pass this.
> {
> g_autoptr(virJSONValue) props = NULL;
> virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
> const char *iommufdId = NULL;
> /* 'ramfb' property must be omitted unless it's to be enabled */
> bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>+ bool useIommufd = false;
>+ qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>+
>+ if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>+ pcisrc->driver.iommufd) {
>+ useIommufd = true;
>+ }
>
> /* caller has to assign proper passthrough driver name */
> switch (pcisrc->driver.name) {
>@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
> NULL) < 0)
> return NULL;
>
>+ if (useIommufd && priv) {
>+ g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
>+ pcisrc->addr.domain, pcisrc->addr.bus,
>+ pcisrc->addr.slot, pcisrc->addr.function);
>+
There's no need to duplicate the list of hostdevs which use iommufd in a
per-domain hash table.
For storing per-device file descriptors, we have per-device private
data.
>+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>+ if (virJSONValueObjectAdd(&props,
>+ "S:fd", g_strdup_printf("%d", vfiofd),
>+ NULL) < 0)
>+ return NULL;
>+ }
>+
> if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
> return NULL;
>
>@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
> static int
> qemuBuildHostdevCommandLine(virCommand *cmd,
> const virDomainDef *def,
>- virQEMUCaps *qemuCaps)
>+ virQEMUCaps *qemuCaps,
>+ virDomainObj *vm)
> {
> size_t i;
> g_autoptr(virJSONValue) props = NULL;
> int iommufd = 0;
> const char * iommufdId = "iommufd0";
>+ qemuDomainObjPrivate *priv = vm->privateData;
>
> for (i = 0; i < def->nhostdevs; i++) {
> virDomainHostdevDef *hostdev = def->hostdevs[i];
>@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>
> if (subsys->u.pci.driver.iommufd && iommufd == 0) {
> iommufd = 1;
>+ virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
> if (qemuMonitorCreateObjectProps(&props, "iommufd",
> iommufdId,
>+ "S:fd", g_strdup_printf("%d", priv->iommufd),
> NULL) < 0)
> return -1;
>
>@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
> if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
> return -1;
>
>- if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>+ if (subsys->u.pci.driver.iommufd) {
>+ virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci;
>+ g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
>+ pcisrc->addr.domain, pcisrc->addr.bus,
>+ pcisrc->addr.slot, pcisrc->addr.function);
>+
>+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>+
>+ virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
This would become just:
qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock->privateData;
virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>+ }
>+
>+ if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
> return -1;
>
> if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0)
>@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
> if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
> return NULL;
>
>- if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
>+ if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
> return NULL;
>
> if (migrateURI)
>diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
>index ad068f1f16..380aac261f 100644
>--- a/src/qemu/qemu_command.h
>+++ b/src/qemu/qemu_command.h
>@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
> /* Current, best practice */
> virJSONValue *
> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>- virDomainHostdevDef *dev);
>+ virDomainHostdevDef *dev,
>+ virDomainObj *vm);
>
> virJSONValue *
> qemuBuildRNGDevProps(const virDomainDef *def,
>diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>index a42721efad..86640aa3e3 100644
>--- a/src/qemu/qemu_domain.c
>+++ b/src/qemu/qemu_domain.c
>@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
>
> virChrdevFree(priv->devs);
>
>+ if (priv->iommufd >= 0) {
>+ virEventRemoveHandle(priv->iommufd);
There is no handle to remove (and none is needed). So no need for the
condition either.
>+ priv->iommufd = -1;
>+ }
>+
> if (priv->pidMonitored >= 0) {
> virEventRemoveHandle(priv->pidMonitored);
> priv->pidMonitored = -1;
>diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>index 45fc32a663..cecfed94a7 100644
>--- a/src/qemu/qemu_process.c
>+++ b/src/qemu/qemu_process.c
>@@ -25,6 +25,7 @@
> #include <unistd.h>
> #include <signal.h>
> #include <sys/stat.h>
>+#include <dirent.h>
We should not need this in qemu_process.
> #if WITH_SYS_SYSCALL_H
> # include <sys/syscall.h>
> #endif
>@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
> if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
> goto cleanup;
>
>+ if (qemuProcessOpenVfioFds(vm) < 0)
>+ goto cleanup;
>+
> if (!(cmd = qemuBuildCommandLine(vm,
> incoming ? "defer" : NULL,
> vmop,
>@@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
> qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
> virObjectUnlock(vm);
> }
>+
>+/**
>+ * qemuProcessOpenIommuFd:
>+ * @vm: domain object
>+ * @iommuFd: returned file descriptor
>+ *
>+ * Opens /dev/iommu file descriptor for the VM.
>+ *
>+ * Returns: 0 on success, -1 on failure
>+ */
>+static int
>+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
>+{
>+ int fd = -1;
>+
>+ VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
>+
>+ if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
>+ if (errno == ENOENT) {
>+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
>+ _("IOMMU FD support requires /dev/iommu device"));
>+ } else {
>+ virReportSystemError(errno, "%s",
>+ _("cannot open /dev/iommu"));
>+ }
>+ return -1;
>+ }
>+
>+ *iommuFd = fd;
>+ VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
>+ return 0;
>+}
>+
>+/**
>+ * qemuProcessGetVfioDevicePath:
>+ * @hostdev: host device definition
>+ * @vfioPath: returned VFIO device path
>+ *
>+ * Constructs the VFIO device path for a PCI hostdev.
>+ *
>+ * Returns: 0 on success, -1 on failure
>+ */
>+static int
>+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
No need to pass the whole hostdev here. Then this function can live in
virpci.c
>+ char **vfioPath)
>+{
>+ virPCIDeviceAddress *addr;
>+ g_autofree char *sysfsPath = NULL;
>+ DIR *dir = NULL;
>+ struct dirent *entry = NULL;
>+ int ret = -1;
>+
Jano
On 11/6/2025 11:19 AM, Ján Tomko wrote:
>> Open iommufd FDs from libvirt backend without
>> exposing these FDs to XML users, i.e. one per
>> domain for /dev/iommu and one per iommufd
>> hostdev for /dev/vfio/devices/vfioX, and pass
>> the FD to qemu command line.
>>
>
> The part formatting the object and the part formatting the device
> should be split.
>
Sounds good, I will split it into two commits. >> Signed-off-by: Nathan
Chen <nathanc@nvidia.com>
>> ---
>> src/qemu/qemu_command.c | 43 +++++++-
>> src/qemu/qemu_command.h | 3 +-
>> src/qemu/qemu_domain.c | 8 ++
>> src/qemu/qemu_domain.h | 7 ++
>> src/qemu/qemu_hotplug.c | 2 +-
>> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>> 6 files changed, 289 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>> index 8fd7527645..740a6970f2 100644
>> --- a/src/qemu/qemu_command.c
>> +++ b/src/qemu/qemu_command.c
>> @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> - virDomainHostdevDef *dev)
>> + virDomainHostdevDef *dev,
>> + virDomainObj *vm)
>
> Hmm, perhaps exposing the iommufd object in the XML would save us from
> having to pass this.
>
We are passing virDomainObj to this function in order to retrieve the FD
number, would you mind clarifying how we could avoid passing this by
exposing the iommufd object in the XML? It is my understanding that
exposing the iommufd object ID would still mean we need pass the
virDomainObj.
>> {
>> g_autoptr(virJSONValue) props = NULL;
>> virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>> @@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef
>> *def,
>> const char *iommufdId = NULL;
>> /* 'ramfb' property must be omitted unless it's to be enabled */
>> bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>> + bool useIommufd = false;
>> + qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>> +
>> + if (pcisrc->driver.name ==
>> VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>> + pcisrc->driver.iommufd) {
>> + useIommufd = true;
>> + }
>>
>> /* caller has to assign proper passthrough driver name */
>> switch (pcisrc->driver.name) {
>> @@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef
>> *def,
>> NULL) < 0)
>> return NULL;
>>
>> + if (useIommufd && priv) {
>> + g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:
>> %02x:%02x.%d",
>> + pcisrc-
>> >addr.domain, pcisrc->addr.bus,
>> + pcisrc-
>> >addr.slot, pcisrc->addr.function);
>> +
>
> There's no need to duplicate the list of hostdevs which use iommufd in a
> per-domain hash table.
>
> For storing per-device file descriptors, we have per-device private
> data.
>
>> + int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv-
>> >vfioDeviceFds, vfioFdName));
>> + if (virJSONValueObjectAdd(&props,
>> + "S:fd", g_strdup_printf("%d", vfiofd),
>> + NULL) < 0)
>> + return NULL;
>> + }
>> +
>> if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>> return NULL;
>>
>> @@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
>> static int
>> qemuBuildHostdevCommandLine(virCommand *cmd,
>> const virDomainDef *def,
>> - virQEMUCaps *qemuCaps)
>> + virQEMUCaps *qemuCaps,
>> + virDomainObj *vm)
>> {
>> size_t i;
>> g_autoptr(virJSONValue) props = NULL;
>> int iommufd = 0;
>> const char * iommufdId = "iommufd0";
>> + qemuDomainObjPrivate *priv = vm->privateData;
>>
>> for (i = 0; i < def->nhostdevs; i++) {
>> virDomainHostdevDef *hostdev = def->hostdevs[i];
>> @@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>
>> if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>> iommufd = 1;
>> + virCommandPassFD(cmd, priv->iommufd,
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>> if (qemuMonitorCreateObjectProps(&props, "iommufd",
>> iommufdId,
>> + "S:fd",
>> g_strdup_printf("%d", priv->iommufd),
>> NULL) < 0)
>> return -1;
>>
>> @@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>> if (qemuCommandAddExtDevice(cmd, hostdev->info, def,
>> qemuCaps) < 0)
>> return -1;
>>
>> - if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>> + if (subsys->u.pci.driver.iommufd) {
>> + virDomainHostdevSubsysPCI *pcisrc = &hostdev-
>> >source.subsys.u.pci;
>> + g_autofree char *vfioFdName = g_strdup_printf("vfio-
>> %04x:%02x:%02x.%d",
>> + pcisrc-
>> >addr.domain, pcisrc->addr.bus,
>> + pcisrc-
>> >addr.slot, pcisrc->addr.function);
>> +
>> + int vfiofd =
>> GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>> +
>> + virCommandPassFD(cmd, vfiofd,
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>
> This would become just:
>
> qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock-
> >privateData;
>
> virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>
I will proceed with implementing a qemuDomainHostdevPrivate struct and
look into the existing implementation for qemuDomainDiskPrivate for
reference. I was not able to find a private data attribute in the
_virDomainHostdevDef struct definition, but I do see "virObject
*privateData;" under the _virDomainDiskDef struct definition - are you
aware of any reason behind this, or has it just never been needed for
the hostdev struct?
>> + }
>> +
>> + if (!(devprops = qemuBuildPCIHostdevDevProps(def,
>> hostdev, vm)))
>> return -1;
>>
>> if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def,
>> qemuCaps) < 0)
>> @@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
>> if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
>> return NULL;
>>
>> - if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
>> + if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
>> return NULL;
>>
>> if (migrateURI)
>> diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
>> index ad068f1f16..380aac261f 100644
>> --- a/src/qemu/qemu_command.h
>> +++ b/src/qemu/qemu_command.h
>> @@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
>> /* Current, best practice */
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> - virDomainHostdevDef *dev);
>> + virDomainHostdevDef *dev,
>> + virDomainObj *vm);
>>
>> virJSONValue *
>> qemuBuildRNGDevProps(const virDomainDef *def,
>> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>> index a42721efad..86640aa3e3 100644
>> --- a/src/qemu/qemu_domain.c
>> +++ b/src/qemu/qemu_domain.c
>> @@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
>>
>> virChrdevFree(priv->devs);
>>
>> + if (priv->iommufd >= 0) {
>> + virEventRemoveHandle(priv->iommufd);
>
> There is no handle to remove (and none is needed). So no need for the
> condition either.
>
Thanks for catching this, agreed that the other logic to close iommufd
file descriptor is sufficient and this is not needed.
>> + priv->iommufd = -1;
>> + }
>> +
>> if (priv->pidMonitored >= 0) {
>> virEventRemoveHandle(priv->pidMonitored);
>> priv->pidMonitored = -1;
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 45fc32a663..cecfed94a7 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -25,6 +25,7 @@
>> #include <unistd.h>
>> #include <signal.h>
>> #include <sys/stat.h>
>> +#include <dirent.h>
>
> We should not need this in qemu_process.
>
This must have been left over from a previous implementation attempt. I
will remove this in the next revision. Thanks
>> #if WITH_SYS_SYSCALL_H
>> # include <sys/syscall.h>
>> #endif
>> @@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
>> if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
>> goto cleanup;
>>
>> + if (qemuProcessOpenVfioFds(vm) < 0)
>> + goto cleanup;
>> +
>> if (!(cmd = qemuBuildCommandLine(vm,
>> incoming ? "defer" : NULL,
>> vmop,
>> @@ -10267,3 +10271,231 @@
>> qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
>> qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0,
>> nbdkit);
>> virObjectUnlock(vm);
>> }
>> +
>> +/**
>> + * qemuProcessOpenIommuFd:
>> + * @vm: domain object
>> + * @iommuFd: returned file descriptor
>> + *
>> + * Opens /dev/iommu file descriptor for the VM.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
>> +{
>> + int fd = -1;
>> +
>> + VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
>> +
>> + if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
>> + if (errno == ENOENT) {
>> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
>> + _("IOMMU FD support requires /dev/iommu
>> device"));
>> + } else {
>> + virReportSystemError(errno, "%s",
>> + _("cannot open /dev/iommu"));
>> + }
>> + return -1;
>> + }
>> +
>> + *iommuFd = fd;
>> + VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
>> + return 0;
>> +}
>> +
>> +/**
>> + * qemuProcessGetVfioDevicePath:
>> + * @hostdev: host device definition
>> + * @vfioPath: returned VFIO device path
>> + *
>> + * Constructs the VFIO device path for a PCI hostdev.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
>
> No need to pass the whole hostdev here. Then this function can live in
> virpci.c
Sounds good, I will move this to virpci.c and just pass in the device
address.
-Nathan
On a Thursday in 2025, Nathan Chen wrote:
>
>
>On 11/6/2025 11:19 AM, Ján Tomko wrote:
>>>Open iommufd FDs from libvirt backend without
>>>exposing these FDs to XML users, i.e. one per
>>>domain for /dev/iommu and one per iommufd
>>>hostdev for /dev/vfio/devices/vfioX, and pass
>>>the FD to qemu command line.
>>>
>>
>>The part formatting the object and the part formatting the device
>>should be split.
>>
>
>Sounds good, I will split it into two commits. >> Signed-off-by:
>Nathan Chen <nathanc@nvidia.com>
>>>---
>>>src/qemu/qemu_command.c | 43 +++++++-
>>>src/qemu/qemu_command.h | 3 +-
>>>src/qemu/qemu_domain.c | 8 ++
>>>src/qemu/qemu_domain.h | 7 ++
>>>src/qemu/qemu_hotplug.c | 2 +-
>>>src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>>>6 files changed, 289 insertions(+), 6 deletions(-)
>>>
>>>diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>>>index 8fd7527645..740a6970f2 100644
>>>--- a/src/qemu/qemu_command.c
>>>+++ b/src/qemu/qemu_command.c
>>>@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>>
>>>virJSONValue *
>>>qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>>>- virDomainHostdevDef *dev)
>>>+ virDomainHostdevDef *dev,
>>>+ virDomainObj *vm)
>>
>>Hmm, perhaps exposing the iommufd object in the XML would save us from
>>having to pass this.
>>
>
>We are passing virDomainObj to this function in order to retrieve the
>FD number, would you mind clarifying how we could avoid passing this
>by exposing the iommufd object in the XML? It is my understanding that
>exposing the iommufd object ID would still mean we need pass the
>virDomainObj.
>
If it was a separate device, it would have its own data type and own
formatting function unrelated to hostdevs.
>>>{
>>> g_autoptr(virJSONValue) props = NULL;
>>> virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>>>@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const
>>>virDomainDef *def,
>>> const char *iommufdId = NULL;
>>> /* 'ramfb' property must be omitted unless it's to be enabled */
>>> bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>>>+ bool useIommufd = false;
>>>+ qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>>>+
>>>+ if (pcisrc->driver.name ==
>>>VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>>>+ pcisrc->driver.iommufd) {
>>>+ useIommufd = true;
>>>+ }
>>>
>>> /* caller has to assign proper passthrough driver name */
>>> switch (pcisrc->driver.name) {
>>>@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const
>>>virDomainDef *def,
>>> NULL) < 0)
>>> return NULL;
>>>
>>>+ if (useIommufd && priv) {
>>>+ g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:
>>>%02x:%02x.%d",
>>>+ pcisrc-
>>>>addr.domain, pcisrc->addr.bus,
>>>+ pcisrc-
>>>>addr.slot, pcisrc->addr.function);
>>>+
>>
>>There's no need to duplicate the list of hostdevs which use iommufd in a
>>per-domain hash table.
>>
>>For storing per-device file descriptors, we have per-device private
>>data.
>>
>>>+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv-
>>>>vfioDeviceFds, vfioFdName));
>>>+ if (virJSONValueObjectAdd(&props,
>>>+ "S:fd", g_strdup_printf("%d", vfiofd),
>>>+ NULL) < 0)
>>>+ return NULL;
>>>+ }
>>>+
>>> if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>>> return NULL;
>>>
>>>@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
>>>static int
>>>qemuBuildHostdevCommandLine(virCommand *cmd,
>>> const virDomainDef *def,
>>>- virQEMUCaps *qemuCaps)
>>>+ virQEMUCaps *qemuCaps,
>>>+ virDomainObj *vm)
>>>{
>>> size_t i;
>>> g_autoptr(virJSONValue) props = NULL;
>>> int iommufd = 0;
>>> const char * iommufdId = "iommufd0";
>>>+ qemuDomainObjPrivate *priv = vm->privateData;
>>>
>>> for (i = 0; i < def->nhostdevs; i++) {
>>> virDomainHostdevDef *hostdev = def->hostdevs[i];
>>>@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>>
>>> if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>>> iommufd = 1;
>>>+ virCommandPassFD(cmd, priv->iommufd,
>>>VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>> if (qemuMonitorCreateObjectProps(&props, "iommufd",
>>> iommufdId,
>>>+ "S:fd",
>>>g_strdup_printf("%d", priv->iommufd),
>>> NULL) < 0)
>>> return -1;
>>>
>>>@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>> if (qemuCommandAddExtDevice(cmd, hostdev->info, def,
>>>qemuCaps) < 0)
>>> return -1;
>>>
>>>- if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>>>+ if (subsys->u.pci.driver.iommufd) {
>>>+ virDomainHostdevSubsysPCI *pcisrc = &hostdev-
>>>>source.subsys.u.pci;
>>>+ g_autofree char *vfioFdName =
>>>g_strdup_printf("vfio- %04x:%02x:%02x.%d",
>>>+
>>>pcisrc- >addr.domain, pcisrc->addr.bus,
>>>+
>>>pcisrc- >addr.slot, pcisrc->addr.function);
>>>+
>>>+ int vfiofd =
>>>GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds,
>>>vfioFdName));
>>>+
>>>+ virCommandPassFD(cmd, vfiofd,
>>>VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>
>>This would become just:
>>
>>qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock-
>>>privateData;
>>
>>virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>>
>
>I will proceed with implementing a qemuDomainHostdevPrivate struct and
>look into the existing implementation for qemuDomainDiskPrivate for
>reference. I was not able to find a private data attribute in the
>_virDomainHostdevDef struct definition, but I do see "virObject
>*privateData;" under the _virDomainDiskDef struct definition - are you
>aware of any reason behind this, or has it just never been needed for
>the hostdev struct?
>
It was added for disks at the time when it was needed.
Jano
>>>+ }
>>>+
>>>+ if (!(devprops = qemuBuildPCIHostdevDevProps(def,
>>>hostdev, vm)))
>>> return -1;
>>>
On 11/7/2025 4:40 AM, Ján Tomko wrote: > On a Thursday in 2025, Nathan Chen wrote: >> >> >> On 11/6/2025 11:19 AM, Ján Tomko wrote: >>>> Open iommufd FDs from libvirt backend without >>>> exposing these FDs to XML users, i.e. one per >>>> domain for /dev/iommu and one per iommufd >>>> hostdev for /dev/vfio/devices/vfioX, and pass >>>> the FD to qemu command line. >>>> >>> >>> The part formatting the object and the part formatting the device >>> should be split. >>> >> >> Sounds good, I will split it into two commits. >> Signed-off-by: >> Nathan Chen <nathanc@nvidia.com> >>>> --- >>>> src/qemu/qemu_command.c | 43 +++++++- >>>> src/qemu/qemu_command.h | 3 +- >>>> src/qemu/qemu_domain.c | 8 ++ >>>> src/qemu/qemu_domain.h | 7 ++ >>>> src/qemu/qemu_hotplug.c | 2 +- >>>> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++ >>>> 6 files changed, 289 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c >>>> index 8fd7527645..740a6970f2 100644 >>>> --- a/src/qemu/qemu_command.c >>>> +++ b/src/qemu/qemu_command.c >>>> @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd, >>>> >>>> virJSONValue * >>>> qemuBuildPCIHostdevDevProps(const virDomainDef *def, >>>> - virDomainHostdevDef *dev) >>>> + virDomainHostdevDef *dev, >>>> + virDomainObj *vm) >>> >>> Hmm, perhaps exposing the iommufd object in the XML would save us from >>> having to pass this. >>> >> >> We are passing virDomainObj to this function in order to retrieve the >> FD number, would you mind clarifying how we could avoid passing this >> by exposing the iommufd object in the XML? It is my understanding that >> exposing the iommufd object ID would still mean we need pass the >> virDomainObj. >> > > If it was a separate device, it would have its own data type and own > formatting function unrelated to hostdevs. That makes sense, like passing a new virDomainIommufdDef struct pointer to a new qemuBuildIommufdDevProps() function. Previously we implemented a virDomainIommufdDef struct as a member under virDomainIOMMUDef [0] to store the iommufd ID and FD number. But we changed the implementation adn XML representation to only be a bool member associated with hostdevs. What are your thoughts on either of the following paths forward? 1. Re-implementing the virDomainIommufdDef struct as a virDomainHostdevDef member, storing the FD numbers as part of a private data struct member in virDomainIommufdDef 2. Storing the FD numbers as part of a private data struct member in virDomainHostdevDef, and avoiding implementing a separate virDomainIommufdDef struct. Option 2 seems the most straightforward t ome, still allowing us to avoid passing the virDomainObj to qemuBuildPCIHostdevDevPRops(), but please let me know what you think. [0] https://lists.libvirt.org/archives/list/devel@lists.libvirt.org/thread/EASBQHPCLPK5G3PF3DEU57G6CI4GSC74/#JTIDPC3CIVB3OGEVDMR4H63OPBLKAO3N Thanks, Nathan
On 11/6/2025 11:19 AM, Ján Tomko wrote:
>> Open iommufd FDs from libvirt backend without
>> exposing these FDs to XML users, i.e. one per
>> domain for /dev/iommu and one per iommufd
>> hostdev for /dev/vfio/devices/vfioX, and pass
>> the FD to qemu command line.
>>
>
> The part formatting the object and the part formatting the device
> should be split.
>
Sounds good, I will split it into two commits.
>> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>> ---
>> src/qemu/qemu_command.c | 43 +++++++-
>> src/qemu/qemu_command.h | 3 +-
>> src/qemu/qemu_domain.c | 8 ++
>> src/qemu/qemu_domain.h | 7 ++
>> src/qemu/qemu_hotplug.c | 2 +-
>> src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
>> 6 files changed, 289 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>> index 8fd7527645..740a6970f2 100644
>> --- a/src/qemu/qemu_command.c
>> +++ b/src/qemu/qemu_command.c
>> @@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
>>
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> - virDomainHostdevDef *dev)
>> + virDomainHostdevDef *dev,
>> + virDomainObj *vm)
>
> Hmm, perhaps exposing the iommufd object in the XML would save us from
> having to pass this.
We are passing virDomainObj to this function in order to retrieve the
you referring to exposing the
>
>> {
>> g_autoptr(virJSONValue) props = NULL;
>> virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
>> @@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef
>> *def,
>> const char *iommufdId = NULL;
>> /* 'ramfb' property must be omitted unless it's to be enabled */
>> bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
>> + bool useIommufd = false;
>> + qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
>> +
>> + if (pcisrc->driver.name ==
>> VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
>> + pcisrc->driver.iommufd) {
>> + useIommufd = true;
>> + }
>>
>> /* caller has to assign proper passthrough driver name */
>> switch (pcisrc->driver.name) {
>> @@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef
>> *def,
>> NULL) < 0)
>> return NULL;
>>
>> + if (useIommufd && priv) {
>> + g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:
>> %02x:%02x.%d",
>> + pcisrc-
>> >addr.domain, pcisrc->addr.bus,
>> + pcisrc-
>> >addr.slot, pcisrc->addr.function);
>> +
>
> There's no need to duplicate the list of hostdevs which use iommufd in a
> per-domain hash table.
>
> For storing per-device file descriptors, we have per-device private
> data.
>
>> + int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv-
>> >vfioDeviceFds, vfioFdName));
>> + if (virJSONValueObjectAdd(&props,
>> + "S:fd", g_strdup_printf("%d", vfiofd),
>> + NULL) < 0)
>> + return NULL;
>> + }
>> +
>> if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
>> return NULL;
>>
>> @@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
>> static int
>> qemuBuildHostdevCommandLine(virCommand *cmd,
>> const virDomainDef *def,
>> - virQEMUCaps *qemuCaps)
>> + virQEMUCaps *qemuCaps,
>> + virDomainObj *vm)
>> {
>> size_t i;
>> g_autoptr(virJSONValue) props = NULL;
>> int iommufd = 0;
>> const char * iommufdId = "iommufd0";
>> + qemuDomainObjPrivate *priv = vm->privateData;
>>
>> for (i = 0; i < def->nhostdevs; i++) {
>> virDomainHostdevDef *hostdev = def->hostdevs[i];
>> @@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>>
>> if (subsys->u.pci.driver.iommufd && iommufd == 0) {
>> iommufd = 1;
>> + virCommandPassFD(cmd, priv->iommufd,
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>> if (qemuMonitorCreateObjectProps(&props, "iommufd",
>> iommufdId,
>> + "S:fd",
>> g_strdup_printf("%d", priv->iommufd),
>> NULL) < 0)
>> return -1;
>>
>> @@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
>> if (qemuCommandAddExtDevice(cmd, hostdev->info, def,
>> qemuCaps) < 0)
>> return -1;
>>
>> - if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
>> + if (subsys->u.pci.driver.iommufd) {
>> + virDomainHostdevSubsysPCI *pcisrc = &hostdev-
>> >source.subsys.u.pci;
>> + g_autofree char *vfioFdName = g_strdup_printf("vfio-
>> %04x:%02x:%02x.%d",
>> + pcisrc-
>> >addr.domain, pcisrc->addr.bus,
>> + pcisrc-
>> >addr.slot, pcisrc->addr.function);
>> +
>> + int vfiofd =
>> GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
>> +
>> + virCommandPassFD(cmd, vfiofd,
>> VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>
> This would become just:
>
> qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock-
> >privateData;
>
> virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
>
>> + }
>> +
>> + if (!(devprops = qemuBuildPCIHostdevDevProps(def,
>> hostdev, vm)))
>> return -1;
>>
>> if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def,
>> qemuCaps) < 0)
>> @@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
>> if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
>> return NULL;
>>
>> - if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
>> + if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
>> return NULL;
>>
>> if (migrateURI)
>> diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
>> index ad068f1f16..380aac261f 100644
>> --- a/src/qemu/qemu_command.h
>> +++ b/src/qemu/qemu_command.h
>> @@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
>> /* Current, best practice */
>> virJSONValue *
>> qemuBuildPCIHostdevDevProps(const virDomainDef *def,
>> - virDomainHostdevDef *dev);
>> + virDomainHostdevDef *dev,
>> + virDomainObj *vm);
>>
>> virJSONValue *
>> qemuBuildRNGDevProps(const virDomainDef *def,
>> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>> index a42721efad..86640aa3e3 100644
>> --- a/src/qemu/qemu_domain.c
>> +++ b/src/qemu/qemu_domain.c
>> @@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
>>
>> virChrdevFree(priv->devs);
>>
>> + if (priv->iommufd >= 0) {
>> + virEventRemoveHandle(priv->iommufd);
>
> There is no handle to remove (and none is needed). So no need for the
> condition either.
>
>> + priv->iommufd = -1;
>> + }
>> +
>> if (priv->pidMonitored >= 0) {
>> virEventRemoveHandle(priv->pidMonitored);
>> priv->pidMonitored = -1;
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 45fc32a663..cecfed94a7 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -25,6 +25,7 @@
>> #include <unistd.h>
>> #include <signal.h>
>> #include <sys/stat.h>
>> +#include <dirent.h>
>
> We should not need this in qemu_process.
>
>> #if WITH_SYS_SYSCALL_H
>> # include <sys/syscall.h>
>> #endif
>> @@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
>> if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
>> goto cleanup;
>>
>> + if (qemuProcessOpenVfioFds(vm) < 0)
>> + goto cleanup;
>> +
>> if (!(cmd = qemuBuildCommandLine(vm,
>> incoming ? "defer" : NULL,
>> vmop,
>> @@ -10267,3 +10271,231 @@
>> qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
>> qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0,
>> nbdkit);
>> virObjectUnlock(vm);
>> }
>> +
>> +/**
>> + * qemuProcessOpenIommuFd:
>> + * @vm: domain object
>> + * @iommuFd: returned file descriptor
>> + *
>> + * Opens /dev/iommu file descriptor for the VM.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
>> +{
>> + int fd = -1;
>> +
>> + VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
>> +
>> + if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
>> + if (errno == ENOENT) {
>> + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
>> + _("IOMMU FD support requires /dev/iommu
>> device"));
>> + } else {
>> + virReportSystemError(errno, "%s",
>> + _("cannot open /dev/iommu"));
>> + }
>> + return -1;
>> + }
>> +
>> + *iommuFd = fd;
>> + VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
>> + return 0;
>> +}
>> +
>> +/**
>> + * qemuProcessGetVfioDevicePath:
>> + * @hostdev: host device definition
>> + * @vfioPath: returned VFIO device path
>> + *
>> + * Constructs the VFIO device path for a PCI hostdev.
>> + *
>> + * Returns: 0 on success, -1 on failure
>> + */
>> +static int
>> +qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
>
> No need to pass the whole hostdev here. Then this function can live in
> virpci.c
>
>> + char **vfioPath)
>> +{
>> + virPCIDeviceAddress *addr;
>> + g_autofree char *sysfsPath = NULL;
>> + DIR *dir = NULL;
>> + struct dirent *entry = NULL;
>> + int ret = -1;
>> +
© 2016 - 2025 Red Hat, Inc.