Allow access to /dev/iommu and /dev/vfio/devices/vfio* when launching a qemu
VM with iommufd feature enabled.
Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
src/qemu/qemu_cgroup.c | 61 ++++++++++++++++++++++++++++
src/qemu/qemu_cgroup.h | 1 +
src/qemu/qemu_namespace.c | 44 +++++++++++++++++++++
src/security/security_apparmor.c | 15 +++++++
src/security/security_dac.c | 34 ++++++++++++++++
src/security/security_selinux.c | 34 ++++++++++++++++
src/security/virt-aa-helper.c | 11 +++++-
src/util/virpci.c | 68 ++++++++++++++++++++++++++++++++
src/util/virpci.h | 1 +
9 files changed, 268 insertions(+), 1 deletion(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 46a7dc1d8b..e15ffd2007 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -461,6 +461,54 @@ qemuTeardownInputCgroup(virDomainObj *vm,
}
+int
+qemuSetupIommufdCgroup(virDomainObj *vm)
+{
+ qemuDomainObjPrivate *priv = vm->privateData;
+ g_autoptr(DIR) dir = NULL;
+ struct dirent *dent;
+ g_autofree char *path = NULL;
+ int iommufd = 0;
+ size_t i;
+
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
+ iommufd = 1;
+ break;
+ }
+ }
+
+ if (iommufd == 1) {
+ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
+ return 0;
+ if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
+ if (errno == ENOENT)
+ return 0;
+ return -1;
+ }
+ while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
+ if (STRPREFIX(dent->d_name, "vfio")) {
+ path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name);
+ }
+ if (path &&
+ qemuCgroupAllowDevicePath(vm, path,
+ VIR_CGROUP_DEVICE_RW, false) < 0) {
+ return -1;
+ }
+ path = NULL;
+ }
+ if (virFileExists("/dev/iommu"))
+ path = g_strdup("/dev/iommu");
+ if (path &&
+ qemuCgroupAllowDevicePath(vm, path,
+ VIR_CGROUP_DEVICE_RW, false) < 0) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+
/**
* qemuSetupHostdevCgroup:
* vm: domain object
@@ -759,6 +807,7 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(priv->driver);
const char *const *deviceACL = (const char *const *) cfg->cgroupDeviceACL;
int rv = -1;
+ int iommufd = 0;
size_t i;
if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
@@ -836,6 +885,18 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
return -1;
}
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
+ iommufd = 1;
+ break;
+ }
+ }
+
+ if (iommufd == 1) {
+ if (qemuSetupIommufdCgroup(vm) < 0)
+ return -1;
+ }
+
for (i = 0; i < vm->def->nmems; i++) {
if (qemuSetupMemoryDevicesCgroup(vm, vm->def->mems[i]) < 0)
return -1;
diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h
index 3668034cde..bea677ba3c 100644
--- a/src/qemu/qemu_cgroup.h
+++ b/src/qemu/qemu_cgroup.h
@@ -42,6 +42,7 @@ int qemuSetupHostdevCgroup(virDomainObj *vm,
int qemuTeardownHostdevCgroup(virDomainObj *vm,
virDomainHostdevDef *dev)
G_GNUC_WARN_UNUSED_RESULT;
+int qemuSetupIommufdCgroup(virDomainObj *vm);
int qemuSetupMemoryDevicesCgroup(virDomainObj *vm,
virDomainMemoryDef *mem);
int qemuTeardownMemoryDevicesCgroup(virDomainObj *vm,
diff --git a/src/qemu/qemu_namespace.c b/src/qemu/qemu_namespace.c
index 932777505b..80496f2f0f 100644
--- a/src/qemu/qemu_namespace.c
+++ b/src/qemu/qemu_namespace.c
@@ -683,6 +683,47 @@ qemuDomainSetupLaunchSecurity(virDomainObj *vm,
}
+static int
+qemuDomainSetupIommufd(virDomainObj *vm,
+ GSList **paths)
+{
+ g_autoptr(DIR) dir = NULL;
+ struct dirent *dent;
+ g_autofree char *path = NULL;
+ int iommufd = 0;
+ size_t i;
+
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
+ iommufd = 1;
+ break;
+ }
+ }
+
+ /* Check if iommufd is enabled */
+ if (iommufd == 1) {
+ if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
+ if (errno == ENOENT)
+ return 0;
+ return -1;
+ }
+ while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
+ if (STRPREFIX(dent->d_name, "vfio")) {
+ path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name);
+ *paths = g_slist_prepend(*paths, g_steal_pointer(&path));
+ }
+ }
+ path = NULL;
+ if (virFileExists("/dev/iommu"))
+ path = g_strdup("/dev/iommu");
+ if (path)
+ *paths = g_slist_prepend(*paths, g_steal_pointer(&path));
+ }
+
+ return 0;
+}
+
+
static int
qemuNamespaceMknodPaths(virDomainObj *vm,
GSList *paths,
@@ -706,6 +747,9 @@ qemuDomainBuildNamespace(virQEMUDriverConfig *cfg,
if (qemuDomainSetupAllDisks(vm, &paths) < 0)
return -1;
+ if (qemuDomainSetupIommufd(vm, &paths) < 0)
+ return -1;
+
if (qemuDomainSetupAllHostdevs(vm, &paths) < 0)
return -1;
diff --git a/src/security/security_apparmor.c b/src/security/security_apparmor.c
index 68ac39611f..0a878fd205 100644
--- a/src/security/security_apparmor.c
+++ b/src/security/security_apparmor.c
@@ -856,6 +856,21 @@ AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr,
}
ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr);
VIR_FREE(vfioGroupDev);
+
+ if (dev->source.subsys.u.pci.driver.iommufd) {
+ g_autofree char *vfiofdDev = virPCIDeviceGetIOMMUFDDev(pci);
+ const char *iommufdDir = "/dev/iommu";
+ if (vfiofdDev) {
+ int ret2 = AppArmorSetSecurityPCILabel(pci, vfiofdDev, ptr);
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = AppArmorSetSecurityPCILabel(pci, iommufdDir, ptr);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ return -1;
+ }
+ }
} else {
ret = virPCIDeviceFileIterate(pci, AppArmorSetSecurityPCILabel, ptr);
}
diff --git a/src/security/security_dac.c b/src/security/security_dac.c
index 2f788b872a..361106222d 100644
--- a/src/security/security_dac.c
+++ b/src/security/security_dac.c
@@ -1290,6 +1290,24 @@ virSecurityDACSetHostdevLabel(virSecurityManager *mgr,
ret = virSecurityDACSetHostdevLabelHelper(vfioGroupDev,
false,
&cbdata);
+ if (dev->source.subsys.u.pci.driver.iommufd) {
+ g_autofree char *vfiofdDev = virPCIDeviceGetIOMMUFDDev(pci);
+ const char *iommufdDir = "/dev/iommu";
+ if (vfiofdDev) {
+ int ret2 = virSecurityDACSetHostdevLabelHelper(vfiofdDev,
+ false,
+ &cbdata);
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = virSecurityDACSetHostdevLabelHelper(iommufdDir,
+ false,
+ &cbdata);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ return -1;
+ }
+ }
} else {
ret = virPCIDeviceFileIterate(pci,
virSecurityDACSetPCILabel,
@@ -1450,6 +1468,22 @@ virSecurityDACRestoreHostdevLabel(virSecurityManager *mgr,
ret = virSecurityDACRestoreFileLabelInternal(mgr, NULL,
vfioGroupDev, false);
+ if (dev->source.subsys.u.pci.driver.iommufd) {
+ g_autofree char *vfiofdDev = virPCIDeviceGetIOMMUFDDev(pci);
+ const char *iommufdDir = "/dev/iommu";
+ if (vfiofdDev) {
+ int ret2 = virSecurityDACRestoreFileLabelInternal(mgr, NULL,
+ vfiofdDev, false);
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = virSecurityDACRestoreFileLabelInternal(mgr, NULL,
+ iommufdDir, false);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ return -1;
+ }
+ }
} else {
ret = virPCIDeviceFileIterate(pci, virSecurityDACRestorePCILabel, mgr);
}
diff --git a/src/security/security_selinux.c b/src/security/security_selinux.c
index fa5d1568eb..fbe8f63ab4 100644
--- a/src/security/security_selinux.c
+++ b/src/security/security_selinux.c
@@ -2248,6 +2248,25 @@ virSecuritySELinuxSetHostdevSubsysLabel(virSecurityManager *mgr,
ret = virSecuritySELinuxSetHostdevLabelHelper(vfioGroupDev,
false,
&data);
+ if (dev->source.subsys.u.pci.driver.iommufd) {
+ g_autofree char *vfiofdDev = virPCIDeviceGetIOMMUFDDev(pci);
+ const char *iommufdDir = "/dev/iommu";
+ if (vfiofdDev) {
+ int ret2 = virSecuritySELinuxSetHostdevLabelHelper(vfiofdDev,
+ false,
+ &data);
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = virSecuritySELinuxSetHostdevLabelHelper(iommufdDir,
+ false,
+ &data);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ return -1;
+ }
+ }
+
} else {
ret = virPCIDeviceFileIterate(pci, virSecuritySELinuxSetPCILabel, &data);
}
@@ -2481,6 +2500,21 @@ virSecuritySELinuxRestoreHostdevSubsysLabel(virSecurityManager *mgr,
return -1;
ret = virSecuritySELinuxRestoreFileLabel(mgr, vfioGroupDev, false);
+
+ if (dev->source.subsys.u.pci.driver.iommufd) {
+ g_autofree char *vfiofdDev = virPCIDeviceGetIOMMUFDDev(pci);
+ const char *iommufdDir = "/dev/iommu";
+ if (vfiofdDev) {
+ int ret2 = virSecuritySELinuxRestoreFileLabel(mgr, vfiofdDev, false);
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = virSecuritySELinuxRestoreFileLabel(mgr, iommufdDir, false);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ return -1;
+ }
+ }
} else {
ret = virPCIDeviceFileIterate(pci, virSecuritySELinuxRestorePCILabel, mgr);
}
diff --git a/src/security/virt-aa-helper.c b/src/security/virt-aa-helper.c
index de0a826063..c9e6d9c6a9 100644
--- a/src/security/virt-aa-helper.c
+++ b/src/security/virt-aa-helper.c
@@ -878,7 +878,7 @@ get_files(vahControl * ctl)
size_t i;
g_autofree char *uuid = NULL;
char uuidstr[VIR_UUID_STRING_BUFLEN];
- bool needsVfio = false, needsvhost = false, needsgl = false;
+ bool needsVfio = false, needsvhost = false, needsgl = false, needsIommufd = false;
/* verify uuid is same as what we were given on the command line */
virUUIDFormat(ctl->def->uuid, uuidstr);
@@ -1119,6 +1119,9 @@ get_files(vahControl * ctl)
needsVfio = true;
}
+ if (dev->source.subsys.u.pci.driver.iommufd)
+ needsIommufd = true;
+
if (pci == NULL)
continue;
@@ -1348,6 +1351,12 @@ get_files(vahControl * ctl)
virBufferAddLit(&buf, " \"/dev/vfio/vfio\" rw,\n");
virBufferAddLit(&buf, " \"/dev/vfio/[0-9]*\" rw,\n");
}
+
+ if (needsIommufd) {
+ virBufferAddLit(&buf, " \"/dev/iommu\" rwm,\n");
+ virBufferAddLit(&buf, " \"/dev/vfio/devices/vfio[0-9]*\" rwm,\n");
+ }
+
if (needsgl) {
/* if using gl all sorts of further dri related paths will be needed */
virBufferAddLit(&buf, " # DRI/Mesa/(e)GL config and driver paths\n");
diff --git a/src/util/virpci.c b/src/util/virpci.c
index 90617e69c6..6e6e5e47c0 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -2478,6 +2478,74 @@ virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev)
return g_strdup_printf("/dev/vfio/%s", groupFile);
}
+/* virPCIDeviceGetIOMMUFDDev - return the name of the device used
+ * to control this PCI device's group (e.g. "/dev/vfio/devices/vfio15")
+ */
+char *
+virPCIDeviceGetIOMMUFDDev(virPCIDevice *dev)
+{
+ g_autofree char *path = NULL;
+ const char *pci_addr = NULL;
+ g_autoptr(DIR) dir = NULL;
+ struct dirent *entry;
+ char *vfiodev = NULL;
+
+ /* Get PCI device address */
+ pci_addr = virPCIDeviceGetName(dev);
+ if (!pci_addr)
+ return NULL;
+
+ /* First try: look in PCI device's vfio-dev subdirectory */
+ path = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev", pci_addr);
+
+ if (virDirOpen(&dir, path) == 1) {
+ while (virDirRead(dir, &entry, path) > 0) {
+ if (!g_str_has_prefix(entry->d_name, "vfio"))
+ continue;
+
+ vfiodev = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+ break;
+ }
+ /* g_autoptr will automatically close dir when it goes out of scope */
+ dir = NULL;
+ }
+
+ /* Second try: scan /sys/class/vfio-dev for matching device */
+ if (!vfiodev) {
+ g_free(path);
+ path = g_strdup("/sys/class/vfio-dev");
+
+ if (virDirOpen(&dir, path) == 1) {
+ while (virDirRead(dir, &entry, path) > 0) {
+ g_autofree char *dev_link = NULL;
+ g_autofree char *target = NULL;
+
+ if (!g_str_has_prefix(entry->d_name, "vfio"))
+ continue;
+
+ dev_link = g_strdup_printf("/sys/class/vfio-dev/%s/device", entry->d_name);
+
+ if (virFileResolveLink(dev_link, &target) < 0)
+ continue;
+
+ if (strstr(target, pci_addr)) {
+ vfiodev = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+ break;
+ }
+ }
+ /* g_autoptr will automatically close dir */
+ }
+ }
+
+ /* Verify the device path exists and is accessible */
+ if (vfiodev && !virFileExists(vfiodev)) {
+ VIR_FREE(vfiodev);
+ return NULL;
+ }
+
+ return vfiodev;
+}
+
static int
virPCIDeviceDownstreamLacksACS(virPCIDevice *dev)
{
diff --git a/src/util/virpci.h b/src/util/virpci.h
index fc538566e1..996ffab2f9 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -203,6 +203,7 @@ int virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddress *addr);
char *virPCIDeviceAddressGetIOMMUGroupDev(const virPCIDeviceAddress *devAddr);
bool virPCIDeviceExists(const virPCIDeviceAddress *addr);
char *virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev);
+char *virPCIDeviceGetIOMMUFDDev(virPCIDevice *dev);
int virPCIDeviceIsAssignable(virPCIDevice *dev,
int strict_acs_check);
--
2.43.0
On a Monday in 2025, Nathan Chen via Devel wrote:
>Allow access to /dev/iommu and /dev/vfio/devices/vfio* when launching a qemu
>VM with iommufd feature enabled.
>
>Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>---
> src/qemu/qemu_cgroup.c | 61 ++++++++++++++++++++++++++++
> src/qemu/qemu_cgroup.h | 1 +
> src/qemu/qemu_namespace.c | 44 +++++++++++++++++++++
> src/security/security_apparmor.c | 15 +++++++
> src/security/security_dac.c | 34 ++++++++++++++++
> src/security/security_selinux.c | 34 ++++++++++++++++
> src/security/virt-aa-helper.c | 11 +++++-
> src/util/virpci.c | 68 ++++++++++++++++++++++++++++++++
> src/util/virpci.h | 1 +
> 9 files changed, 268 insertions(+), 1 deletion(-)
>
>diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
>index 46a7dc1d8b..e15ffd2007 100644
>--- a/src/qemu/qemu_cgroup.c
>+++ b/src/qemu/qemu_cgroup.c
>@@ -461,6 +461,54 @@ qemuTeardownInputCgroup(virDomainObj *vm,
> }
>
>
>+int
>+qemuSetupIommufdCgroup(virDomainObj *vm)
>+{
>+ qemuDomainObjPrivate *priv = vm->privateData;
>+ g_autoptr(DIR) dir = NULL;
>+ struct dirent *dent;
>+ g_autofree char *path = NULL;
>+ int iommufd = 0;
>+ size_t i;
>+
>+ for (i = 0; i < vm->def->nhostdevs; i++) {
>+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
>+ iommufd = 1;
>+ break;
>+ }
>+ }
>+
>+ if (iommufd == 1) {
>+ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
>+ return 0;
>+ if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
>+ if (errno == ENOENT)
>+ return 0;
>+ return -1;
>+ }
>+ while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
>+ if (STRPREFIX(dent->d_name, "vfio")) {
>+ path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name);
>+ }
>+ if (path &&
>+ qemuCgroupAllowDevicePath(vm, path,
>+ VIR_CGROUP_DEVICE_RW, false) < 0) {
>+ return -1;
This allows all the devices instead of just the ones the VM needs.
Also, this is still a hostdev, so it should be done inside
qemuSetupHostdevCgroup.
Do hostdevs using iommufd also need access to
a) /dev/vfio/vfio and
b) /dev/vfio/<iommugroup>
which were already allowed in qemuSetupHostdevCgroup?
>+ }
>+ path = NULL;
>+ }
>+ if (virFileExists("/dev/iommu"))
>+ path = g_strdup("/dev/iommu");
No need to check for the existence of the device. If it does not exist,
the VM won't start anyway.
Also, is it necessary to allow these? libvirt already opened the files
and passed file descriptors.
>+ if (path &&
>+ qemuCgroupAllowDevicePath(vm, path,
>+ VIR_CGROUP_DEVICE_RW, false) < 0) {
>+ return -1;
>+ }
>+ }
>+ return 0;
>+}
>+
>+
> /**
> * qemuSetupHostdevCgroup:
> * vm: domain object
>@@ -759,6 +807,7 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
> g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(priv->driver);
> const char *const *deviceACL = (const char *const *) cfg->cgroupDeviceACL;
> int rv = -1;
>+ int iommufd = 0;
> size_t i;
>
> if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES))
>@@ -836,6 +885,18 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
> return -1;
> }
>
>+ for (i = 0; i < vm->def->nhostdevs; i++) {
>+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
>+ iommufd = 1;
>+ break;
>+ }
>+ }
>+
No need to check this upfront. If /dev/iommu access is necessary, the
per-hostdev function qemuSetupHostdevCgroup can add it to the list
multiple times, like it already does for /dev/vfio/vfio
>+ if (iommufd == 1) {
>+ if (qemuSetupIommufdCgroup(vm) < 0)
>+ return -1;
>+ }
>+
> for (i = 0; i < vm->def->nmems; i++) {
> if (qemuSetupMemoryDevicesCgroup(vm, vm->def->mems[i]) < 0)
> return -1;
>diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h
>index 3668034cde..bea677ba3c 100644
>--- a/src/qemu/qemu_cgroup.h
>+++ b/src/qemu/qemu_cgroup.h
>@@ -42,6 +42,7 @@ int qemuSetupHostdevCgroup(virDomainObj *vm,
> int qemuTeardownHostdevCgroup(virDomainObj *vm,
> virDomainHostdevDef *dev)
> G_GNUC_WARN_UNUSED_RESULT;
>+int qemuSetupIommufdCgroup(virDomainObj *vm);
> int qemuSetupMemoryDevicesCgroup(virDomainObj *vm,
> virDomainMemoryDef *mem);
> int qemuTeardownMemoryDevicesCgroup(virDomainObj *vm,
>diff --git a/src/qemu/qemu_namespace.c b/src/qemu/qemu_namespace.c
>index 932777505b..80496f2f0f 100644
>--- a/src/qemu/qemu_namespace.c
>+++ b/src/qemu/qemu_namespace.c
>@@ -683,6 +683,47 @@ qemuDomainSetupLaunchSecurity(virDomainObj *vm,
> }
>
>
>+static int
>+qemuDomainSetupIommufd(virDomainObj *vm,
>+ GSList **paths)
>+{
>+ g_autoptr(DIR) dir = NULL;
>+ struct dirent *dent;
>+ g_autofree char *path = NULL;
>+ int iommufd = 0;
>+ size_t i;
>+
>+ for (i = 0; i < vm->def->nhostdevs; i++) {
>+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
>+ iommufd = 1;
>+ break;
>+ }
>+ }
>+
>+ /* Check if iommufd is enabled */
>+ if (iommufd == 1) {
>+ if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
>+ if (errno == ENOENT)
>+ return 0;
>+ return -1;
>+ }
>+ while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
>+ if (STRPREFIX(dent->d_name, "vfio")) {
>+ path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name);
>+ *paths = g_slist_prepend(*paths, g_steal_pointer(&path));
>+ }
>+ }
>+ path = NULL;
>+ if (virFileExists("/dev/iommu"))
>+ path = g_strdup("/dev/iommu");
>+ if (path)
>+ *paths = g_slist_prepend(*paths, g_steal_pointer(&path));
Same comments as for cgroups apply here too.
>+ }
>+
>+ return 0;
>+}
>+
>+
> static int
> qemuNamespaceMknodPaths(virDomainObj *vm,
> GSList *paths,
>@@ -706,6 +747,9 @@ qemuDomainBuildNamespace(virQEMUDriverConfig *cfg,
> if (qemuDomainSetupAllDisks(vm, &paths) < 0)
> return -1;
>
>+ if (qemuDomainSetupIommufd(vm, &paths) < 0)
>+ return -1;
>+
> if (qemuDomainSetupAllHostdevs(vm, &paths) < 0)
> return -1;
>
>diff --git a/src/security/security_apparmor.c b/src/security/security_apparmor.c
>index 68ac39611f..0a878fd205 100644
>--- a/src/security/security_apparmor.c
>+++ b/src/security/security_apparmor.c
>@@ -856,6 +856,21 @@ AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr,
> }
> ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr);
> VIR_FREE(vfioGroupDev);
>+
>+ if (dev->source.subsys.u.pci.driver.iommufd) {
>+ g_autofree char *vfiofdDev = virPCIDeviceGetIOMMUFDDev(pci);
>+ const char *iommufdDir = "/dev/iommu";
>+ if (vfiofdDev) {
>+ int ret2 = AppArmorSetSecurityPCILabel(pci, vfiofdDev, ptr);
>+ if (ret2 < 0)
>+ ret = ret2;
>+ ret2 = AppArmorSetSecurityPCILabel(pci, iommufdDir, ptr);
>+ if (ret2 < 0)
>+ ret = ret2;
>+ } else {
>+ return -1;
>+ }
>+ }
> } else {
> ret = virPCIDeviceFileIterate(pci, AppArmorSetSecurityPCILabel, ptr);
> }
>diff --git a/src/util/virpci.c b/src/util/virpci.c
>index 90617e69c6..6e6e5e47c0 100644
>--- a/src/util/virpci.c
>+++ b/src/util/virpci.c
>@@ -2478,6 +2478,74 @@ virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev)
> return g_strdup_printf("/dev/vfio/%s", groupFile);
> }
>
>+/* virPCIDeviceGetIOMMUFDDev - return the name of the device used
>+ * to control this PCI device's group (e.g. "/dev/vfio/devices/vfio15")
>+ */
>+char *
>+virPCIDeviceGetIOMMUFDDev(virPCIDevice *dev)
>+{
>+ g_autofree char *path = NULL;
>+ const char *pci_addr = NULL;
>+ g_autoptr(DIR) dir = NULL;
>+ struct dirent *entry;
>+ char *vfiodev = NULL;
>+
>+ /* Get PCI device address */
No need for this kind of comment - it's obvious from the variable and
function names.
>+ pci_addr = virPCIDeviceGetName(dev);
>+ if (!pci_addr)
>+ return NULL;
>+
>+ /* First try: look in PCI device's vfio-dev subdirectory */
>+ path = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev", pci_addr);
>+
>+ if (virDirOpen(&dir, path) == 1) {
>+ while (virDirRead(dir, &entry, path) > 0) {
>+ if (!g_str_has_prefix(entry->d_name, "vfio"))
>+ continue;
>+
>+ vfiodev = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
>+ break;
>+ }
>+ /* g_autoptr will automatically close dir when it goes out of scope */
This comment is also obvious.
>+ dir = NULL;
That does not make dir go out of scope. That's a memory leak.
We try not to mix g_auto with manual freeing of variables, so either use
two variables or two different scopes.
Jano
>+ }
>+
>+ /* Second try: scan /sys/class/vfio-dev for matching device */
>+ if (!vfiodev) {
>+ g_free(path);
>+ path = g_strdup("/sys/class/vfio-dev");
>+
On 11/21/2025 8:37 AM, Ján Tomko wrote:
> On a Monday in 2025, Nathan Chen via Devel wrote:
>> Allow access to /dev/iommu and /dev/vfio/devices/vfio* when launching
>> a qemu
>> VM with iommufd feature enabled.
>>
>> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>> ---
>> src/qemu/qemu_cgroup.c | 61 ++++++++++++++++++++++++++++
>> src/qemu/qemu_cgroup.h | 1 +
>> src/qemu/qemu_namespace.c | 44 +++++++++++++++++++++
>> src/security/security_apparmor.c | 15 +++++++
>> src/security/security_dac.c | 34 ++++++++++++++++
>> src/security/security_selinux.c | 34 ++++++++++++++++
>> src/security/virt-aa-helper.c | 11 +++++-
>> src/util/virpci.c | 68 ++++++++++++++++++++++++++++++++
>> src/util/virpci.h | 1 +
>> 9 files changed, 268 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
>> index 46a7dc1d8b..e15ffd2007 100644
>> --- a/src/qemu/qemu_cgroup.c
>> +++ b/src/qemu/qemu_cgroup.c
>> @@ -461,6 +461,54 @@ qemuTeardownInputCgroup(virDomainObj *vm,
>> }
>>
>>
>> +int
>> +qemuSetupIommufdCgroup(virDomainObj *vm)
>> +{
>> + qemuDomainObjPrivate *priv = vm->privateData;
>> + g_autoptr(DIR) dir = NULL;
>> + struct dirent *dent;
>> + g_autofree char *path = NULL;
>> + int iommufd = 0;
>> + size_t i;
>> +
>> + for (i = 0; i < vm->def->nhostdevs; i++) {
>> + if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
>> + iommufd = 1;
>> + break;
>> + }
>> + }
>> +
>> + if (iommufd == 1) {
>> + if (!virCgroupHasController(priv->cgroup,
>> VIR_CGROUP_CONTROLLER_DEVICES))
>> + return 0;
>> + if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
>> + if (errno == ENOENT)
>> + return 0;
>> + return -1;
>> + }
>> + while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
>> + if (STRPREFIX(dent->d_name, "vfio")) {
>> + path = g_strdup_printf("/dev/vfio/devices/%s", dent-
>> >d_name);
>> + }
>> + if (path &&
>> + qemuCgroupAllowDevicePath(vm, path,
>> + VIR_CGROUP_DEVICE_RW,
>> false) < 0) {
>> + return -1;
>
> This allows all the devices instead of just the ones the VM needs.
>
> Also, this is still a hostdev, so it should be done inside
> qemuSetupHostdevCgroup.
>
> Do hostdevs using iommufd also need access to
> a) /dev/vfio/vfio and
> b) /dev/vfio/<iommugroup>
> which were already allowed in qemuSetupHostdevCgroup?
>
>
They don't need access to these. I'll add a check to avoid providing
access to these if iommufd is specified.
>
>> + }
>> + path = NULL;
>> + }
>
>> + if (virFileExists("/dev/iommu"))
>> + path = g_strdup("/dev/iommu");
>
> No need to check for the existence of the device. If it does not exist,
> the VM won't start anyway.
>
> Also, is it necessary to allow these? libvirt already opened the files
> and passed file descriptors.
>
Thanks for catching this, I had included this cgroup and namespace logic
from earlier patches when we did not pass the file descriptors. I'll
exclude it in the next revision.
>> + if (path &&
>> + qemuCgroupAllowDevicePath(vm, path,
>> + VIR_CGROUP_DEVICE_RW, false) <
>> 0) {
>> + return -1;
>> + }
>> + }
>> + return 0;
>> +}
>> +
>> +
>> /**
>> * qemuSetupHostdevCgroup:
>> * vm: domain object
>> @@ -759,6 +807,7 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
>> g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(priv-
>> >driver);
>> const char *const *deviceACL = (const char *const *) cfg-
>> >cgroupDeviceACL;
>> int rv = -1;
>> + int iommufd = 0;
>> size_t i;
>>
>> if (!virCgroupHasController(priv->cgroup,
>> VIR_CGROUP_CONTROLLER_DEVICES))
>> @@ -836,6 +885,18 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
>> return -1;
>> }
>>
>> + for (i = 0; i < vm->def->nhostdevs; i++) {
>> + if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
>> + iommufd = 1;
>> + break;
>> + }
>> + }
>> +
>
> No need to check this upfront. If /dev/iommu access is necessary, the
> per-hostdev function qemuSetupHostdevCgroup can add it to the list
> multiple times, like it already does for /dev/vfio/vfio
>
That makes sense, I will remove this.
>> + if (iommufd == 1) {
>> + if (qemuSetupIommufdCgroup(vm) < 0)
>> + return -1;
>> + }
>> +
>> for (i = 0; i < vm->def->nmems; i++) {
>> if (qemuSetupMemoryDevicesCgroup(vm, vm->def->mems[i]) < 0)
>> return -1;
>> diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h
>> index 3668034cde..bea677ba3c 100644
>> --- a/src/qemu/qemu_cgroup.h
>> +++ b/src/qemu/qemu_cgroup.h
>> @@ -42,6 +42,7 @@ int qemuSetupHostdevCgroup(virDomainObj *vm,
>> int qemuTeardownHostdevCgroup(virDomainObj *vm,
>> virDomainHostdevDef *dev)
>> G_GNUC_WARN_UNUSED_RESULT;
>> +int qemuSetupIommufdCgroup(virDomainObj *vm);
>> int qemuSetupMemoryDevicesCgroup(virDomainObj *vm,
>> virDomainMemoryDef *mem);
>> int qemuTeardownMemoryDevicesCgroup(virDomainObj *vm,
>> diff --git a/src/qemu/qemu_namespace.c b/src/qemu/qemu_namespace.c
>> index 932777505b..80496f2f0f 100644
>> --- a/src/qemu/qemu_namespace.c
>> +++ b/src/qemu/qemu_namespace.c
>> @@ -683,6 +683,47 @@ qemuDomainSetupLaunchSecurity(virDomainObj *vm,
>> }
>>
>>
>> +static int
>> +qemuDomainSetupIommufd(virDomainObj *vm,
>> + GSList **paths)
>> +{
>> + g_autoptr(DIR) dir = NULL;
>> + struct dirent *dent;
>> + g_autofree char *path = NULL;
>> + int iommufd = 0;
>> + size_t i;
>> +
>> + for (i = 0; i < vm->def->nhostdevs; i++) {
>> + if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
>> + iommufd = 1;
>> + break;
>> + }
>> + }
>> +
>> + /* Check if iommufd is enabled */
>> + if (iommufd == 1) {
>> + if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
>> + if (errno == ENOENT)
>> + return 0;
>> + return -1;
>> + }
>> + while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
>> + if (STRPREFIX(dent->d_name, "vfio")) {
>> + path = g_strdup_printf("/dev/vfio/devices/%s", dent-
>> >d_name);
>> + *paths = g_slist_prepend(*paths,
>> g_steal_pointer(&path));
>> + }
>> + }
>> + path = NULL;
>> + if (virFileExists("/dev/iommu"))
>> + path = g_strdup("/dev/iommu");
>> + if (path)
>> + *paths = g_slist_prepend(*paths, g_steal_pointer(&path));
>
> Same comments as for cgroups apply here too.
>
Ok, I will update the namespace logic as well.
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +
>> static int
>> qemuNamespaceMknodPaths(virDomainObj *vm,
>> GSList *paths,
>> @@ -706,6 +747,9 @@ qemuDomainBuildNamespace(virQEMUDriverConfig *cfg,
>> if (qemuDomainSetupAllDisks(vm, &paths) < 0)
>> return -1;
>>
>> + if (qemuDomainSetupIommufd(vm, &paths) < 0)
>> + return -1;
>> +
>> if (qemuDomainSetupAllHostdevs(vm, &paths) < 0)
>> return -1;
>>
>> diff --git a/src/security/security_apparmor.c b/src/security/
>> security_apparmor.c
>> index 68ac39611f..0a878fd205 100644
>> --- a/src/security/security_apparmor.c
>> +++ b/src/security/security_apparmor.c
>> @@ -856,6 +856,21 @@
>> AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr,
>> }
>> ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr);
>> VIR_FREE(vfioGroupDev);
>> +
>> + if (dev->source.subsys.u.pci.driver.iommufd) {
>> + g_autofree char *vfiofdDev =
>> virPCIDeviceGetIOMMUFDDev(pci);
>> + const char *iommufdDir = "/dev/iommu";
>> + if (vfiofdDev) {
>> + int ret2 = AppArmorSetSecurityPCILabel(pci,
>> vfiofdDev, ptr);
>> + if (ret2 < 0)
>> + ret = ret2;
>> + ret2 = AppArmorSetSecurityPCILabel(pci,
>> iommufdDir, ptr);
>> + if (ret2 < 0)
>> + ret = ret2;
>> + } else {
>> + return -1;
>> + }
>> + }
>> } else {
>> ret = virPCIDeviceFileIterate(pci,
>> AppArmorSetSecurityPCILabel, ptr);
>> }
>> diff --git a/src/util/virpci.c b/src/util/virpci.c
>> index 90617e69c6..6e6e5e47c0 100644
>> --- a/src/util/virpci.c
>> +++ b/src/util/virpci.c
>> @@ -2478,6 +2478,74 @@ virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev)
>> return g_strdup_printf("/dev/vfio/%s", groupFile);
>> }
>>
>> +/* virPCIDeviceGetIOMMUFDDev - return the name of the device used
>> + * to control this PCI device's group (e.g. "/dev/vfio/devices/vfio15")
>> + */
>> +char *
>> +virPCIDeviceGetIOMMUFDDev(virPCIDevice *dev)
>> +{
>> + g_autofree char *path = NULL;
>> + const char *pci_addr = NULL;
>> + g_autoptr(DIR) dir = NULL;
>> + struct dirent *entry;
>> + char *vfiodev = NULL;
>> +
>> + /* Get PCI device address */
>
> No need for this kind of comment - it's obvious from the variable and
> function names.
>
>> + pci_addr = virPCIDeviceGetName(dev);
>> + if (!pci_addr)
>> + return NULL;
>> +
>> + /* First try: look in PCI device's vfio-dev subdirectory */
>> + path = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev",
>> pci_addr);
>> +
>> + if (virDirOpen(&dir, path) == 1) {
>> + while (virDirRead(dir, &entry, path) > 0) {
>> + if (!g_str_has_prefix(entry->d_name, "vfio"))
>> + continue;
>> +
>> + vfiodev = g_strdup_printf("/dev/vfio/devices/%s", entry-
>> >d_name);
>> + break;
>> + }
>> + /* g_autoptr will automatically close dir when it goes out of
>> scope */
>
> This comment is also obvious.
>
Yes agreed, I will exclude the obvious comments.
>> + dir = NULL;
>
> That does not make dir go out of scope. That's a memory leak.
>
> We try not to mix g_auto with manual freeing of variables, so either use
> two variables or two different scopes.
I see, I will use two variables.
Thanks,
Nathan
© 2016 - 2026 Red Hat, Inc.