From: Nathan Chen <nathanc@nvidia.com>
Integrate and use the IOMMU_OPTION_RLIMIT_MODE
ioctl to set per-process memory accounting for
iommufd. This prevents ENOMEM errors from the
default per-user memory accounting when multiple
VMs under the libvirt-qemu user have their pinned
memory summed and checked against a per-process
RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
po/POTFILES | 1 +
src/libvirt_private.syms | 4 ++
src/util/meson.build | 1 +
src/util/viriommufd.c | 127 +++++++++++++++++++++++++++++++++++++++
src/util/viriommufd.h | 27 +++++++++
5 files changed, 160 insertions(+)
create mode 100644 src/util/viriommufd.c
create mode 100644 src/util/viriommufd.h
diff --git a/po/POTFILES b/po/POTFILES
index f0aad35c8c..c78d2b8000 100644
--- a/po/POTFILES
+++ b/po/POTFILES
@@ -303,6 +303,7 @@ src/util/virhostuptime.c
src/util/viridentity.c
src/util/virinhibitor.c
src/util/virinitctl.c
+src/util/viriommufd.c
src/util/viriscsi.c
src/util/virjson.c
src/util/virlease.c
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 4e57e4a8f6..a8eadbfb8a 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -2652,6 +2652,10 @@ virInhibitorRelease;
virInitctlFifos;
virInitctlSetRunLevel;
+# util/viriommufd.h
+virIOMMUFDSetRLimitMode;
+virIOMMUFDSupported;
+
# util/viriscsi.h
virISCSIConnectionLogin;
virISCSIConnectionLogout;
diff --git a/src/util/meson.build b/src/util/meson.build
index 4950a795cc..9fb0aa0fe7 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -46,6 +46,7 @@ util_sources = [
'viridentity.c',
'virinhibitor.c',
'virinitctl.c',
+ 'viriommufd.c',
'viriscsi.c',
'virjson.c',
'virkeycode.c',
diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
new file mode 100644
index 0000000000..0f87f95330
--- /dev/null
+++ b/src/util/viriommufd.c
@@ -0,0 +1,127 @@
+#include <config.h>
+
+#include "viriommufd.h"
+#include "virlog.h"
+#include "virerror.h"
+#include "virfile.h"
+
+/* Kept outside the platform guard: the non-Linux stubs at the end of this
+ * file call virReportError(), which expands VIR_FROM_THIS. */
+#define VIR_FROM_THIS VIR_FROM_NONE
+
+#ifdef __linux__
+
+# include <sys/ioctl.h>
+# include <linux/types.h>
+
+# ifdef WITH_LINUX_IOMMUFD_H
+#  include <linux/iommufd.h>
+# endif
+
+VIR_LOG_INIT("util.iommufd");
+
+/* Fallback copies of the iommufd uAPI bits, used only when no header defined IOMMU_OPTION. */
+# ifndef IOMMU_OPTION
+enum iommufd_option {
+    IOMMU_OPTION_RLIMIT_MODE = 0,
+    IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+enum iommufd_option_ops {
+    IOMMU_OPTION_OP_SET = 0,
+    IOMMU_OPTION_OP_GET = 1,
+};
+
+struct iommu_option {
+    __u32 size;
+    __u32 option_id;
+    __u16 op;
+    __u16 __reserved;
+    __u32 object_id;
+    __aligned_u64 val64;
+};
+#  define IOMMUFD_TYPE (';')
+#  define IOMMUFD_CMD_OPTION 0x87
+#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+# endif
+
+/**
+ * virIOMMUFDSetRLimitMode:
+ * @fd: iommufd file descriptor the option is applied to
+ * @processAccounting: true selects per-process accounting, false per-user
+ *
+ * Issues the IOMMU_OPTION ioctl on @fd with IOMMU_OPTION_RLIMIT_MODE and
+ * IOMMU_OPTION_OP_SET. ENOTTY, EOPNOTSUPP and EPERM are tolerated: a warning
+ * is logged and 0 is returned. Any other errno is reported as an error.
+ *
+ * Returns: 0 if the mode was set or the failure was tolerated, -1 otherwise
+ */
+int
+virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
+{
+    const char *mode = processAccounting ? "process" : "user";
+    struct iommu_option opt = {
+        .size = sizeof(opt),
+        .option_id = IOMMU_OPTION_RLIMIT_MODE,
+        .op = IOMMU_OPTION_OP_SET,
+        .val64 = processAccounting,
+    };
+
+    if (ioctl(fd, IOMMU_OPTION, &opt) >= 0) {
+        VIR_DEBUG("Set iommufd rlimit mode to %s-based accounting", mode);
+        return 0;
+    }
+
+    /* Tolerated failures only warn, then break out to the final return 0. */
+    switch (errno) {
+    case ENOTTY:
+        VIR_WARN("IOMMU_OPTION ioctl not supported");
+        break;
+    case EOPNOTSUPP:
+        VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
+        break;
+    case EPERM:
+        VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
+                 "Per-user-based memory accounting to be used by default.");
+        break;
+
+    case EINVAL:
+        virReportSystemError(errno, "%s",
+                             _("invalid iommufd option parameters"));
+        return -1;
+    default:
+        virReportSystemError(errno, "%s",
+                             _("failed to set iommufd option"));
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * virIOMMUFDSupported:
+ *
+ * Returns: true if VIR_IOMMU_DEV_PATH exists on the host, false otherwise
+ */
+bool
+virIOMMUFDSupported(void)
+{
+    const char *node = VIR_IOMMU_DEV_PATH;
+    return virFileExists(node);
+}
+
+#else
+
+int
+virIOMMUFDSetRLimitMode(int fd G_GNUC_UNUSED, bool processAccounting G_GNUC_UNUSED)
+{
+    virReportError(VIR_ERR_NO_SUPPORT, "%s", _("IOMMUFD is not supported on this platform"));
+    return -1; /* non-Linux stub: reporting the error above is its only effect */
+}
+
+bool
+virIOMMUFDSupported(void)
+{
+    return false; /* non-Linux stub: nothing is probed, the answer is fixed */
+}
+
+#endif
diff --git a/src/util/viriommufd.h b/src/util/viriommufd.h
new file mode 100644
index 0000000000..ec6be9fa66
--- /dev/null
+++ b/src/util/viriommufd.h
@@ -0,0 +1,27 @@
+/*
+ * viriommufd.h: iommufd helpers
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "internal.h"
+
+#define VIR_IOMMU_DEV_PATH "/dev/iommu"
+/* Set the RLIMIT_MEMLOCK accounting mode of iommufd @fd: per-process if true, per-user if false. */
+int virIOMMUFDSetRLimitMode(int fd, bool processAccounting);
+/* Whether VIR_IOMMU_DEV_PATH exists on the host; the non-Linux build always answers false. */
+bool virIOMMUFDSupported(void);
--
2.43.0
On Tue, Jan 06, 2026 at 06:49:34PM -0800, Nathan Chen via Devel wrote:
> From: Nathan Chen <nathanc@nvidia.com>
>
> Integrate and use the IOMMU_OPTION_RLIMIT_MODE
> ioctl to set per-process memory accounting for
> iommufd. This prevents ENOMEM errors from the
> default per-user memory accounting when multiple
> VMs under the libvirt-qemu user have their pinned
> memory summed and checked against a per-process
> RLIMIT_MEMLOCK limit.
>
> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
> ---
> po/POTFILES | 1 +
> src/libvirt_private.syms | 4 ++
> src/util/meson.build | 1 +
> src/util/viriommufd.c | 127 +++++++++++++++++++++++++++++++++++++++
> src/util/viriommufd.h | 27 +++++++++
> 5 files changed, 160 insertions(+)
> create mode 100644 src/util/viriommufd.c
> create mode 100644 src/util/viriommufd.h
>
> diff --git a/po/POTFILES b/po/POTFILES
> index f0aad35c8c..c78d2b8000 100644
> --- a/po/POTFILES
> +++ b/po/POTFILES
> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
> src/util/viridentity.c
> src/util/virinhibitor.c
> src/util/virinitctl.c
> +src/util/viriommufd.c
> src/util/viriscsi.c
> src/util/virjson.c
> src/util/virlease.c
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index 4e57e4a8f6..a8eadbfb8a 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -2652,6 +2652,10 @@ virInhibitorRelease;
> virInitctlFifos;
> virInitctlSetRunLevel;
>
> +# util/viriommufd.h
> +virIOMMUFDSetRLimitMode;
> +virIOMMUFDSupported;
> +
> # util/viriscsi.h
> virISCSIConnectionLogin;
> virISCSIConnectionLogout;
> diff --git a/src/util/meson.build b/src/util/meson.build
> index 4950a795cc..9fb0aa0fe7 100644
> --- a/src/util/meson.build
> +++ b/src/util/meson.build
> @@ -46,6 +46,7 @@ util_sources = [
> 'viridentity.c',
> 'virinhibitor.c',
> 'virinitctl.c',
> + 'viriommufd.c',
> 'viriscsi.c',
> 'virjson.c',
> 'virkeycode.c',
> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> new file mode 100644
> index 0000000000..0f87f95330
> --- /dev/null
> +++ b/src/util/viriommufd.c
> @@ -0,0 +1,127 @@
> +#include <config.h>
> +
> +#include "viriommufd.h"
> +#include "virlog.h"
> +#include "virerror.h"
> +#include "virfile.h"
> +
> +#ifdef __linux__
> +
> +# include <sys/ioctl.h>
> +# include <linux/types.h>
> +
> +# ifdef HAVE_LINUX_IOMMUFD_H
Currently this will always be false, you need to add `linux/iommufd.h`
into headers list in the meson.build file.
> +# include <linux/iommufd.h>
> +# endif
> +
> +#define VIR_FROM_THIS VIR_FROM_NONE
Incorrect indentation. You probably don't have cppi installed, otherwise
running our tests would complain about it.
> +
> +VIR_LOG_INIT("util.iommufd");
> +
> +#ifndef IOMMU_OPTION
Same here, and the whole #ifndef block including #endif.
> +
> +enum iommufd_option {
> + IOMMU_OPTION_RLIMIT_MODE = 0,
> + IOMMU_OPTION_HUGE_PAGES = 1,
> +};
> +
> +enum iommufd_option_ops {
> + IOMMU_OPTION_OP_SET = 0,
> + IOMMU_OPTION_OP_GET = 1,
> +};
> +
> +struct iommu_option {
> + __u32 size;
> + __u32 option_id;
> + __u16 op;
> + __u16 __reserved;
> + __u32 object_id;
> + __aligned_u64 val64;
> +};
> +
> +# define IOMMUFD_TYPE (';')
> +# define IOMMUFD_CMD_OPTION 0x87
> +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> +
> +#endif
> +
> +/**
> + * virIOMMUFDSetRLimitMode:
> + * @fd: iommufd file descriptor
> + * @processAccounting: true for per-process, false for per-user
> + *
> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> + *
> + * Returns: 0 on success, -1 on error
> + */
I don't see this function used anywhere in this patch series. Later you
implement support to open /dev/iommu, should it be called there?
> +int
> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> +{
> + struct iommu_option option = {
> + .size = sizeof(struct iommu_option),
> + .option_id = IOMMU_OPTION_RLIMIT_MODE,
> + .op = IOMMU_OPTION_OP_SET,
> + .__reserved = 0,
> + .object_id = 0,
> + .val64 = processAccounting ? 1 : 0,
> + };
> +
> + if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> + switch (errno) {
> + case ENOTTY:
> + VIR_WARN("IOMMU_OPTION ioctl not supported");
> + return 0;
> +
> + case EOPNOTSUPP:
> + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
> + return 0;
> +
> + case EINVAL:
> + virReportSystemError(errno, "%s",
> + _("invalid iommufd option parameters"));
Wrong indentation.
> + return -1;
> +
> + case EPERM:
> + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
> + "Per-user-based memory accounting to be used by default.");
> + return 0;
> +
> + default:
> + virReportSystemError(errno, "%s",
> + _("failed to set iommufd option"));
Wrong indentation.
Pavel
On 1/15/2026 6:54 AM, Pavel Hrdina wrote:
> On Tue, Jan 06, 2026 at 06:49:34PM -0800, Nathan Chen via Devel wrote:
>> From: Nathan Chen<nathanc@nvidia.com>
>>
>> Integrate and use the IOMMU_OPTION_RLIMIT_MODE
>> ioctl to set per-process memory accounting for
>> iommufd. This prevents ENOMEM errors from the
>> default per-user memory accounting when multiple
>> VMs under the libvirt-qemu user have their pinned
>> memory summed and checked against a per-process
>> RLIMIT_MEMLOCK limit.
>>
>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>> ---
>> po/POTFILES | 1 +
>> src/libvirt_private.syms | 4 ++
>> src/util/meson.build | 1 +
>> src/util/viriommufd.c | 127 +++++++++++++++++++++++++++++++++++++++
>> src/util/viriommufd.h | 27 +++++++++
>> 5 files changed, 160 insertions(+)
>> create mode 100644 src/util/viriommufd.c
>> create mode 100644 src/util/viriommufd.h
>>
>> diff --git a/po/POTFILES b/po/POTFILES
>> index f0aad35c8c..c78d2b8000 100644
>> --- a/po/POTFILES
>> +++ b/po/POTFILES
>> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>> src/util/viridentity.c
>> src/util/virinhibitor.c
>> src/util/virinitctl.c
>> +src/util/viriommufd.c
>> src/util/viriscsi.c
>> src/util/virjson.c
>> src/util/virlease.c
>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>> index 4e57e4a8f6..a8eadbfb8a 100644
>> --- a/src/libvirt_private.syms
>> +++ b/src/libvirt_private.syms
>> @@ -2652,6 +2652,10 @@ virInhibitorRelease;
>> virInitctlFifos;
>> virInitctlSetRunLevel;
>>
>> +# util/viriommufd.h
>> +virIOMMUFDSetRLimitMode;
>> +virIOMMUFDSupported;
>> +
>> # util/viriscsi.h
>> virISCSIConnectionLogin;
>> virISCSIConnectionLogout;
>> diff --git a/src/util/meson.build b/src/util/meson.build
>> index 4950a795cc..9fb0aa0fe7 100644
>> --- a/src/util/meson.build
>> +++ b/src/util/meson.build
>> @@ -46,6 +46,7 @@ util_sources = [
>> 'viridentity.c',
>> 'virinhibitor.c',
>> 'virinitctl.c',
>> + 'viriommufd.c',
>> 'viriscsi.c',
>> 'virjson.c',
>> 'virkeycode.c',
>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>> new file mode 100644
>> index 0000000000..0f87f95330
>> --- /dev/null
>> +++ b/src/util/viriommufd.c
>> @@ -0,0 +1,127 @@
>> +#include <config.h>
>> +
>> +#include "viriommufd.h"
>> +#include "virlog.h"
>> +#include "virerror.h"
>> +#include "virfile.h"
>> +
>> +#ifdef __linux__
>> +
>> +# include <sys/ioctl.h>
>> +# include <linux/types.h>
>> +
>> +# ifdef HAVE_LINUX_IOMMUFD_H
> Currently this will always be false, you need to add `linux/iommufd.h`
> into headers list in the meson.build file.
>
Ok, I will include this in meson.build.
>> +# include <linux/iommufd.h>
>> +# endif
>> +
>> +#define VIR_FROM_THIS VIR_FROM_NONE
> Incorrect indentation. You probably don't have cppi installed, otherwise
> running our tests would complain about it.
>
>> +
>> +VIR_LOG_INIT("util.iommufd");
>> +
>> +#ifndef IOMMU_OPTION
> Same here, and the whole #ifndef block including #endif.
>
I'll install cppi and correct this by adding spaces to be nested inside
#ifdef __linux__, thank you for the pointer.
>> +
>> +enum iommufd_option {
>> + IOMMU_OPTION_RLIMIT_MODE = 0,
>> + IOMMU_OPTION_HUGE_PAGES = 1,
>> +};
>> +
>> +enum iommufd_option_ops {
>> + IOMMU_OPTION_OP_SET = 0,
>> + IOMMU_OPTION_OP_GET = 1,
>> +};
>> +
>> +struct iommu_option {
>> + __u32 size;
>> + __u32 option_id;
>> + __u16 op;
>> + __u16 __reserved;
>> + __u32 object_id;
>> + __aligned_u64 val64;
>> +};
>> +
>> +# define IOMMUFD_TYPE (';')
>> +# define IOMMUFD_CMD_OPTION 0x87
>> +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>> +
>> +#endif
>> +
>> +/**
>> + * virIOMMUFDSetRLimitMode:
>> + * @fd: iommufd file descriptor
>> + * @processAccounting: true for per-process, false for per-user
>> + *
>> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
>> + *
>> + * Returns: 0 on success, -1 on error
>> + */
> I don't see this function used anywhere in this patch series. Later you
> implement support to open /dev/iommu, should it be called there?
>
Yes, the call to this function was removed during rebasing. I will add
it back under [PATCH 5/7] qemu: open iommufd FD from libvirt backend.
>> +int
>> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
>> +{
>> + struct iommu_option option = {
>> + .size = sizeof(struct iommu_option),
>> + .option_id = IOMMU_OPTION_RLIMIT_MODE,
>> + .op = IOMMU_OPTION_OP_SET,
>> + .__reserved = 0,
>> + .object_id = 0,
>> + .val64 = processAccounting ? 1 : 0,
>> + };
>> +
>> + if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>> + switch (errno) {
>> + case ENOTTY:
>> + VIR_WARN("IOMMU_OPTION ioctl not supported");
>> + return 0;
>> +
>> + case EOPNOTSUPP:
>> + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
>> + return 0;
>> +
>> + case EINVAL:
>> + virReportSystemError(errno, "%s",
>> + _("invalid iommufd option parameters"));
> Wrong indentation.
>
>> + return -1;
>> +
>> + case EPERM:
>> + VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
>> + "Per-user-based memory accounting to be used by default.");
>> + return 0;
>> +
>> + default:
>> + virReportSystemError(errno, "%s",
>> + _("failed to set iommufd option"));
> Wrong indentation.
>
I will indent it one more space so it starts after the opening parenthesis.
Nathan
© 2016 - 2026 Red Hat, Inc.