[PATCH v6 3/7] qemu: Support per-process memory accounting for iommufd

Nathan Chen via Devel posted 7 patches 2 days, 10 hours ago
[PATCH v6 3/7] qemu: Support per-process memory accounting for iommufd
Posted by Nathan Chen via Devel 2 days, 10 hours ago
From: Nathan Chen <nathanc@nvidia.com>

Add a helper that issues the IOMMU_OPTION ioctl with
IOMMU_OPTION_RLIMIT_MODE to select per-process memory
accounting for an iommufd. With the default per-user
accounting, the pinned memory of all VMs running under
the libvirt-qemu user is summed and checked against the
per-process RLIMIT_MEMLOCK limit, which can cause ENOMEM
errors when multiple VMs are running.

Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
 meson.build              |   1 +
 po/POTFILES              |   1 +
 src/libvirt_private.syms |   3 ++
 src/util/meson.build     |   1 +
 src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
 src/util/viriommufd.h    |  25 +++++++++
 6 files changed, 142 insertions(+)
 create mode 100644 src/util/viriommufd.c
 create mode 100644 src/util/viriommufd.h

diff --git a/meson.build b/meson.build
index 964d1fa4e1..a6db70f13e 100644
--- a/meson.build
+++ b/meson.build
@@ -732,6 +732,7 @@ headers = [
   'ifaddrs.h',
   'libtasn1.h',
   'linux/kvm.h',
+  'linux/iommufd.h',
   'mntent.h',
   'net/ethernet.h',
   'net/if.h',
diff --git a/po/POTFILES b/po/POTFILES
index f0aad35c8c..c78d2b8000 100644
--- a/po/POTFILES
+++ b/po/POTFILES
@@ -303,6 +303,7 @@ src/util/virhostuptime.c
 src/util/viridentity.c
 src/util/virinhibitor.c
 src/util/virinitctl.c
+src/util/viriommufd.c
 src/util/viriscsi.c
 src/util/virjson.c
 src/util/virlease.c
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 6bffd2eb6d..7fa76a1ec3 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -2646,6 +2646,9 @@ virInhibitorRelease;
 virInitctlFifos;
 virInitctlSetRunLevel;
 
+# util/viriommufd.h
+virIOMMUFDSetRLimitMode;
+
 # util/viriscsi.h
 virISCSIConnectionLogin;
 virISCSIConnectionLogout;
diff --git a/src/util/meson.build b/src/util/meson.build
index 4950a795cc..9fb0aa0fe7 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -46,6 +46,7 @@ util_sources = [
   'viridentity.c',
   'virinhibitor.c',
   'virinitctl.c',
+  'viriommufd.c',
   'viriscsi.c',
   'virjson.c',
   'virkeycode.c',
diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
new file mode 100644
index 0000000000..b44bc8ed1d
--- /dev/null
+++ b/src/util/viriommufd.c
@@ -0,0 +1,111 @@
+#include <config.h>
+
+#include "viriommufd.h"
+#include "virlog.h"
+#include "virerror.h"
+#include "virfile.h"
+
+#define VIR_FROM_THIS VIR_FROM_NONE
+
+VIR_LOG_INIT("util.iommufd");
+
+#ifdef __linux__
+
+# include <sys/ioctl.h>
+# include <linux/types.h>
+
+# ifdef HAVE_LINUX_IOMMUFD_H
+#  include <linux/iommufd.h>
+# endif
+
+# ifndef IOMMU_OPTION
+
+enum iommufd_option {
+    IOMMU_OPTION_RLIMIT_MODE = 0,
+    IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+enum iommufd_option_ops {
+    IOMMU_OPTION_OP_SET = 0,
+    IOMMU_OPTION_OP_GET = 1,
+};
+
+struct iommu_option {
+    __u32 size;
+    __u32 option_id;
+    __u16 op;
+    __u16 __reserved;
+    __u32 object_id;
+    __aligned_u64 val64;
+};
+
+#  define IOMMUFD_TYPE (';')
+#  define IOMMUFD_CMD_OPTION 0x87
+#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+
+# endif
+
+/**
+ * virIOMMUFDSetRLimitMode:
+ * @fd: iommufd file descriptor
+ * @processAccounting: true for per-process, false for per-user
+ *
+ * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int
+virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
+{
+    struct iommu_option option = {
+        .size = sizeof(struct iommu_option),
+        .option_id = IOMMU_OPTION_RLIMIT_MODE,
+        .op = IOMMU_OPTION_OP_SET,
+        .__reserved = 0,
+        .object_id = 0,
+        .val64 = processAccounting ? 1 : 0,
+    };
+
+    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
+        switch (errno) {
+            case ENOTTY:
+                VIR_WARN("IOMMU_OPTION ioctl not supported");
+                return -1;
+
+            case EOPNOTSUPP:
+                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
+                return -1;
+
+            case EINVAL:
+                virReportSystemError(errno, "%s",
+                                     _("invalid iommufd option parameters"));
+                return -1;
+
+            case EPERM:
+                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
+                         "Per-user-based memory accounting to be used by default.");
+                return -1;
+
+            default:
+                virReportSystemError(errno, "%s",
+                                     _("failed to set iommufd option"));
+                return -1;
+        }
+    }
+
+    VIR_DEBUG("Set iommufd rlimit mode to %s-based accounting",
+              processAccounting ? "process" : "user");
+    return 0;
+}
+
+#else
+
+int virIOMMUFDSetRLimitMode(int fd G_GNUC_UNUSED,
+                            bool processAccounting G_GNUC_UNUSED)
+{
+    virReportError(VIR_ERR_NO_SUPPORT, "%s",
+                   _("IOMMUFD is not supported on this platform"));
+    return -1;
+}
+
+#endif
diff --git a/src/util/viriommufd.h b/src/util/viriommufd.h
new file mode 100644
index 0000000000..ebecfe3633
--- /dev/null
+++ b/src/util/viriommufd.h
@@ -0,0 +1,25 @@
+/*
+ * viriommufd.h: iommufd helpers
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "internal.h"
+
+#define VIR_IOMMU_DEV_PATH "/dev/iommu"
+
+int virIOMMUFDSetRLimitMode(int fd, bool processAccounting);
-- 
2.43.0
Re: [PATCH v6 3/7] qemu: Support per-process memory accounting for iommufd
Posted by Pavel Hrdina via Devel 2 days, 8 hours ago
On Fri, Jan 30, 2026 at 10:59:14AM -0800, Nathan Chen via Devel wrote:
> From: Nathan Chen <nathanc@nvidia.com>
> 
> Implement the IOMMU_OPTION_RLIMIT_MODE
> ioctl to set per-process memory accounting for
> iommufd. This prevents ENOMEM errors from the
> default per-user memory accounting when multiple
> VMs under the libvirt-qemu user have their pinned
> memory summed and checked against a per-process
> RLIMIT_MEMLOCK limit.
> 
> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
> ---
>  meson.build              |   1 +
>  po/POTFILES              |   1 +
>  src/libvirt_private.syms |   3 ++
>  src/util/meson.build     |   1 +
>  src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>  src/util/viriommufd.h    |  25 +++++++++
>  6 files changed, 142 insertions(+)
>  create mode 100644 src/util/viriommufd.c
>  create mode 100644 src/util/viriommufd.h

[...]

> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> new file mode 100644
> index 0000000000..b44bc8ed1d
> --- /dev/null
> +++ b/src/util/viriommufd.c
> @@ -0,0 +1,111 @@
> +#include <config.h>
> +
> +#include "viriommufd.h"
> +#include "virlog.h"
> +#include "virerror.h"
> +#include "virfile.h"
> +
> +#define VIR_FROM_THIS VIR_FROM_NONE
> +
> +VIR_LOG_INIT("util.iommufd");
> +
> +#ifdef __linux__
> +
> +# include <sys/ioctl.h>
> +# include <linux/types.h>
> +
> +# ifdef HAVE_LINUX_IOMMUFD_H
> +#  include <linux/iommufd.h>
> +# endif
> +
> +# ifndef IOMMU_OPTION
> +
> +enum iommufd_option {
> +    IOMMU_OPTION_RLIMIT_MODE = 0,
> +    IOMMU_OPTION_HUGE_PAGES = 1,
> +};
> +
> +enum iommufd_option_ops {
> +    IOMMU_OPTION_OP_SET = 0,
> +    IOMMU_OPTION_OP_GET = 1,
> +};
> +
> +struct iommu_option {
> +    __u32 size;
> +    __u32 option_id;
> +    __u16 op;
> +    __u16 __reserved;
> +    __u32 object_id;
> +    __aligned_u64 val64;
> +};
> +
> +#  define IOMMUFD_TYPE (';')
> +#  define IOMMUFD_CMD_OPTION 0x87
> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> +
> +# endif
> +
> +/**
> + * virIOMMUFDSetRLimitMode:
> + * @fd: iommufd file descriptor
> + * @processAccounting: true for per-process, false for per-user
> + *
> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> + *
> + * Returns: 0 on success, -1 on error
> + */
> +int
> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> +{
> +    struct iommu_option option = {
> +        .size = sizeof(struct iommu_option),
> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
> +        .op = IOMMU_OPTION_OP_SET,
> +        .__reserved = 0,
> +        .object_id = 0,
> +        .val64 = processAccounting ? 1 : 0,
> +    };
> +
> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> +        switch (errno) {
> +            case ENOTTY:
> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
> +                return -1;
> +
> +            case EOPNOTSUPP:
> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
> +                return -1;
> +
> +            case EINVAL:
> +                virReportSystemError(errno, "%s",
> +                                     _("invalid iommufd option parameters"));
> +                return -1;
> +
> +            case EPERM:
> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
> +                         "Per-user-based memory accounting to be used by default.");
> +                return -1;
> +
> +            default:
> +                virReportSystemError(errno, "%s",
> +                                     _("failed to set iommufd option"));
> +                return -1;
> +        }
> +    }

When we return -1 we should also report an error instead of only logging a
warning. I can fix it before pushing; there are two options:

 - Keep the switch() with its customized messages, but call
   virReportSystemError() in every case.

 - Or remove the switch and call virReportSystemError() once, since it
   already appends the string representation of errno:

     if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
         virReportSystemError(errno, "%s",
                              _("failed to set memory accounting for iommufd"));
         return -1;
     }

Pavel
Re: [PATCH v6 3/7] qemu: Support per-process memory accounting for iommufd
Posted by Nathan Chen via Devel 2 days, 1 hour ago

On 1/30/2026 1:43 PM, Pavel Hrdina wrote:
> On Fri, Jan 30, 2026 at 10:59:14AM -0800, Nathan Chen via Devel wrote:
>> From: Nathan Chen<nathanc@nvidia.com>
>>
>> Implement the IOMMU_OPTION_RLIMIT_MODE
>> ioctl to set per-process memory accounting for
>> iommufd. This prevents ENOMEM errors from the
>> default per-user memory accounting when multiple
>> VMs under the libvirt-qemu user have their pinned
>> memory summed and checked against a per-process
>> RLIMIT_MEMLOCK limit.
>>
>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>> ---
>>   meson.build              |   1 +
>>   po/POTFILES              |   1 +
>>   src/libvirt_private.syms |   3 ++
>>   src/util/meson.build     |   1 +
>>   src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>>   src/util/viriommufd.h    |  25 +++++++++
>>   6 files changed, 142 insertions(+)
>>   create mode 100644 src/util/viriommufd.c
>>   create mode 100644 src/util/viriommufd.h
> [...]
> 
>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>> new file mode 100644
>> index 0000000000..b44bc8ed1d
>> --- /dev/null
>> +++ b/src/util/viriommufd.c
>> @@ -0,0 +1,111 @@
>> +#include <config.h>
>> +
>> +#include "viriommufd.h"
>> +#include "virlog.h"
>> +#include "virerror.h"
>> +#include "virfile.h"
>> +
>> +#define VIR_FROM_THIS VIR_FROM_NONE
>> +
>> +VIR_LOG_INIT("util.iommufd");
>> +
>> +#ifdef __linux__
>> +
>> +# include <sys/ioctl.h>
>> +# include <linux/types.h>
>> +
>> +# ifdef HAVE_LINUX_IOMMUFD_H
>> +#  include <linux/iommufd.h>
>> +# endif
>> +
>> +# ifndef IOMMU_OPTION
>> +
>> +enum iommufd_option {
>> +    IOMMU_OPTION_RLIMIT_MODE = 0,
>> +    IOMMU_OPTION_HUGE_PAGES = 1,
>> +};
>> +
>> +enum iommufd_option_ops {
>> +    IOMMU_OPTION_OP_SET = 0,
>> +    IOMMU_OPTION_OP_GET = 1,
>> +};
>> +
>> +struct iommu_option {
>> +    __u32 size;
>> +    __u32 option_id;
>> +    __u16 op;
>> +    __u16 __reserved;
>> +    __u32 object_id;
>> +    __aligned_u64 val64;
>> +};
>> +
>> +#  define IOMMUFD_TYPE (';')
>> +#  define IOMMUFD_CMD_OPTION 0x87
>> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>> +
>> +# endif
>> +
>> +/**
>> + * virIOMMUFDSetRLimitMode:
>> + * @fd: iommufd file descriptor
>> + * @processAccounting: true for per-process, false for per-user
>> + *
>> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
>> + *
>> + * Returns: 0 on success, -1 on error
>> + */
>> +int
>> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
>> +{
>> +    struct iommu_option option = {
>> +        .size = sizeof(struct iommu_option),
>> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
>> +        .op = IOMMU_OPTION_OP_SET,
>> +        .__reserved = 0,
>> +        .object_id = 0,
>> +        .val64 = processAccounting ? 1 : 0,
>> +    };
>> +
>> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>> +        switch (errno) {
>> +            case ENOTTY:
>> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
>> +                return -1;
>> +
>> +            case EOPNOTSUPP:
>> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
>> +                return -1;
>> +
>> +            case EINVAL:
>> +                virReportSystemError(errno, "%s",
>> +                                     _("invalid iommufd option parameters"));
>> +                return -1;
>> +
>> +            case EPERM:
>> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
>> +                         "Per-user-based memory accounting to be used by default.");
>> +                return -1;
>> +
>> +            default:
>> +                virReportSystemError(errno, "%s",
>> +                                     _("failed to set iommufd option"));
>> +                return -1;
>> +        }
>> +    }
> When we return -1 we should also set error instead of logging warning.
> I can fix it before pushing, there are two options:
> 
>   - We will keep the switch() and keep the customized messages and
>     call virReportSystemError() for each case
> 
>   - Or we can remove the switch and call virReportSystemError that already
>     adds string representation of errno:
> 
>       if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>           virReportSystemError(errno, "%s",
>                                _("failed to set memory accounting for iommufd"));
>           return -1;
>       }

Thanks, I think we can go with the second option of removing the switch;
it is simpler and matches the pattern used for ioctl failures elsewhere
in libvirt.

Nathan