[v5] qemu: Implement support for iommufd

[PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Nathan Chen via Devel 2 weeks, 2 days ago

From: Nathan Chen <nathanc@nvidia.com>

Implement the IOMMU_OPTION_RLIMIT_MODE
ioctl to set per-process memory accounting for
iommufd. This prevents ENOMEM errors from the
default per-user memory accounting when multiple
VMs under the libvirt-qemu user have their pinned
memory summed and checked against a per-process
RLIMIT_MEMLOCK limit.

Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
 meson.build              |   1 +
 po/POTFILES              |   1 +
 src/libvirt_private.syms |   3 ++
 src/util/meson.build     |   1 +
 src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
 src/util/viriommufd.h    |  25 +++++++++
 6 files changed, 142 insertions(+)
 create mode 100644 src/util/viriommufd.c
 create mode 100644 src/util/viriommufd.h

diff --git a/meson.build b/meson.build
index 964d1fa4e1..a6db70f13e 100644
--- a/meson.build
+++ b/meson.build
@@ -732,6 +732,7 @@ headers = [
   'ifaddrs.h',
   'libtasn1.h',
   'linux/kvm.h',
+  'linux/iommufd.h',
   'mntent.h',
   'net/ethernet.h',
   'net/if.h',
diff --git a/po/POTFILES b/po/POTFILES
index f0aad35c8c..c78d2b8000 100644
--- a/po/POTFILES
+++ b/po/POTFILES
@@ -303,6 +303,7 @@ src/util/virhostuptime.c
 src/util/viridentity.c
 src/util/virinhibitor.c
 src/util/virinitctl.c
+src/util/viriommufd.c
 src/util/viriscsi.c
 src/util/virjson.c
 src/util/virlease.c
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 6bffd2eb6d..7fa76a1ec3 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -2646,6 +2646,9 @@ virInhibitorRelease;
 virInitctlFifos;
 virInitctlSetRunLevel;
 
+# util/viriommufd.h
+virIOMMUFDSetRLimitMode;
+
 # util/viriscsi.h
 virISCSIConnectionLogin;
 virISCSIConnectionLogout;
diff --git a/src/util/meson.build b/src/util/meson.build
index 4950a795cc..9fb0aa0fe7 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -46,6 +46,7 @@ util_sources = [
   'viridentity.c',
   'virinhibitor.c',
   'virinitctl.c',
+  'viriommufd.c',
   'viriscsi.c',
   'virjson.c',
   'virkeycode.c',
diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
new file mode 100644
index 0000000000..225c76f4b2
--- /dev/null
+++ b/src/util/viriommufd.c
@@ -0,0 +1,111 @@
+#include <config.h>
+
+#include "viriommufd.h"
+#include "virlog.h"
+#include "virerror.h"
+#include "virfile.h"
+
+#ifdef __linux__
+
+# include <sys/ioctl.h>
+# include <linux/types.h>
+
+# ifdef HAVE_LINUX_IOMMUFD_H
+#  include <linux/iommufd.h>
+# endif
+
+# define VIR_FROM_THIS VIR_FROM_NONE
+
+VIR_LOG_INIT("util.iommufd");
+
+# ifndef IOMMU_OPTION
+
+enum iommufd_option {
+    IOMMU_OPTION_RLIMIT_MODE = 0,
+    IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+enum iommufd_option_ops {
+    IOMMU_OPTION_OP_SET = 0,
+    IOMMU_OPTION_OP_GET = 1,
+};
+
+struct iommu_option {
+    __u32 size;
+    __u32 option_id;
+    __u16 op;
+    __u16 __reserved;
+    __u32 object_id;
+    __aligned_u64 val64;
+};
+
+#  define IOMMUFD_TYPE (';')
+#  define IOMMUFD_CMD_OPTION 0x87
+#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+
+# endif
+
+/**
+ * virIOMMUFDSetRLimitMode:
+ * @fd: iommufd file descriptor
+ * @processAccounting: true for per-process, false for per-user
+ *
+ * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int
+virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
+{
+    struct iommu_option option = {
+        .size = sizeof(struct iommu_option),
+        .option_id = IOMMU_OPTION_RLIMIT_MODE,
+        .op = IOMMU_OPTION_OP_SET,
+        .__reserved = 0,
+        .object_id = 0,
+        .val64 = processAccounting ? 1 : 0,
+    };
+
+    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
+        switch (errno) {
+            case ENOTTY:
+                VIR_WARN("IOMMU_OPTION ioctl not supported");
+                return 0;
+
+            case EOPNOTSUPP:
+                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
+                return 0;
+
+            case EINVAL:
+                virReportSystemError(errno, "%s",
+                                     _("invalid iommufd option parameters"));
+                return -1;
+
+            case EPERM:
+                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
+                         "Per-user-based memory accounting to be used by default.");
+                return 0;
+
+            default:
+                virReportSystemError(errno, "%s",
+                                     _("failed to set iommufd option"));
+                return -1;
+        }
+    }
+
+    VIR_DEBUG("Set iommufd rlimit mode to %s-based accounting",
+              processAccounting ? "process" : "user");
+    return 0;
+}
+
+#else
+
+int virIOMMUFDSetRLimitMode(int fd G_GNUC_UNUSED,
+                            bool processAccounting G_GNUC_UNUSED)
+{
+    virReportError(VIR_ERR_NO_SUPPORT, "%s",
+                   _("IOMMUFD is not supported on this platform"));
+    return -1;
+}
+
+#endif
diff --git a/src/util/viriommufd.h b/src/util/viriommufd.h
new file mode 100644
index 0000000000..ebecfe3633
--- /dev/null
+++ b/src/util/viriommufd.h
@@ -0,0 +1,25 @@
+/*
+ * viriommufd.h: iommufd helpers
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "internal.h"
+
+#define VIR_IOMMU_DEV_PATH "/dev/iommu"
+
+int virIOMMUFDSetRLimitMode(int fd, bool processAccounting);
-- 
2.43.0

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Pavel Hrdina via Devel 1 week, 5 days ago

On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
> From: Nathan Chen <nathanc@nvidia.com>
> 
> Implement the IOMMU_OPTION_RLIMIT_MODE
> ioctl to set per-process memory accounting for
> iommufd. This prevents ENOMEM errors from the
> default per-user memory accounting when multiple
> VMs under the libvirt-qemu user have their pinned
> memory summed and checked against a per-process
> RLIMIT_MEMLOCK limit.
> 
> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
> ---
>  meson.build              |   1 +
>  po/POTFILES              |   1 +
>  src/libvirt_private.syms |   3 ++
>  src/util/meson.build     |   1 +
>  src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>  src/util/viriommufd.h    |  25 +++++++++
>  6 files changed, 142 insertions(+)
>  create mode 100644 src/util/viriommufd.c
>  create mode 100644 src/util/viriommufd.h
> 
> diff --git a/meson.build b/meson.build
> index 964d1fa4e1..a6db70f13e 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -732,6 +732,7 @@ headers = [
>    'ifaddrs.h',
>    'libtasn1.h',
>    'linux/kvm.h',
> +  'linux/iommufd.h',
>    'mntent.h',
>    'net/ethernet.h',
>    'net/if.h',
> diff --git a/po/POTFILES b/po/POTFILES
> index f0aad35c8c..c78d2b8000 100644
> --- a/po/POTFILES
> +++ b/po/POTFILES
> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>  src/util/viridentity.c
>  src/util/virinhibitor.c
>  src/util/virinitctl.c
> +src/util/viriommufd.c
>  src/util/viriscsi.c
>  src/util/virjson.c
>  src/util/virlease.c
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index 6bffd2eb6d..7fa76a1ec3 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>  virInitctlFifos;
>  virInitctlSetRunLevel;
>  
> +# util/viriommufd.h
> +virIOMMUFDSetRLimitMode;
> +
>  # util/viriscsi.h
>  virISCSIConnectionLogin;
>  virISCSIConnectionLogout;
> diff --git a/src/util/meson.build b/src/util/meson.build
> index 4950a795cc..9fb0aa0fe7 100644
> --- a/src/util/meson.build
> +++ b/src/util/meson.build
> @@ -46,6 +46,7 @@ util_sources = [
>    'viridentity.c',
>    'virinhibitor.c',
>    'virinitctl.c',
> +  'viriommufd.c',
>    'viriscsi.c',
>    'virjson.c',
>    'virkeycode.c',
> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> new file mode 100644
> index 0000000000..225c76f4b2
> --- /dev/null
> +++ b/src/util/viriommufd.c
> @@ -0,0 +1,111 @@
> +#include <config.h>
> +
> +#include "viriommufd.h"
> +#include "virlog.h"
> +#include "virerror.h"
> +#include "virfile.h"
> +
> +#ifdef __linux__
> +
> +# include <sys/ioctl.h>
> +# include <linux/types.h>
> +
> +# ifdef HAVE_LINUX_IOMMUFD_H
> +#  include <linux/iommufd.h>
> +# endif
> +
> +# define VIR_FROM_THIS VIR_FROM_NONE
> +
> +VIR_LOG_INIT("util.iommufd");
> +
> +# ifndef IOMMU_OPTION
> +
> +enum iommufd_option {
> +    IOMMU_OPTION_RLIMIT_MODE = 0,
> +    IOMMU_OPTION_HUGE_PAGES = 1,
> +};
> +
> +enum iommufd_option_ops {
> +    IOMMU_OPTION_OP_SET = 0,
> +    IOMMU_OPTION_OP_GET = 1,
> +};
> +
> +struct iommu_option {
> +    __u32 size;
> +    __u32 option_id;
> +    __u16 op;
> +    __u16 __reserved;
> +    __u32 object_id;
> +    __aligned_u64 val64;
> +};
> +
> +#  define IOMMUFD_TYPE (';')
> +#  define IOMMUFD_CMD_OPTION 0x87
> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> +
> +# endif
> +
> +/**
> + * virIOMMUFDSetRLimitMode:
> + * @fd: iommufd file descriptor
> + * @processAccounting: true for per-process, false for per-user
> + *
> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> + *
> + * Returns: 0 on success, -1 on error
> + */
> +int
> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> +{
> +    struct iommu_option option = {
> +        .size = sizeof(struct iommu_option),
> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
> +        .op = IOMMU_OPTION_OP_SET,
> +        .__reserved = 0,
> +        .object_id = 0,
> +        .val64 = processAccounting ? 1 : 0,
> +    };
> +
> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> +        switch (errno) {
> +            case ENOTTY:
> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
> +                return 0;
> +
> +            case EOPNOTSUPP:
> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
> +                return 0;
> +
> +            case EINVAL:
> +                virReportSystemError(errno, "%s",
> +                                     _("invalid iommufd option parameters"));
> +                return -1;
> +
> +            case EPERM:
> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
> +                         "Per-user-based memory accounting to be used by default.");
> +                return 0;
> +
> +            default:
> +                virReportSystemError(errno, "%s",
> +                                     _("failed to set iommufd option"));
> +                return -1;
> +        }
> +    }

In my previous testing this part of code was not used so no rlimit was
configured for the grace hopper GPU that was assigned to a VM.

The VM OS was able to see the GPU and I was able to run cuda-samples
with most of them passing. This setup didn't use vCMDQ or EGM. When I
tried patches that add support for vCMDQ, I was no longer able to use
the GPU inside the VM until either this code was called or I set
"setcap cap_ipc_lock=ep" on the qemu binary, although the GPU was still
detected inside the VM and the VM started successfully.

So is this required for all devices that want to use iommufd in order
for them to work correctly inside the VM? Or is it necessary only when
specific features are used?

I wonder if we should allow a VM to start if we know the device will
not actually work correctly.

Basically, if the IOMMU_OPTION ioctl or IOMMU_OPTION_RLIMIT_MODE is not
supported, or we get permission denied, we return 0 and let the VM
start.

Pavel

> +
> +    VIR_DEBUG("Set iommufd rlimit mode to %s-based accounting",
> +              processAccounting ? "process" : "user");
> +    return 0;
> +}
> +
> +#else
> +
> +int virIOMMUFDSetRLimitMode(int fd G_GNUC_UNUSED,
> +                            bool processAccounting G_GNUC_UNUSED)
> +{
> +    virReportError(VIR_ERR_NO_SUPPORT, "%s",
> +                   _("IOMMUFD is not supported on this platform"));
> +    return -1;
> +}
> +
> +#endif
> diff --git a/src/util/viriommufd.h b/src/util/viriommufd.h
> new file mode 100644
> index 0000000000..ebecfe3633
> --- /dev/null
> +++ b/src/util/viriommufd.h
> @@ -0,0 +1,25 @@
> +/*
> + * viriommufd.h: iommufd helpers
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#pragma once
> +
> +#include "internal.h"
> +
> +#define VIR_IOMMU_DEV_PATH "/dev/iommu"
> +
> +int virIOMMUFDSetRLimitMode(int fd, bool processAccounting);
> -- 
> 2.43.0
>

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Nathan Chen via Devel 1 week, 2 days ago


On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
> On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
>> From: Nathan Chen<nathanc@nvidia.com>
>>
>> Implement the IOMMU_OPTION_RLIMIT_MODE
>> ioctl to set per-process memory accounting for
>> iommufd. This prevents ENOMEM errors from the
>> default per-user memory accounting when multiple
>> VMs under the libvirt-qemu user have their pinned
>> memory summed and checked against a per-process
>> RLIMIT_MEMLOCK limit.
>>
>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>> ---
>>   meson.build              |   1 +
>>   po/POTFILES              |   1 +
>>   src/libvirt_private.syms |   3 ++
>>   src/util/meson.build     |   1 +
>>   src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>>   src/util/viriommufd.h    |  25 +++++++++
>>   6 files changed, 142 insertions(+)
>>   create mode 100644 src/util/viriommufd.c
>>   create mode 100644 src/util/viriommufd.h
>>
>> diff --git a/meson.build b/meson.build
>> index 964d1fa4e1..a6db70f13e 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -732,6 +732,7 @@ headers = [
>>     'ifaddrs.h',
>>     'libtasn1.h',
>>     'linux/kvm.h',
>> +  'linux/iommufd.h',
>>     'mntent.h',
>>     'net/ethernet.h',
>>     'net/if.h',
>> diff --git a/po/POTFILES b/po/POTFILES
>> index f0aad35c8c..c78d2b8000 100644
>> --- a/po/POTFILES
>> +++ b/po/POTFILES
>> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>>   src/util/viridentity.c
>>   src/util/virinhibitor.c
>>   src/util/virinitctl.c
>> +src/util/viriommufd.c
>>   src/util/viriscsi.c
>>   src/util/virjson.c
>>   src/util/virlease.c
>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>> index 6bffd2eb6d..7fa76a1ec3 100644
>> --- a/src/libvirt_private.syms
>> +++ b/src/libvirt_private.syms
>> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>>   virInitctlFifos;
>>   virInitctlSetRunLevel;
>>   
>> +# util/viriommufd.h
>> +virIOMMUFDSetRLimitMode;
>> +
>>   # util/viriscsi.h
>>   virISCSIConnectionLogin;
>>   virISCSIConnectionLogout;
>> diff --git a/src/util/meson.build b/src/util/meson.build
>> index 4950a795cc..9fb0aa0fe7 100644
>> --- a/src/util/meson.build
>> +++ b/src/util/meson.build
>> @@ -46,6 +46,7 @@ util_sources = [
>>     'viridentity.c',
>>     'virinhibitor.c',
>>     'virinitctl.c',
>> +  'viriommufd.c',
>>     'viriscsi.c',
>>     'virjson.c',
>>     'virkeycode.c',
>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>> new file mode 100644
>> index 0000000000..225c76f4b2
>> --- /dev/null
>> +++ b/src/util/viriommufd.c
>> @@ -0,0 +1,111 @@
>> +#include <config.h>
>> +
>> +#include "viriommufd.h"
>> +#include "virlog.h"
>> +#include "virerror.h"
>> +#include "virfile.h"
>> +
>> +#ifdef __linux__
>> +
>> +# include <sys/ioctl.h>
>> +# include <linux/types.h>
>> +
>> +# ifdef HAVE_LINUX_IOMMUFD_H
>> +#  include <linux/iommufd.h>
>> +# endif
>> +
>> +# define VIR_FROM_THIS VIR_FROM_NONE
>> +
>> +VIR_LOG_INIT("util.iommufd");
>> +
>> +# ifndef IOMMU_OPTION
>> +
>> +enum iommufd_option {
>> +    IOMMU_OPTION_RLIMIT_MODE = 0,
>> +    IOMMU_OPTION_HUGE_PAGES = 1,
>> +};
>> +
>> +enum iommufd_option_ops {
>> +    IOMMU_OPTION_OP_SET = 0,
>> +    IOMMU_OPTION_OP_GET = 1,
>> +};
>> +
>> +struct iommu_option {
>> +    __u32 size;
>> +    __u32 option_id;
>> +    __u16 op;
>> +    __u16 __reserved;
>> +    __u32 object_id;
>> +    __aligned_u64 val64;
>> +};
>> +
>> +#  define IOMMUFD_TYPE (';')
>> +#  define IOMMUFD_CMD_OPTION 0x87
>> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>> +
>> +# endif
>> +
>> +/**
>> + * virIOMMUFDSetRLimitMode:
>> + * @fd: iommufd file descriptor
>> + * @processAccounting: true for per-process, false for per-user
>> + *
>> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
>> + *
>> + * Returns: 0 on success, -1 on error
>> + */
>> +int
>> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
>> +{
>> +    struct iommu_option option = {
>> +        .size = sizeof(struct iommu_option),
>> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
>> +        .op = IOMMU_OPTION_OP_SET,
>> +        .__reserved = 0,
>> +        .object_id = 0,
>> +        .val64 = processAccounting ? 1 : 0,
>> +    };
>> +
>> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>> +        switch (errno) {
>> +            case ENOTTY:
>> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
>> +                return 0;
>> +
>> +            case EOPNOTSUPP:
>> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
>> +                return 0;
>> +
>> +            case EINVAL:
>> +                virReportSystemError(errno, "%s",
>> +                                     _("invalid iommufd option parameters"));
>> +                return -1;
>> +
>> +            case EPERM:
>> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
>> +                         "Per-user-based memory accounting to be used by default.");
>> +                return 0;
>> +
>> +            default:
>> +                virReportSystemError(errno, "%s",
>> +                                     _("failed to set iommufd option"));
>> +                return -1;
>> +        }
>> +    }
> In my previous testing this part of code was not used so no rlimit was
> configured for the grace hopper GPU that was assigned to a VM.
> 
> The VM OS was able to see the GPU and I was able to run cuda-samples
> with most of them passing. This setup didn't use vCMDQ or EGM. When I
> tried patches that add support for vCMDQ I was no longer able to use the
> GPU inside the VM until this code was called or setting
> "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
> inside the VM and the VM was started successfully.
> 
> So is this required for all devices that want to use iommufd in order
> for them to work correctly inside the VM? Or is it necessary only when
> specific features are used?
> 
I don’t think the ioctl is required for all devices, but vCMDQ can 
increase accounted pinned memory over the per‑user memory locking limit. 
vCMDQ introduces additional guest‑RAM backed queues that could be the 
extra pinned/accounted memory pushing over the memory locking limit. 
Additionally, attempting to launch a second iommufd VM could increase 
accounted memory over the per-user memory locking limit.

For the case you observed, if it were truly a single isolated QEMU 
process with no other memlocked usage under the same uid, per‑process vs 
per‑user should be identical. The fact that switching to per‑process 
memory accounting fixes the issue suggests there is additional memlocked 
usage being charged to the libvirt‑qemu uid (e.g. other processes, 
helper daemons, or device‑related accounting). vCMDQ just pushes the 
summed memory over the limit.

> I wonder if we should allow to start a VM if we know the device will not
> actually work correctly.
> 
> Basically if IOMMU_OPTION ioctl, IOMMU_OPTION_RLIMIT_MODE are not
> supported or we get permission denied we return 0 and we let the VM
> start.

I’m open to returning -1 for these cases if you feel it's safer to fail 
early. What are your thoughts?

Nathan

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Pavel Hrdina via Devel 6 days, 8 hours ago

On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
> 
> 
> On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
> > On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
> > > From: Nathan Chen<nathanc@nvidia.com>
> > > 
> > > Implement the IOMMU_OPTION_RLIMIT_MODE
> > > ioctl to set per-process memory accounting for
> > > iommufd. This prevents ENOMEM errors from the
> > > default per-user memory accounting when multiple
> > > VMs under the libvirt-qemu user have their pinned
> > > memory summed and checked against a per-process
> > > RLIMIT_MEMLOCK limit.
> > > 
> > > Signed-off-by: Nathan Chen<nathanc@nvidia.com>
> > > ---
> > >   meson.build              |   1 +
> > >   po/POTFILES              |   1 +
> > >   src/libvirt_private.syms |   3 ++
> > >   src/util/meson.build     |   1 +
> > >   src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
> > >   src/util/viriommufd.h    |  25 +++++++++
> > >   6 files changed, 142 insertions(+)
> > >   create mode 100644 src/util/viriommufd.c
> > >   create mode 100644 src/util/viriommufd.h
> > > 
> > > diff --git a/meson.build b/meson.build
> > > index 964d1fa4e1..a6db70f13e 100644
> > > --- a/meson.build
> > > +++ b/meson.build
> > > @@ -732,6 +732,7 @@ headers = [
> > >     'ifaddrs.h',
> > >     'libtasn1.h',
> > >     'linux/kvm.h',
> > > +  'linux/iommufd.h',
> > >     'mntent.h',
> > >     'net/ethernet.h',
> > >     'net/if.h',
> > > diff --git a/po/POTFILES b/po/POTFILES
> > > index f0aad35c8c..c78d2b8000 100644
> > > --- a/po/POTFILES
> > > +++ b/po/POTFILES
> > > @@ -303,6 +303,7 @@ src/util/virhostuptime.c
> > >   src/util/viridentity.c
> > >   src/util/virinhibitor.c
> > >   src/util/virinitctl.c
> > > +src/util/viriommufd.c
> > >   src/util/viriscsi.c
> > >   src/util/virjson.c
> > >   src/util/virlease.c
> > > diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> > > index 6bffd2eb6d..7fa76a1ec3 100644
> > > --- a/src/libvirt_private.syms
> > > +++ b/src/libvirt_private.syms
> > > @@ -2646,6 +2646,9 @@ virInhibitorRelease;
> > >   virInitctlFifos;
> > >   virInitctlSetRunLevel;
> > > +# util/viriommufd.h
> > > +virIOMMUFDSetRLimitMode;
> > > +
> > >   # util/viriscsi.h
> > >   virISCSIConnectionLogin;
> > >   virISCSIConnectionLogout;
> > > diff --git a/src/util/meson.build b/src/util/meson.build
> > > index 4950a795cc..9fb0aa0fe7 100644
> > > --- a/src/util/meson.build
> > > +++ b/src/util/meson.build
> > > @@ -46,6 +46,7 @@ util_sources = [
> > >     'viridentity.c',
> > >     'virinhibitor.c',
> > >     'virinitctl.c',
> > > +  'viriommufd.c',
> > >     'viriscsi.c',
> > >     'virjson.c',
> > >     'virkeycode.c',
> > > diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> > > new file mode 100644
> > > index 0000000000..225c76f4b2
> > > --- /dev/null
> > > +++ b/src/util/viriommufd.c
> > > @@ -0,0 +1,111 @@
> > > +#include <config.h>
> > > +
> > > +#include "viriommufd.h"
> > > +#include "virlog.h"
> > > +#include "virerror.h"
> > > +#include "virfile.h"
> > > +
> > > +#ifdef __linux__
> > > +
> > > +# include <sys/ioctl.h>
> > > +# include <linux/types.h>
> > > +
> > > +# ifdef HAVE_LINUX_IOMMUFD_H
> > > +#  include <linux/iommufd.h>
> > > +# endif
> > > +
> > > +# define VIR_FROM_THIS VIR_FROM_NONE
> > > +
> > > +VIR_LOG_INIT("util.iommufd");
> > > +
> > > +# ifndef IOMMU_OPTION
> > > +
> > > +enum iommufd_option {
> > > +    IOMMU_OPTION_RLIMIT_MODE = 0,
> > > +    IOMMU_OPTION_HUGE_PAGES = 1,
> > > +};
> > > +
> > > +enum iommufd_option_ops {
> > > +    IOMMU_OPTION_OP_SET = 0,
> > > +    IOMMU_OPTION_OP_GET = 1,
> > > +};
> > > +
> > > +struct iommu_option {
> > > +    __u32 size;
> > > +    __u32 option_id;
> > > +    __u16 op;
> > > +    __u16 __reserved;
> > > +    __u32 object_id;
> > > +    __aligned_u64 val64;
> > > +};
> > > +
> > > +#  define IOMMUFD_TYPE (';')
> > > +#  define IOMMUFD_CMD_OPTION 0x87
> > > +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> > > +
> > > +# endif
> > > +
> > > +/**
> > > + * virIOMMUFDSetRLimitMode:
> > > + * @fd: iommufd file descriptor
> > > + * @processAccounting: true for per-process, false for per-user
> > > + *
> > > + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> > > + *
> > > + * Returns: 0 on success, -1 on error
> > > + */
> > > +int
> > > +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> > > +{
> > > +    struct iommu_option option = {
> > > +        .size = sizeof(struct iommu_option),
> > > +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
> > > +        .op = IOMMU_OPTION_OP_SET,
> > > +        .__reserved = 0,
> > > +        .object_id = 0,
> > > +        .val64 = processAccounting ? 1 : 0,
> > > +    };
> > > +
> > > +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> > > +        switch (errno) {
> > > +            case ENOTTY:
> > > +                VIR_WARN("IOMMU_OPTION ioctl not supported");
> > > +                return 0;
> > > +
> > > +            case EOPNOTSUPP:
> > > +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
> > > +                return 0;
> > > +
> > > +            case EINVAL:
> > > +                virReportSystemError(errno, "%s",
> > > +                                     _("invalid iommufd option parameters"));
> > > +                return -1;
> > > +
> > > +            case EPERM:
> > > +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
> > > +                         "Per-user-based memory accounting to be used by default.");
> > > +                return 0;
> > > +
> > > +            default:
> > > +                virReportSystemError(errno, "%s",
> > > +                                     _("failed to set iommufd option"));
> > > +                return -1;
> > > +        }
> > > +    }
> > In my previous testing this part of code was not used so no rlimit was
> > configured for the grace hopper GPU that was assigned to a VM.
> > 
> > The VM OS was able to see the GPU and I was able to run cuda-samples
> > with most of them passing. This setup didn't use vCMDQ or EGM. When I
> > tried patches that add support for vCMDQ I was no longer able to use the
> > GPU inside the VM until this code was called or setting
> > "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
> > inside the VM and the VM was started successfully.
> > 
> > So is this required for all devices that want to use iommufd in order
> > for them to work correctly inside the VM? Or is it necessary only when
> > specific features are used?
> > 
> I don’t think the ioctl is required for all devices, but vCMDQ can increase
> accounted pinned memory over the per‑user memory locking limit. vCMDQ
> introduces additional guest‑RAM backed queues that could be the extra
> pinned/accounted memory pushing over the memory locking limit. Additionally,
> attempting to launch a second iommufd VM could increase accounted memory
> over the per-user memory locking limit.

If that ioctl call is not required for all devices we should not call it
unconditionally for all VMs that will try to use iommufd with any
device.

Libvirt tries to guess the correct memory limit for specific cases; see
the function qemuDomainGetMemLockLimitBytes().

If I manually set a 64G hard_limit for a VM with 32G of RAM, everything works
even without calling the ioctl:

  <memtune>
    <hard_limit unit='GiB'>64</hard_limit>
  </memtune>

So if we can figure out some reasonable overhead for when vCMDQ is used, that
would be a better solution.

> For the case you observed, if it were truly a single isolated QEMU process
> with no other memlocked usage under the same uid, per‑process vs per‑user
> should be identical. The fact that switching to per‑process memory
> accounting fixes the issue suggests there is additional memlocked usage
> being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons,
> or device‑related accounting). vCMDQ just pushes the summed memory over the
> limit.

When the limit was not high enough I got the following errors in host
dmesg:

[30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c

I think this needs additional work in QEMU: starting a VM should error out
if it hits the memory limit instead of silently starting a broken VM
configuration.

Pavel

> 
> > I wonder if we should allow to start a VM if we know the device will not
> > actually work correctly.
> > 
> > Basically if IOMMU_OPTION ioctl, IOMMU_OPTION_RLIMIT_MODE are not
> > supported or we get permission denied we return 0 and we let the VM
> > start.
> 
> I’m open to returning -1 for these cases if you feel it's safer to fail
> early. What are your thoughts?
> 
> Nathan
>

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Nathan Chen via Devel 6 days, 4 hours ago


On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
> On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
>>
>> On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
>>> On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
>>>> From: Nathan Chen<nathanc@nvidia.com>
>>>>
>>>> Implement the IOMMU_OPTION_RLIMIT_MODE
>>>> ioctl to set per-process memory accounting for
>>>> iommufd. This prevents ENOMEM errors from the
>>>> default per-user memory accounting when multiple
>>>> VMs under the libvirt-qemu user have their pinned
>>>> memory summed and checked against a per-process
>>>> RLIMIT_MEMLOCK limit.
>>>>
>>>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>>>> ---
>>>>    meson.build              |   1 +
>>>>    po/POTFILES              |   1 +
>>>>    src/libvirt_private.syms |   3 ++
>>>>    src/util/meson.build     |   1 +
>>>>    src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>>>>    src/util/viriommufd.h    |  25 +++++++++
>>>>    6 files changed, 142 insertions(+)
>>>>    create mode 100644 src/util/viriommufd.c
>>>>    create mode 100644 src/util/viriommufd.h
>>>>
>>>> diff --git a/meson.build b/meson.build
>>>> index 964d1fa4e1..a6db70f13e 100644
>>>> --- a/meson.build
>>>> +++ b/meson.build
>>>> @@ -732,6 +732,7 @@ headers = [
>>>>      'ifaddrs.h',
>>>>      'libtasn1.h',
>>>>      'linux/kvm.h',
>>>> +  'linux/iommufd.h',
>>>>      'mntent.h',
>>>>      'net/ethernet.h',
>>>>      'net/if.h',
>>>> diff --git a/po/POTFILES b/po/POTFILES
>>>> index f0aad35c8c..c78d2b8000 100644
>>>> --- a/po/POTFILES
>>>> +++ b/po/POTFILES
>>>> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>>>>    src/util/viridentity.c
>>>>    src/util/virinhibitor.c
>>>>    src/util/virinitctl.c
>>>> +src/util/viriommufd.c
>>>>    src/util/viriscsi.c
>>>>    src/util/virjson.c
>>>>    src/util/virlease.c
>>>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>>>> index 6bffd2eb6d..7fa76a1ec3 100644
>>>> --- a/src/libvirt_private.syms
>>>> +++ b/src/libvirt_private.syms
>>>> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>>>>    virInitctlFifos;
>>>>    virInitctlSetRunLevel;
>>>> +# util/viriommufd.h
>>>> +virIOMMUFDSetRLimitMode;
>>>> +
>>>>    # util/viriscsi.h
>>>>    virISCSIConnectionLogin;
>>>>    virISCSIConnectionLogout;
>>>> diff --git a/src/util/meson.build b/src/util/meson.build
>>>> index 4950a795cc..9fb0aa0fe7 100644
>>>> --- a/src/util/meson.build
>>>> +++ b/src/util/meson.build
>>>> @@ -46,6 +46,7 @@ util_sources = [
>>>>      'viridentity.c',
>>>>      'virinhibitor.c',
>>>>      'virinitctl.c',
>>>> +  'viriommufd.c',
>>>>      'viriscsi.c',
>>>>      'virjson.c',
>>>>      'virkeycode.c',
>>>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>>>> new file mode 100644
>>>> index 0000000000..225c76f4b2
>>>> --- /dev/null
>>>> +++ b/src/util/viriommufd.c
>>>> @@ -0,0 +1,111 @@
>>>> +#include <config.h>
>>>> +
>>>> +#include "viriommufd.h"
>>>> +#include "virlog.h"
>>>> +#include "virerror.h"
>>>> +#include "virfile.h"
>>>> +
>>>> +#ifdef __linux__
>>>> +
>>>> +# include <sys/ioctl.h>
>>>> +# include <linux/types.h>
>>>> +
>>>> +# ifdef HAVE_LINUX_IOMMUFD_H
>>>> +#  include <linux/iommufd.h>
>>>> +# endif
>>>> +
>>>> +# define VIR_FROM_THIS VIR_FROM_NONE
>>>> +
>>>> +VIR_LOG_INIT("util.iommufd");
>>>> +
>>>> +# ifndef IOMMU_OPTION
>>>> +
>>>> +enum iommufd_option {
>>>> +    IOMMU_OPTION_RLIMIT_MODE = 0,
>>>> +    IOMMU_OPTION_HUGE_PAGES = 1,
>>>> +};
>>>> +
>>>> +enum iommufd_option_ops {
>>>> +    IOMMU_OPTION_OP_SET = 0,
>>>> +    IOMMU_OPTION_OP_GET = 1,
>>>> +};
>>>> +
>>>> +struct iommu_option {
>>>> +    __u32 size;
>>>> +    __u32 option_id;
>>>> +    __u16 op;
>>>> +    __u16 __reserved;
>>>> +    __u32 object_id;
>>>> +    __aligned_u64 val64;
>>>> +};
>>>> +
>>>> +#  define IOMMUFD_TYPE (';')
>>>> +#  define IOMMUFD_CMD_OPTION 0x87
>>>> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>>>> +
>>>> +# endif
>>>> +
>>>> +/**
>>>> + * virIOMMUFDSetRLimitMode:
>>>> + * @fd: iommufd file descriptor
>>>> + * @processAccounting: true for per-process, false for per-user
>>>> + *
>>>> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
>>>> + *
>>>> + * Returns: 0 on success, -1 on error
>>>> + */
>>>> +int
>>>> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
>>>> +{
>>>> +    struct iommu_option option = {
>>>> +        .size = sizeof(struct iommu_option),
>>>> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
>>>> +        .op = IOMMU_OPTION_OP_SET,
>>>> +        .__reserved = 0,
>>>> +        .object_id = 0,
>>>> +        .val64 = processAccounting ? 1 : 0,
>>>> +    };
>>>> +
>>>> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>>>> +        switch (errno) {
>>>> +            case ENOTTY:
>>>> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
>>>> +                return 0;
>>>> +
>>>> +            case EOPNOTSUPP:
>>>> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
>>>> +                return 0;
>>>> +
>>>> +            case EINVAL:
>>>> +                virReportSystemError(errno, "%s",
>>>> +                                     _("invalid iommufd option parameters"));
>>>> +                return -1;
>>>> +
>>>> +            case EPERM:
>>>> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
>>>> +                         "Per-user-based memory accounting to be used by default.");
>>>> +                return 0;
>>>> +
>>>> +            default:
>>>> +                virReportSystemError(errno, "%s",
>>>> +                                     _("failed to set iommufd option"));
>>>> +                return -1;
>>>> +        }
>>>> +    }
>>> In my previous testing this part of code was not used so no rlimit was
>>> configured for the grace hopper GPU that was assigned to a VM.
>>>
>>> The VM OS was able to see the GPU and I was able to run cuda-samples
>>> with most of them passing. This setup didn't use vCMDQ or EGM. When I
>>> tried patches that add support for vCMDQ I was no longer able to use the
>>> GPU inside the VM until this code was called or setting
>>> "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
>>> inside the VM and the VM was started successfully.
>>>
>>> So is this required for all devices that want to use iommufd in order
>>> for them to work correctly inside the VM? Or is it necessary only when
>>> specific features are used?
>>>
>> I don’t think the ioctl is required for all devices, but vCMDQ can increase
>> accounted pinned memory over the per‑user memory locking limit. vCMDQ
>> introduces additional guest‑RAM backed queues that could be the extra
>> pinned/accounted memory pushing over the memory locking limit. Additionally,
>> attempting to launch a second iommufd VM could increase accounted memory
>> over the per-user memory locking limit.
> If that ioctl call is not required for all devices we should not call it
> unconditionally for all VMs that will try to use iommufd with any
> device.
> 
> Libvirt tries to guess correct memory limit for specific cases, see
> function qemuDomainGetMemLockLimitBytes() .
> 
> If I manually set 64G hard_limit for VM with 32G ram everything works
> even without calling tha ioctl:
> 
>    <memtune>
>      <hard_limit unit='GiB'>64</hard_limit>
>    </memtune>
> 
> So if we can figure out some reasonable overhead when vCMDQ is used that
> would be better solution.
> 
It makes sense that the ioctl should not be used blindly for every 
iommufd VM. Would you be open to gating the per-process accounting 
behind a config setting (e.g. iommufd_rlimit_mode=process in 
libvirtd.conf)? That keeps the default behavior unchanged while 
accounting for the multi-VM failure case.

Separately, I'd be happy to add memlock limit adjustments in the vCMDQ 
Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ 
is enabled.
>> For the case you observed, if it were truly a single isolated QEMU process
>> with no other memlocked usage under the same uid, per‑process vs per‑user
>> should be identical. The fact that switching to per‑process memory
>> accounting fixes the issue suggests there is additional memlocked usage
>> being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons,
>> or device‑related accounting). vCMDQ just pushes the summed memory over the
>> limit.
> When the limit was not high enough I got the following errors in host
> dmesg:
> 
> [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
> 
> I think this needs additional work in QEMU, starting VM should error out
> if it hits the memory limit instead of silently starting broken VM
> configuration.

Ok, I will discuss with Shameer about erroring out if it hits the memory 
limit. Thank you for testing and providing this detailed feedback.

Nathan

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Pavel Hrdina via Devel 3 days, 9 hours ago

On Mon, Jan 26, 2026 at 05:17:02PM -0800, Nathan Chen wrote:
> 
> 
> On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
> > On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
> > > 
> > > On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
> > > > On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
> > > > > From: Nathan Chen<nathanc@nvidia.com>
> > > > > 
> > > > > Implement the IOMMU_OPTION_RLIMIT_MODE
> > > > > ioctl to set per-process memory accounting for
> > > > > iommufd. This prevents ENOMEM errors from the
> > > > > default per-user memory accounting when multiple
> > > > > VMs under the libvirt-qemu user have their pinned
> > > > > memory summed and checked against a per-process
> > > > > RLIMIT_MEMLOCK limit.
> > > > > 
> > > > > Signed-off-by: Nathan Chen<nathanc@nvidia.com>
> > > > > ---
> > > > >    meson.build              |   1 +
> > > > >    po/POTFILES              |   1 +
> > > > >    src/libvirt_private.syms |   3 ++
> > > > >    src/util/meson.build     |   1 +
> > > > >    src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
> > > > >    src/util/viriommufd.h    |  25 +++++++++
> > > > >    6 files changed, 142 insertions(+)
> > > > >    create mode 100644 src/util/viriommufd.c
> > > > >    create mode 100644 src/util/viriommufd.h
> > > > > 
> > > > > diff --git a/meson.build b/meson.build
> > > > > index 964d1fa4e1..a6db70f13e 100644
> > > > > --- a/meson.build
> > > > > +++ b/meson.build
> > > > > @@ -732,6 +732,7 @@ headers = [
> > > > >      'ifaddrs.h',
> > > > >      'libtasn1.h',
> > > > >      'linux/kvm.h',
> > > > > +  'linux/iommufd.h',
> > > > >      'mntent.h',
> > > > >      'net/ethernet.h',
> > > > >      'net/if.h',
> > > > > diff --git a/po/POTFILES b/po/POTFILES
> > > > > index f0aad35c8c..c78d2b8000 100644
> > > > > --- a/po/POTFILES
> > > > > +++ b/po/POTFILES
> > > > > @@ -303,6 +303,7 @@ src/util/virhostuptime.c
> > > > >    src/util/viridentity.c
> > > > >    src/util/virinhibitor.c
> > > > >    src/util/virinitctl.c
> > > > > +src/util/viriommufd.c
> > > > >    src/util/viriscsi.c
> > > > >    src/util/virjson.c
> > > > >    src/util/virlease.c
> > > > > diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> > > > > index 6bffd2eb6d..7fa76a1ec3 100644
> > > > > --- a/src/libvirt_private.syms
> > > > > +++ b/src/libvirt_private.syms
> > > > > @@ -2646,6 +2646,9 @@ virInhibitorRelease;
> > > > >    virInitctlFifos;
> > > > >    virInitctlSetRunLevel;
> > > > > +# util/viriommufd.h
> > > > > +virIOMMUFDSetRLimitMode;
> > > > > +
> > > > >    # util/viriscsi.h
> > > > >    virISCSIConnectionLogin;
> > > > >    virISCSIConnectionLogout;
> > > > > diff --git a/src/util/meson.build b/src/util/meson.build
> > > > > index 4950a795cc..9fb0aa0fe7 100644
> > > > > --- a/src/util/meson.build
> > > > > +++ b/src/util/meson.build
> > > > > @@ -46,6 +46,7 @@ util_sources = [
> > > > >      'viridentity.c',
> > > > >      'virinhibitor.c',
> > > > >      'virinitctl.c',
> > > > > +  'viriommufd.c',
> > > > >      'viriscsi.c',
> > > > >      'virjson.c',
> > > > >      'virkeycode.c',
> > > > > diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> > > > > new file mode 100644
> > > > > index 0000000000..225c76f4b2
> > > > > --- /dev/null
> > > > > +++ b/src/util/viriommufd.c
> > > > > @@ -0,0 +1,111 @@
> > > > > +#include <config.h>
> > > > > +
> > > > > +#include "viriommufd.h"
> > > > > +#include "virlog.h"
> > > > > +#include "virerror.h"
> > > > > +#include "virfile.h"
> > > > > +
> > > > > +#ifdef __linux__
> > > > > +
> > > > > +# include <sys/ioctl.h>
> > > > > +# include <linux/types.h>
> > > > > +
> > > > > +# ifdef HAVE_LINUX_IOMMUFD_H
> > > > > +#  include <linux/iommufd.h>
> > > > > +# endif
> > > > > +
> > > > > +# define VIR_FROM_THIS VIR_FROM_NONE
> > > > > +
> > > > > +VIR_LOG_INIT("util.iommufd");
> > > > > +
> > > > > +# ifndef IOMMU_OPTION
> > > > > +
> > > > > +enum iommufd_option {
> > > > > +    IOMMU_OPTION_RLIMIT_MODE = 0,
> > > > > +    IOMMU_OPTION_HUGE_PAGES = 1,
> > > > > +};
> > > > > +
> > > > > +enum iommufd_option_ops {
> > > > > +    IOMMU_OPTION_OP_SET = 0,
> > > > > +    IOMMU_OPTION_OP_GET = 1,
> > > > > +};
> > > > > +
> > > > > +struct iommu_option {
> > > > > +    __u32 size;
> > > > > +    __u32 option_id;
> > > > > +    __u16 op;
> > > > > +    __u16 __reserved;
> > > > > +    __u32 object_id;
> > > > > +    __aligned_u64 val64;
> > > > > +};
> > > > > +
> > > > > +#  define IOMMUFD_TYPE (';')
> > > > > +#  define IOMMUFD_CMD_OPTION 0x87
> > > > > +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> > > > > +
> > > > > +# endif
> > > > > +
> > > > > +/**
> > > > > + * virIOMMUFDSetRLimitMode:
> > > > > + * @fd: iommufd file descriptor
> > > > > + * @processAccounting: true for per-process, false for per-user
> > > > > + *
> > > > > + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> > > > > + *
> > > > > + * Returns: 0 on success, -1 on error
> > > > > + */
> > > > > +int
> > > > > +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> > > > > +{
> > > > > +    struct iommu_option option = {
> > > > > +        .size = sizeof(struct iommu_option),
> > > > > +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
> > > > > +        .op = IOMMU_OPTION_OP_SET,
> > > > > +        .__reserved = 0,
> > > > > +        .object_id = 0,
> > > > > +        .val64 = processAccounting ? 1 : 0,
> > > > > +    };
> > > > > +
> > > > > +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> > > > > +        switch (errno) {
> > > > > +            case ENOTTY:
> > > > > +                VIR_WARN("IOMMU_OPTION ioctl not supported");
> > > > > +                return 0;
> > > > > +
> > > > > +            case EOPNOTSUPP:
> > > > > +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
> > > > > +                return 0;
> > > > > +
> > > > > +            case EINVAL:
> > > > > +                virReportSystemError(errno, "%s",
> > > > > +                                     _("invalid iommufd option parameters"));
> > > > > +                return -1;
> > > > > +
> > > > > +            case EPERM:
> > > > > +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
> > > > > +                         "Per-user-based memory accounting to be used by default.");
> > > > > +                return 0;
> > > > > +
> > > > > +            default:
> > > > > +                virReportSystemError(errno, "%s",
> > > > > +                                     _("failed to set iommufd option"));
> > > > > +                return -1;
> > > > > +        }
> > > > > +    }
> > > > In my previous testing this part of code was not used so no rlimit was
> > > > configured for the grace hopper GPU that was assigned to a VM.
> > > > 
> > > > The VM OS was able to see the GPU and I was able to run cuda-samples
> > > > with most of them passing. This setup didn't use vCMDQ or EGM. When I
> > > > tried patches that add support for vCMDQ I was no longer able to use the
> > > > GPU inside the VM until this code was called or setting
> > > > "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
> > > > inside the VM and the VM was started successfully.
> > > > 
> > > > So is this required for all devices that want to use iommufd in order
> > > > for them to work correctly inside the VM? Or is it necessary only when
> > > > specific features are used?
> > > > 
> > > I don’t think the ioctl is required for all devices, but vCMDQ can increase
> > > accounted pinned memory over the per‑user memory locking limit. vCMDQ
> > > introduces additional guest‑RAM backed queues that could be the extra
> > > pinned/accounted memory pushing over the memory locking limit. Additionally,
> > > attempting to launch a second iommufd VM could increase accounted memory
> > > over the per-user memory locking limit.
> > If that ioctl call is not required for all devices we should not call it
> > unconditionally for all VMs that will try to use iommufd with any
> > device.
> > 
> > Libvirt tries to guess correct memory limit for specific cases, see
> > function qemuDomainGetMemLockLimitBytes() .
> > 
> > If I manually set 64G hard_limit for VM with 32G ram everything works
> > even without calling tha ioctl:
> > 
> >    <memtune>
> >      <hard_limit unit='GiB'>64</hard_limit>
> >    </memtune>
> > 
> > So if we can figure out some reasonable overhead when vCMDQ is used that
> > would be better solution.
> > 
> It makes sense that the ioctl should not be used blindly for every iommufd
> VM. Would you be open to gating the per-process accounting behind a config
> setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps the
> default behavior unchanged while accounting for the multi-VM failure case.

I have no HW with multiple GPUs available to test if this is required or
not in order to start multiple VMs each using one GPU.

Currently, based on my testing, it is not required for a single VM. Are you
sure we need this? If not, we can remove this patch.

> Separately, I'd be happy to add memlock limit adjustments in the vCMDQ
> Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is
> enabled.

It seems that there is no need to make any changes to the current code;
libvirt already adds an extra 1GiB if there is a single PCI hostdev attached
to the VM.

> > > For the case you observed, if it were truly a single isolated QEMU process
> > > with no other memlocked usage under the same uid, per‑process vs per‑user
> > > should be identical. The fact that switching to per‑process memory
> > > accounting fixes the issue suggests there is additional memlocked usage
> > > being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons,
> > > or device‑related accounting). vCMDQ just pushes the summed memory over the
> > > limit.
> > When the limit was not high enough I got the following errors in host
> > dmesg:
> > 
> > [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
> > 
> > I think this needs additional work in QEMU, starting VM should error out
> > if it hits the memory limit instead of silently starting broken VM
> > configuration.
> 
> Ok, I will discuss with Shameer about erroring out if it hits the memory
> limit. Thank you for testing and providing this detailed feedback.

I have new details about this error. It only happens when vCMDQ is used
and only when VM with vCMDQ is started for the first time after host is
power cycled (reboot is not enough to trigger this error).

If this happens, shutting down the VM and starting it again no longer
produces this error, and I was able to run cuda-samples inside the VM.

Pavel

> 
> Nathan
>

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Nathan Chen via Devel 3 days, 7 hours ago


On 1/29/2026 11:58 AM, Pavel Hrdina wrote:
> On Mon, Jan 26, 2026 at 05:17:02PM -0800, Nathan Chen wrote:
>>
>> On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
>>> On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
>>>> On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
>>>>> On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
>>>>>> From: Nathan Chen<nathanc@nvidia.com>
>>>>>>
>>>>>> Implement the IOMMU_OPTION_RLIMIT_MODE
>>>>>> ioctl to set per-process memory accounting for
>>>>>> iommufd. This prevents ENOMEM errors from the
>>>>>> default per-user memory accounting when multiple
>>>>>> VMs under the libvirt-qemu user have their pinned
>>>>>> memory summed and checked against a per-process
>>>>>> RLIMIT_MEMLOCK limit.
>>>>>>
>>>>>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>>>>>> ---
>>>>>>     meson.build              |   1 +
>>>>>>     po/POTFILES              |   1 +
>>>>>>     src/libvirt_private.syms |   3 ++
>>>>>>     src/util/meson.build     |   1 +
>>>>>>     src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>>>>>>     src/util/viriommufd.h    |  25 +++++++++
>>>>>>     6 files changed, 142 insertions(+)
>>>>>>     create mode 100644 src/util/viriommufd.c
>>>>>>     create mode 100644 src/util/viriommufd.h
>>>>>>
>>>>>> diff --git a/meson.build b/meson.build
>>>>>> index 964d1fa4e1..a6db70f13e 100644
>>>>>> --- a/meson.build
>>>>>> +++ b/meson.build
>>>>>> @@ -732,6 +732,7 @@ headers = [
>>>>>>       'ifaddrs.h',
>>>>>>       'libtasn1.h',
>>>>>>       'linux/kvm.h',
>>>>>> +  'linux/iommufd.h',
>>>>>>       'mntent.h',
>>>>>>       'net/ethernet.h',
>>>>>>       'net/if.h',
>>>>>> diff --git a/po/POTFILES b/po/POTFILES
>>>>>> index f0aad35c8c..c78d2b8000 100644
>>>>>> --- a/po/POTFILES
>>>>>> +++ b/po/POTFILES
>>>>>> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>>>>>>     src/util/viridentity.c
>>>>>>     src/util/virinhibitor.c
>>>>>>     src/util/virinitctl.c
>>>>>> +src/util/viriommufd.c
>>>>>>     src/util/viriscsi.c
>>>>>>     src/util/virjson.c
>>>>>>     src/util/virlease.c
>>>>>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>>>>>> index 6bffd2eb6d..7fa76a1ec3 100644
>>>>>> --- a/src/libvirt_private.syms
>>>>>> +++ b/src/libvirt_private.syms
>>>>>> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>>>>>>     virInitctlFifos;
>>>>>>     virInitctlSetRunLevel;
>>>>>> +# util/viriommufd.h
>>>>>> +virIOMMUFDSetRLimitMode;
>>>>>> +
>>>>>>     # util/viriscsi.h
>>>>>>     virISCSIConnectionLogin;
>>>>>>     virISCSIConnectionLogout;
>>>>>> diff --git a/src/util/meson.build b/src/util/meson.build
>>>>>> index 4950a795cc..9fb0aa0fe7 100644
>>>>>> --- a/src/util/meson.build
>>>>>> +++ b/src/util/meson.build
>>>>>> @@ -46,6 +46,7 @@ util_sources = [
>>>>>>       'viridentity.c',
>>>>>>       'virinhibitor.c',
>>>>>>       'virinitctl.c',
>>>>>> +  'viriommufd.c',
>>>>>>       'viriscsi.c',
>>>>>>       'virjson.c',
>>>>>>       'virkeycode.c',
>>>>>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>>>>>> new file mode 100644
>>>>>> index 0000000000..225c76f4b2
>>>>>> --- /dev/null
>>>>>> +++ b/src/util/viriommufd.c
>>>>>> @@ -0,0 +1,111 @@
>>>>>> +#include <config.h>
>>>>>> +
>>>>>> +#include "viriommufd.h"
>>>>>> +#include "virlog.h"
>>>>>> +#include "virerror.h"
>>>>>> +#include "virfile.h"
>>>>>> +
>>>>>> +#ifdef __linux__
>>>>>> +
>>>>>> +# include <sys/ioctl.h>
>>>>>> +# include <linux/types.h>
>>>>>> +
>>>>>> +# ifdef HAVE_LINUX_IOMMUFD_H
>>>>>> +#  include <linux/iommufd.h>
>>>>>> +# endif
>>>>>> +
>>>>>> +# define VIR_FROM_THIS VIR_FROM_NONE
>>>>>> +
>>>>>> +VIR_LOG_INIT("util.iommufd");
>>>>>> +
>>>>>> +# ifndef IOMMU_OPTION
>>>>>> +
>>>>>> +enum iommufd_option {
>>>>>> +    IOMMU_OPTION_RLIMIT_MODE = 0,
>>>>>> +    IOMMU_OPTION_HUGE_PAGES = 1,
>>>>>> +};
>>>>>> +
>>>>>> +enum iommufd_option_ops {
>>>>>> +    IOMMU_OPTION_OP_SET = 0,
>>>>>> +    IOMMU_OPTION_OP_GET = 1,
>>>>>> +};
>>>>>> +
>>>>>> +struct iommu_option {
>>>>>> +    __u32 size;
>>>>>> +    __u32 option_id;
>>>>>> +    __u16 op;
>>>>>> +    __u16 __reserved;
>>>>>> +    __u32 object_id;
>>>>>> +    __aligned_u64 val64;
>>>>>> +};
>>>>>> +
>>>>>> +#  define IOMMUFD_TYPE (';')
>>>>>> +#  define IOMMUFD_CMD_OPTION 0x87
>>>>>> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>>>>>> +
>>>>>> +# endif
>>>>>> +
>>>>>> +/**
>>>>>> + * virIOMMUFDSetRLimitMode:
>>>>>> + * @fd: iommufd file descriptor
>>>>>> + * @processAccounting: true for per-process, false for per-user
>>>>>> + *
>>>>>> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
>>>>>> + *
>>>>>> + * Returns: 0 on success, -1 on error
>>>>>> + */
>>>>>> +int
>>>>>> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
>>>>>> +{
>>>>>> +    struct iommu_option option = {
>>>>>> +        .size = sizeof(struct iommu_option),
>>>>>> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
>>>>>> +        .op = IOMMU_OPTION_OP_SET,
>>>>>> +        .__reserved = 0,
>>>>>> +        .object_id = 0,
>>>>>> +        .val64 = processAccounting ? 1 : 0,
>>>>>> +    };
>>>>>> +
>>>>>> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>>>>>> +        switch (errno) {
>>>>>> +            case ENOTTY:
>>>>>> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
>>>>>> +                return 0;
>>>>>> +
>>>>>> +            case EOPNOTSUPP:
>>>>>> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
>>>>>> +                return 0;
>>>>>> +
>>>>>> +            case EINVAL:
>>>>>> +                virReportSystemError(errno, "%s",
>>>>>> +                                     _("invalid iommufd option parameters"));
>>>>>> +                return -1;
>>>>>> +
>>>>>> +            case EPERM:
>>>>>> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
>>>>>> +                         "Per-user-based memory accounting to be used by default.");
>>>>>> +                return 0;
>>>>>> +
>>>>>> +            default:
>>>>>> +                virReportSystemError(errno, "%s",
>>>>>> +                                     _("failed to set iommufd option"));
>>>>>> +                return -1;
>>>>>> +        }
>>>>>> +    }
>>>>> In my previous testing this part of code was not used so no rlimit was
>>>>> configured for the grace hopper GPU that was assigned to a VM.
>>>>>
>>>>> The VM OS was able to see the GPU and I was able to run cuda-samples
>>>>> with most of them passing. This setup didn't use vCMDQ or EGM. When I
>>>>> tried patches that add support for vCMDQ I was no longer able to use the
>>>>> GPU inside the VM until this code was called or setting
>>>>> "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
>>>>> inside the VM and the VM was started successfully.
>>>>>
>>>>> So is this required for all devices that want to use iommufd in order
>>>>> for them to work correctly inside the VM? Or is it necessary only when
>>>>> specific features are used?
>>>>>
>>>> I don’t think the ioctl is required for all devices, but vCMDQ can increase
>>>> accounted pinned memory over the per‑user memory locking limit. vCMDQ
>>>> introduces additional guest‑RAM backed queues that could be the extra
>>>> pinned/accounted memory pushing over the memory locking limit. Additionally,
>>>> attempting to launch a second iommufd VM could increase accounted memory
>>>> over the per-user memory locking limit.
>>> If that ioctl call is not required for all devices we should not call it
>>> unconditionally for all VMs that will try to use iommufd with any
>>> device.
>>>
>>> Libvirt tries to guess correct memory limit for specific cases, see
>>> function qemuDomainGetMemLockLimitBytes() .
>>>
>>> If I manually set 64G hard_limit for VM with 32G ram everything works
>>> even without calling tha ioctl:
>>>
>>>     <memtune>
>>>       <hard_limit unit='GiB'>64</hard_limit>
>>>     </memtune>
>>>
>>> So if we can figure out some reasonable overhead when vCMDQ is used that
>>> would be better solution.
>>>
>> It makes sense that the ioctl should not be used blindly for every iommufd
>> VM. Would you be open to gating the per-process accounting behind a config
>> setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps the
>> default behavior unchanged while accounting for the multi-VM failure case.
> I have no HW with multiple GPUs available to test if this is required or
> not in order to start multiple VMs each using one GPU.
> 
> Currently based on my testing for single VM it is not required. Are you
> sure if we need this? If not we can remove this patch.
> 
I am sure we need this - I just reproduced the behavior again by 
removing the call to this ioctl and launching a second VM when another 
VM is already up. The second VM does not boot and we see the following 
error:

2026-01-29 22:35:29.927+0000: 291942: error : 
qemuProcessReportLogError:2151 : internal error: QEMU unexpectedly 
closed the monitor (vm='1gpu-vm-2'): 2026-01-29T22:35:29.836876Z 
qemu-system-aarch64: -device 
{"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}: 
vfio hostdev0: memory listener initialization failed: Region ram-node0: 
vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000, 
0xfeb733e00000) = -12 (Cannot allocate memory)
error: Failed to start domain '1gpu-vm-2'
error: internal error: QEMU unexpectedly closed the monitor 
(vm='1gpu-vm-2'): 2026-01-29T22:35:29.836876Z qemu-system-aarch64: 
-device 
{"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}: 
vfio hostdev0: memory listener initialization failed: Region ram-node0: 
vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000, 
0xfeb733e00000) = -12 (Cannot allocate memory)


>> Separately, I'd be happy to add memlock limit adjustments in the vCMDQ
>> Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is
>> enabled.
> It seems that there is no need to make any changes to current code,
> libvirt already adds extra 1GiB if there is single PCI hostdev attached
> to the VM.
> 
>>>> For the case you observed, if it were truly a single isolated QEMU process
>>>> with no other memlocked usage under the same uid, per‑process vs per‑user
>>>> should be identical. The fact that switching to per‑process memory
>>>> accounting fixes the issue suggests there is additional memlocked usage
>>>> being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons,
>>>> or device‑related accounting). vCMDQ just pushes the summed memory over the
>>>> limit.
>>> When the limit was not high enough I got the following errors in host
>>> dmesg:
>>>
>>> [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
>>>
>>> I think this needs additional work in QEMU, starting VM should error out
>>> if it hits the memory limit instead of silently starting broken VM
>>> configuration.
>> Ok, I will discuss with Shameer about erroring out if it hits the memory
>> limit. Thank you for testing and providing this detailed feedback.
> I have new details about this error. It only happens when vCMDQ is used
> and only when VM with vCMDQ is started for the first time after host is
> power cycled (reboot is not enough to trigger this error).
> 
> If this happens shutting down the VM and starting it again no longer
> produce this error and I was able to run cuda-samples inside the VM.

Are you encountering the same behavior with raw QEMU command line when 
you power cycle the host and launch a vCMDQ VM for the first time?

Nathan

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Pavel Hrdina via Devel 2 days, 13 hours ago

On Thu, Jan 29, 2026 at 02:39:52PM -0800, Nathan Chen wrote:
> 
> 
> On 1/29/2026 11:58 AM, Pavel Hrdina wrote:
> > On Mon, Jan 26, 2026 at 05:17:02PM -0800, Nathan Chen wrote:
> > > 
> > > On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
> > > > On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
> > > > > On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
> > > > > > On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
> > > > > > > From: Nathan Chen<nathanc@nvidia.com>
> > > > > > > 
> > > > > > > Implement the IOMMU_OPTION_RLIMIT_MODE
> > > > > > > ioctl to set per-process memory accounting for
> > > > > > > iommufd. This prevents ENOMEM errors from the
> > > > > > > default per-user memory accounting when multiple
> > > > > > > VMs under the libvirt-qemu user have their pinned
> > > > > > > memory summed and checked against a per-process
> > > > > > > RLIMIT_MEMLOCK limit.
> > > > > > > 
> > > > > > > Signed-off-by: Nathan Chen<nathanc@nvidia.com>
> > > > > > > ---
> > > > > > >     meson.build              |   1 +
> > > > > > >     po/POTFILES              |   1 +
> > > > > > >     src/libvirt_private.syms |   3 ++
> > > > > > >     src/util/meson.build     |   1 +
> > > > > > >     src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
> > > > > > >     src/util/viriommufd.h    |  25 +++++++++
> > > > > > >     6 files changed, 142 insertions(+)
> > > > > > >     create mode 100644 src/util/viriommufd.c
> > > > > > >     create mode 100644 src/util/viriommufd.h
> > > > > > > 
> > > > > > > diff --git a/meson.build b/meson.build
> > > > > > > index 964d1fa4e1..a6db70f13e 100644
> > > > > > > --- a/meson.build
> > > > > > > +++ b/meson.build
> > > > > > > @@ -732,6 +732,7 @@ headers = [
> > > > > > >       'ifaddrs.h',
> > > > > > >       'libtasn1.h',
> > > > > > >       'linux/kvm.h',
> > > > > > > +  'linux/iommufd.h',
> > > > > > >       'mntent.h',
> > > > > > >       'net/ethernet.h',
> > > > > > >       'net/if.h',
> > > > > > > diff --git a/po/POTFILES b/po/POTFILES
> > > > > > > index f0aad35c8c..c78d2b8000 100644
> > > > > > > --- a/po/POTFILES
> > > > > > > +++ b/po/POTFILES
> > > > > > > @@ -303,6 +303,7 @@ src/util/virhostuptime.c
> > > > > > >     src/util/viridentity.c
> > > > > > >     src/util/virinhibitor.c
> > > > > > >     src/util/virinitctl.c
> > > > > > > +src/util/viriommufd.c
> > > > > > >     src/util/viriscsi.c
> > > > > > >     src/util/virjson.c
> > > > > > >     src/util/virlease.c
> > > > > > > diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> > > > > > > index 6bffd2eb6d..7fa76a1ec3 100644
> > > > > > > --- a/src/libvirt_private.syms
> > > > > > > +++ b/src/libvirt_private.syms
> > > > > > > @@ -2646,6 +2646,9 @@ virInhibitorRelease;
> > > > > > >     virInitctlFifos;
> > > > > > >     virInitctlSetRunLevel;
> > > > > > > +# util/viriommufd.h
> > > > > > > +virIOMMUFDSetRLimitMode;
> > > > > > > +
> > > > > > >     # util/viriscsi.h
> > > > > > >     virISCSIConnectionLogin;
> > > > > > >     virISCSIConnectionLogout;
> > > > > > > diff --git a/src/util/meson.build b/src/util/meson.build
> > > > > > > index 4950a795cc..9fb0aa0fe7 100644
> > > > > > > --- a/src/util/meson.build
> > > > > > > +++ b/src/util/meson.build
> > > > > > > @@ -46,6 +46,7 @@ util_sources = [
> > > > > > >       'viridentity.c',
> > > > > > >       'virinhibitor.c',
> > > > > > >       'virinitctl.c',
> > > > > > > +  'viriommufd.c',
> > > > > > >       'viriscsi.c',
> > > > > > >       'virjson.c',
> > > > > > >       'virkeycode.c',
> > > > > > > diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> > > > > > > new file mode 100644
> > > > > > > index 0000000000..225c76f4b2
> > > > > > > --- /dev/null
> > > > > > > +++ b/src/util/viriommufd.c
> > > > > > > @@ -0,0 +1,111 @@
> > > > > > > +#include <config.h>
> > > > > > > +
> > > > > > > +#include "viriommufd.h"
> > > > > > > +#include "virlog.h"
> > > > > > > +#include "virerror.h"
> > > > > > > +#include "virfile.h"
> > > > > > > +
> > > > > > > +#ifdef __linux__
> > > > > > > +
> > > > > > > +# include <sys/ioctl.h>
> > > > > > > +# include <linux/types.h>
> > > > > > > +
> > > > > > > +# ifdef HAVE_LINUX_IOMMUFD_H
> > > > > > > +#  include <linux/iommufd.h>
> > > > > > > +# endif
> > > > > > > +
> > > > > > > +# define VIR_FROM_THIS VIR_FROM_NONE
> > > > > > > +
> > > > > > > +VIR_LOG_INIT("util.iommufd");
> > > > > > > +
> > > > > > > +# ifndef IOMMU_OPTION
> > > > > > > +
> > > > > > > +enum iommufd_option {
> > > > > > > +    IOMMU_OPTION_RLIMIT_MODE = 0,
> > > > > > > +    IOMMU_OPTION_HUGE_PAGES = 1,
> > > > > > > +};
> > > > > > > +
> > > > > > > +enum iommufd_option_ops {
> > > > > > > +    IOMMU_OPTION_OP_SET = 0,
> > > > > > > +    IOMMU_OPTION_OP_GET = 1,
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct iommu_option {
> > > > > > > +    __u32 size;
> > > > > > > +    __u32 option_id;
> > > > > > > +    __u16 op;
> > > > > > > +    __u16 __reserved;
> > > > > > > +    __u32 object_id;
> > > > > > > +    __aligned_u64 val64;
> > > > > > > +};
> > > > > > > +
> > > > > > > +#  define IOMMUFD_TYPE (';')
> > > > > > > +#  define IOMMUFD_CMD_OPTION 0x87
> > > > > > > +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> > > > > > > +
> > > > > > > +# endif
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * virIOMMUFDSetRLimitMode:
> > > > > > > + * @fd: iommufd file descriptor
> > > > > > > + * @processAccounting: true for per-process, false for per-user
> > > > > > > + *
> > > > > > > + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> > > > > > > + *
> > > > > > > + * Returns: 0 on success, -1 on error
> > > > > > > + */
> > > > > > > +int
> > > > > > > +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> > > > > > > +{
> > > > > > > +    struct iommu_option option = {
> > > > > > > +        .size = sizeof(struct iommu_option),
> > > > > > > +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
> > > > > > > +        .op = IOMMU_OPTION_OP_SET,
> > > > > > > +        .__reserved = 0,
> > > > > > > +        .object_id = 0,
> > > > > > > +        .val64 = processAccounting ? 1 : 0,
> > > > > > > +    };
> > > > > > > +
> > > > > > > +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> > > > > > > +        switch (errno) {
> > > > > > > +            case ENOTTY:
> > > > > > > +                VIR_WARN("IOMMU_OPTION ioctl not supported");
> > > > > > > +                return 0;
> > > > > > > +
> > > > > > > +            case EOPNOTSUPP:
> > > > > > > +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
> > > > > > > +                return 0;
> > > > > > > +
> > > > > > > +            case EINVAL:
> > > > > > > +                virReportSystemError(errno, "%s",
> > > > > > > +                                     _("invalid iommufd option parameters"));
> > > > > > > +                return -1;
> > > > > > > +
> > > > > > > +            case EPERM:
> > > > > > > +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
> > > > > > > +                         "Per-user-based memory accounting to be used by default.");
> > > > > > > +                return 0;
> > > > > > > +
> > > > > > > +            default:
> > > > > > > +                virReportSystemError(errno, "%s",
> > > > > > > +                                     _("failed to set iommufd option"));
> > > > > > > +                return -1;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > In my previous testing this part of code was not used so no rlimit was
> > > > > > configured for the grace hopper GPU that was assigned to a VM.
> > > > > > 
> > > > > > The VM OS was able to see the GPU and I was able to run cuda-samples
> > > > > > with most of them passing. This setup didn't use vCMDQ or EGM. When I
> > > > > > tried patches that add support for vCMDQ I was no longer able to use the
> > > > > > GPU inside the VM until this code was called or setting
> > > > > > "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
> > > > > > inside the VM and the VM was started successfully.
> > > > > > 
> > > > > > So is this required for all devices that want to use iommufd in order
> > > > > > for them to work correctly inside the VM? Or is it necessary only when
> > > > > > specific features are used?
> > > > > > 
> > > > > I don’t think the ioctl is required for all devices, but vCMDQ can increase
> > > > > accounted pinned memory over the per‑user memory locking limit. vCMDQ
> > > > > introduces additional guest‑RAM backed queues that could be the extra
> > > > > pinned/accounted memory pushing over the memory locking limit. Additionally,
> > > > > attempting to launch a second iommufd VM could increase accounted memory
> > > > > over the per-user memory locking limit.
> > > > If that ioctl call is not required for all devices we should not call it
> > > > unconditionally for all VMs that will try to use iommufd with any
> > > > device.
> > > > 
> > > > Libvirt tries to guess correct memory limit for specific cases, see
> > > > function qemuDomainGetMemLockLimitBytes() .
> > > > 
> > > > If I manually set 64G hard_limit for VM with 32G ram everything works
> > > > even without calling tha ioctl:
> > > > 
> > > >     <memtune>
> > > >       <hard_limit unit='GiB'>64</hard_limit>
> > > >     </memtune>
> > > > 
> > > > So if we can figure out some reasonable overhead when vCMDQ is used that
> > > > would be better solution.
> > > > 
> > > It makes sense that the ioctl should not be used blindly for every iommufd
> > > VM. Would you be open to gating the per-process accounting behind a config
> > > setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps the
> > > default behavior unchanged while accounting for the multi-VM failure case.
> > I have no HW with multiple GPUs available to test if this is required or
> > not in order to start multiple VMs each using one GPU.
> > 
> > Currently based on my testing for single VM it is not required. Are you
> > sure if we need this? If not we can remove this patch.
> > 
> I am sure we need this - I just reproduced the behavior again by removing
> the call to this ioctl and launching a second VM when another VM is already
> up. The second VM does not boot and we see the following error:

Thanks for testing this. I've managed to get a system with multiple
network devices and was able to reproduce the same error. So it looks
like this is necessary every time iommufd is used.

As I already mentioned, in this case we should always error out if we
cannot set per-process accounting, to make sure that the VM will stay
within the limits set for it by libvirt.

> 2026-01-29 22:35:29.927+0000: 291942: error : qemuProcessReportLogError:2151
> : internal error: QEMU unexpectedly closed the monitor (vm='1gpu-vm-2'):
> 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
> vfio hostdev0: memory listener initialization failed: Region ram-node0:
> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
> 0xfeb733e00000) = -12 (Cannot allocate memory)
> error: Failed to start domain '1gpu-vm-2'
> error: internal error: QEMU unexpectedly closed the monitor
> (vm='1gpu-vm-2'): 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
> vfio hostdev0: memory listener initialization failed: Region ram-node0:
> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
> 0xfeb733e00000) = -12 (Cannot allocate memory)
> 
> 
> > > Separately, I'd be happy to add memlock limit adjustments in the vCMDQ
> > > Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is
> > > enabled.
> > It seems that there is no need to make any changes to current code,
> > libvirt already adds extra 1GiB if there is single PCI hostdev attached
> > to the VM.
> > 
> > > > > For the case you observed, if it were truly a single isolated QEMU process
> > > > > with no other memlocked usage under the same uid, per‑process vs per‑user
> > > > > should be identical. The fact that switching to per‑process memory
> > > > > accounting fixes the issue suggests there is additional memlocked usage
> > > > > being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons,
> > > > > or device‑related accounting). vCMDQ just pushes the summed memory over the
> > > > > limit.
> > > > When the limit was not high enough I got the following errors in host
> > > > dmesg:
> > > > 
> > > > [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
> > > > 
> > > > I think this needs additional work in QEMU, starting VM should error out
> > > > if it hits the memory limit instead of silently starting broken VM
> > > > configuration.
> > > Ok, I will discuss with Shameer about erroring out if it hits the memory
> > > limit. Thank you for testing and providing this detailed feedback.
> > I have new details about this error. It only happens when vCMDQ is used
> > and only when VM with vCMDQ is started for the first time after host is
> > power cycled (reboot is not enough to trigger this error).
> > 
> > If this happens shutting down the VM and starting it again no longer
> > produce this error and I was able to run cuda-samples inside the VM.
> 
> Are you encountering the same behavior with raw QEMU command line when you
> power cycle the host and launch a vCMDQ VM for the first time?

Yes, I was able to reproduce it by running qemu directly as root.

Pavel

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Nathan Chen via Devel 2 days, 12 hours ago


On 1/30/2026 8:09 AM, Pavel Hrdina wrote:
> On Thu, Jan 29, 2026 at 02:39:52PM -0800, Nathan Chen wrote:
>>
>> On 1/29/2026 11:58 AM, Pavel Hrdina wrote:
>>> On Mon, Jan 26, 2026 at 05:17:02PM -0800, Nathan Chen wrote:
>>>> On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
>>>>> On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
>>>>>> On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
>>>>>>> On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
>>>>>>>> From: Nathan Chen<nathanc@nvidia.com>
>>>>>>>>
>>>>>>>> Implement the IOMMU_OPTION_RLIMIT_MODE
>>>>>>>> ioctl to set per-process memory accounting for
>>>>>>>> iommufd. This prevents ENOMEM errors from the
>>>>>>>> default per-user memory accounting when multiple
>>>>>>>> VMs under the libvirt-qemu user have their pinned
>>>>>>>> memory summed and checked against a per-process
>>>>>>>> RLIMIT_MEMLOCK limit.
>>>>>>>>
>>>>>>>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>>>>>>>> ---
>>>>>>>>      meson.build              |   1 +
>>>>>>>>      po/POTFILES              |   1 +
>>>>>>>>      src/libvirt_private.syms |   3 ++
>>>>>>>>      src/util/meson.build     |   1 +
>>>>>>>>      src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>>>>>>>>      src/util/viriommufd.h    |  25 +++++++++
>>>>>>>>      6 files changed, 142 insertions(+)
>>>>>>>>      create mode 100644 src/util/viriommufd.c
>>>>>>>>      create mode 100644 src/util/viriommufd.h
>>>>>>>>
>>>>>>>> diff --git a/meson.build b/meson.build
>>>>>>>> index 964d1fa4e1..a6db70f13e 100644
>>>>>>>> --- a/meson.build
>>>>>>>> +++ b/meson.build
>>>>>>>> @@ -732,6 +732,7 @@ headers = [
>>>>>>>>        'ifaddrs.h',
>>>>>>>>        'libtasn1.h',
>>>>>>>>        'linux/kvm.h',
>>>>>>>> +  'linux/iommufd.h',
>>>>>>>>        'mntent.h',
>>>>>>>>        'net/ethernet.h',
>>>>>>>>        'net/if.h',
>>>>>>>> diff --git a/po/POTFILES b/po/POTFILES
>>>>>>>> index f0aad35c8c..c78d2b8000 100644
>>>>>>>> --- a/po/POTFILES
>>>>>>>> +++ b/po/POTFILES
>>>>>>>> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>>>>>>>>      src/util/viridentity.c
>>>>>>>>      src/util/virinhibitor.c
>>>>>>>>      src/util/virinitctl.c
>>>>>>>> +src/util/viriommufd.c
>>>>>>>>      src/util/viriscsi.c
>>>>>>>>      src/util/virjson.c
>>>>>>>>      src/util/virlease.c
>>>>>>>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>>>>>>>> index 6bffd2eb6d..7fa76a1ec3 100644
>>>>>>>> --- a/src/libvirt_private.syms
>>>>>>>> +++ b/src/libvirt_private.syms
>>>>>>>> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>>>>>>>>      virInitctlFifos;
>>>>>>>>      virInitctlSetRunLevel;
>>>>>>>> +# util/viriommufd.h
>>>>>>>> +virIOMMUFDSetRLimitMode;
>>>>>>>> +
>>>>>>>>      # util/viriscsi.h
>>>>>>>>      virISCSIConnectionLogin;
>>>>>>>>      virISCSIConnectionLogout;
>>>>>>>> diff --git a/src/util/meson.build b/src/util/meson.build
>>>>>>>> index 4950a795cc..9fb0aa0fe7 100644
>>>>>>>> --- a/src/util/meson.build
>>>>>>>> +++ b/src/util/meson.build
>>>>>>>> @@ -46,6 +46,7 @@ util_sources = [
>>>>>>>>        'viridentity.c',
>>>>>>>>        'virinhibitor.c',
>>>>>>>>        'virinitctl.c',
>>>>>>>> +  'viriommufd.c',
>>>>>>>>        'viriscsi.c',
>>>>>>>>        'virjson.c',
>>>>>>>>        'virkeycode.c',
>>>>>>>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>>>>>>>> new file mode 100644
>>>>>>>> index 0000000000..225c76f4b2
>>>>>>>> --- /dev/null
>>>>>>>> +++ b/src/util/viriommufd.c
>>>>>>>> @@ -0,0 +1,111 @@
>>>>>>>> +#include <config.h>
>>>>>>>> +
>>>>>>>> +#include "viriommufd.h"
>>>>>>>> +#include "virlog.h"
>>>>>>>> +#include "virerror.h"
>>>>>>>> +#include "virfile.h"
>>>>>>>> +
>>>>>>>> +#ifdef __linux__
>>>>>>>> +
>>>>>>>> +# include <sys/ioctl.h>
>>>>>>>> +# include <linux/types.h>
>>>>>>>> +
>>>>>>>> +# ifdef HAVE_LINUX_IOMMUFD_H
>>>>>>>> +#  include <linux/iommufd.h>
>>>>>>>> +# endif
>>>>>>>> +
>>>>>>>> +# define VIR_FROM_THIS VIR_FROM_NONE
>>>>>>>> +
>>>>>>>> +VIR_LOG_INIT("util.iommufd");
>>>>>>>> +
>>>>>>>> +# ifndef IOMMU_OPTION
>>>>>>>> +
>>>>>>>> +enum iommufd_option {
>>>>>>>> +    IOMMU_OPTION_RLIMIT_MODE = 0,
>>>>>>>> +    IOMMU_OPTION_HUGE_PAGES = 1,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +enum iommufd_option_ops {
>>>>>>>> +    IOMMU_OPTION_OP_SET = 0,
>>>>>>>> +    IOMMU_OPTION_OP_GET = 1,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +struct iommu_option {
>>>>>>>> +    __u32 size;
>>>>>>>> +    __u32 option_id;
>>>>>>>> +    __u16 op;
>>>>>>>> +    __u16 __reserved;
>>>>>>>> +    __u32 object_id;
>>>>>>>> +    __aligned_u64 val64;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +#  define IOMMUFD_TYPE (';')
>>>>>>>> +#  define IOMMUFD_CMD_OPTION 0x87
>>>>>>>> +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
>>>>>>>> +
>>>>>>>> +# endif
>>>>>>>> +
>>>>>>>> +/**
>>>>>>>> + * virIOMMUFDSetRLimitMode:
>>>>>>>> + * @fd: iommufd file descriptor
>>>>>>>> + * @processAccounting: true for per-process, false for per-user
>>>>>>>> + *
>>>>>>>> + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
>>>>>>>> + *
>>>>>>>> + * Returns: 0 on success, -1 on error
>>>>>>>> + */
>>>>>>>> +int
>>>>>>>> +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
>>>>>>>> +{
>>>>>>>> +    struct iommu_option option = {
>>>>>>>> +        .size = sizeof(struct iommu_option),
>>>>>>>> +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
>>>>>>>> +        .op = IOMMU_OPTION_OP_SET,
>>>>>>>> +        .__reserved = 0,
>>>>>>>> +        .object_id = 0,
>>>>>>>> +        .val64 = processAccounting ? 1 : 0,
>>>>>>>> +    };
>>>>>>>> +
>>>>>>>> +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
>>>>>>>> +        switch (errno) {
>>>>>>>> +            case ENOTTY:
>>>>>>>> +                VIR_WARN("IOMMU_OPTION ioctl not supported");
>>>>>>>> +                return 0;
>>>>>>>> +
>>>>>>>> +            case EOPNOTSUPP:
>>>>>>>> +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
>>>>>>>> +                return 0;
>>>>>>>> +
>>>>>>>> +            case EINVAL:
>>>>>>>> +                virReportSystemError(errno, "%s",
>>>>>>>> +                                     _("invalid iommufd option parameters"));
>>>>>>>> +                return -1;
>>>>>>>> +
>>>>>>>> +            case EPERM:
>>>>>>>> +                VIR_WARN("Permission denied for IOMMU_OPTION ioctl. "
>>>>>>>> +                         "Per-user-based memory accounting to be used by default.");
>>>>>>>> +                return 0;
>>>>>>>> +
>>>>>>>> +            default:
>>>>>>>> +                virReportSystemError(errno, "%s",
>>>>>>>> +                                     _("failed to set iommufd option"));
>>>>>>>> +                return -1;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>> In my previous testing this part of code was not used so no rlimit was
>>>>>>> configured for the grace hopper GPU that was assigned to a VM.
>>>>>>>
>>>>>>> The VM OS was able to see the GPU and I was able to run cuda-samples
>>>>>>> with most of them passing. This setup didn't use vCMDQ or EGM. When I
>>>>>>> tried patches that add support for vCMDQ I was no longer able to use the
>>>>>>> GPU inside the VM until this code was called or setting
>>>>>>> "setcap cap_ipc_lock=ep" on the qemu binary but it was still detected
>>>>>>> inside the VM and the VM was started successfully.
>>>>>>>
>>>>>>> So is this required for all devices that want to use iommufd in order
>>>>>>> for them to work correctly inside the VM? Or is it necessary only when
>>>>>>> specific features are used?
>>>>>>>
>>>>>> I don’t think the ioctl is required for all devices, but vCMDQ can increase
>>>>>> accounted pinned memory over the per‑user memory locking limit. vCMDQ
>>>>>> introduces additional guest‑RAM backed queues that could be the extra
>>>>>> pinned/accounted memory pushing over the memory locking limit. Additionally,
>>>>>> attempting to launch a second iommufd VM could increase accounted memory
>>>>>> over the per-user memory locking limit.
>>>>> If that ioctl call is not required for all devices we should not call it
>>>>> unconditionally for all VMs that will try to use iommufd with any
>>>>> device.
>>>>>
>>>>> Libvirt tries to guess correct memory limit for specific cases, see
>>>>> function qemuDomainGetMemLockLimitBytes() .
>>>>>
>>>>> If I manually set 64G hard_limit for VM with 32G ram everything works
>>>>> even without calling tha ioctl:
>>>>>
>>>>>      <memtune>
>>>>>        <hard_limit unit='GiB'>64</hard_limit>
>>>>>      </memtune>
>>>>>
>>>>> So if we can figure out some reasonable overhead when vCMDQ is used that
>>>>> would be better solution.
>>>>>
>>>> It makes sense that the ioctl should not be used blindly for every iommufd
>>>> VM. Would you be open to gating the per-process accounting behind a config
>>>> setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps the
>>>> default behavior unchanged while accounting for the multi-VM failure case.
>>> I have no HW with multiple GPUs available to test if this is required or
>>> not in order to start multiple VMs each using one GPU.
>>>
>>> Currently based on my testing for single VM it is not required. Are you
>>> sure if we need this? If not we can remove this patch.
>>>
>> I am sure we need this - I just reproduced the behavior again by removing
>> the call to this ioctl and launching a second VM when another VM is already
>> up. The second VM does not boot and we see the following error:
> Thanks for testing this, I've managed to get a system with multiple
> network devices and managed to reproduce the same error. So it looks
> like this is necessary every time iommufd is used.
> 
> As I already mentioned in this case we should always error out if we
> cannot set per-process accounting to make sure that the VM will stay
> within limits set for it by libvirt.
> 
Ok, I will add a change to error out if we cannot set per-process 
accounting and continue to try to set it every time iommufd is used. 
Thanks for testing and verifying the issue.

>> 2026-01-29 22:35:29.927+0000: 291942: error : qemuProcessReportLogError:2151
>> : internal error: QEMU unexpectedly closed the monitor (vm='1gpu-vm-2'):
>> 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
>> vfio hostdev0: memory listener initialization failed: Region ram-node0:
>> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
>> 0xfeb733e00000) = -12 (Cannot allocate memory)
>> error: Failed to start domain '1gpu-vm-2'
>> error: internal error: QEMU unexpectedly closed the monitor
>> (vm='1gpu-vm-2'): 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
>> vfio hostdev0: memory listener initialization failed: Region ram-node0:
>> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
>> 0xfeb733e00000) = -12 (Cannot allocate memory)
>>
>>
>>>> Separately, I'd be happy to add memlock limit adjustments in the vCMDQ
>>>> Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is
>>>> enabled.
>>> It seems that there is no need to make any changes to current code,
>>> libvirt already adds extra 1GiB if there is single PCI hostdev attached
>>> to the VM.
>>>
>>>>>> For the case you observed, if it were truly a single isolated QEMU process
>>>>>> with no other memlocked usage under the same uid, per‑process vs per‑user
>>>>>> should be identical. The fact that switching to per‑process memory
>>>>>> accounting fixes the issue suggests there is additional memlocked usage
>>>>>> being charged to the libvirt‑qemu uid (e.g. other processes, helper daemons,
>>>>>> or device‑related accounting). vCMDQ just pushes the summed memory over the
>>>>>> limit.
>>>>> When the limit was not high enough I got the following errors in host
>>>>> dmesg:
>>>>>
>>>>> [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
>>>>>
>>>>> I think this needs additional work in QEMU, starting VM should error out
>>>>> if it hits the memory limit instead of silently starting broken VM
>>>>> configuration.
>>>> Ok, I will discuss with Shameer about erroring out if it hits the memory
>>>> limit. Thank you for testing and providing this detailed feedback.
>>> I have new details about this error. It only happens when vCMDQ is used
>>> and only when VM with vCMDQ is started for the first time after host is
>>> power cycled (reboot is not enough to trigger this error).
>>>
>>> If this happens shutting down the VM and starting it again no longer
>>> produce this error and I was able to run cuda-samples inside the VM.
>> Are you encountering the same behavior with raw QEMU command line when you
>> power cycle the host and launch a vCMDQ VM for the first time?
> Yes I was able to reproduce it by running qemu directly as root.

This should be addressed by the following patch: 
https://lore.kernel.org/linux-iommu/20260129224341.1594785-1-nicolinc@nvidia.com/

Nathan

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Pavel Hrdina via Devel 1 week, 6 days ago

On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
> From: Nathan Chen <nathanc@nvidia.com>
> 
> Implement the IOMMU_OPTION_RLIMIT_MODE
> ioctl to set per-process memory accounting for
> iommufd. This prevents ENOMEM errors from the
> default per-user memory accounting when multiple
> VMs under the libvirt-qemu user have their pinned
> memory summed and checked against a per-process
> RLIMIT_MEMLOCK limit.
> 
> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
> ---
>  meson.build              |   1 +
>  po/POTFILES              |   1 +
>  src/libvirt_private.syms |   3 ++
>  src/util/meson.build     |   1 +
>  src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>  src/util/viriommufd.h    |  25 +++++++++
>  6 files changed, 142 insertions(+)
>  create mode 100644 src/util/viriommufd.c
>  create mode 100644 src/util/viriommufd.h
> 
> diff --git a/meson.build b/meson.build
> index 964d1fa4e1..a6db70f13e 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -732,6 +732,7 @@ headers = [
>    'ifaddrs.h',
>    'libtasn1.h',
>    'linux/kvm.h',
> +  'linux/iommufd.h',
>    'mntent.h',
>    'net/ethernet.h',
>    'net/if.h',
> diff --git a/po/POTFILES b/po/POTFILES
> index f0aad35c8c..c78d2b8000 100644
> --- a/po/POTFILES
> +++ b/po/POTFILES
> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>  src/util/viridentity.c
>  src/util/virinhibitor.c
>  src/util/virinitctl.c
> +src/util/viriommufd.c
>  src/util/viriscsi.c
>  src/util/virjson.c
>  src/util/virlease.c
> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> index 6bffd2eb6d..7fa76a1ec3 100644
> --- a/src/libvirt_private.syms
> +++ b/src/libvirt_private.syms
> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>  virInitctlFifos;
>  virInitctlSetRunLevel;
>  
> +# util/viriommufd.h
> +virIOMMUFDSetRLimitMode;
> +
>  # util/viriscsi.h
>  virISCSIConnectionLogin;
>  virISCSIConnectionLogout;
> diff --git a/src/util/meson.build b/src/util/meson.build
> index 4950a795cc..9fb0aa0fe7 100644
> --- a/src/util/meson.build
> +++ b/src/util/meson.build
> @@ -46,6 +46,7 @@ util_sources = [
>    'viridentity.c',
>    'virinhibitor.c',
>    'virinitctl.c',
> +  'viriommufd.c',
>    'viriscsi.c',
>    'virjson.c',
>    'virkeycode.c',
> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> new file mode 100644
> index 0000000000..225c76f4b2
> --- /dev/null
> +++ b/src/util/viriommufd.c
> @@ -0,0 +1,111 @@
> +#include <config.h>
> +
> +#include "viriommufd.h"
> +#include "virlog.h"
> +#include "virerror.h"
> +#include "virfile.h"
> +
> +#ifdef __linux__
> +
> +# include <sys/ioctl.h>
> +# include <linux/types.h>
> +
> +# ifdef HAVE_LINUX_IOMMUFD_H
> +#  include <linux/iommufd.h>
> +# endif
> +
> +# define VIR_FROM_THIS VIR_FROM_NONE
> +
> +VIR_LOG_INIT("util.iommufd");

Move these two before #ifdef __linux__ as they don't depend on Linux, and
not having VIR_FROM_THIS defined breaks compilation on non-Linux systems
because the else branch calls virReportError().

Pavel

Re: [PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd

Posted by Nathan Chen via Devel 1 week, 5 days ago


On 1/19/2026 5:41 AM, Pavel Hrdina wrote:
> On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
>> From: Nathan Chen<nathanc@nvidia.com>
>>
>> Implement the IOMMU_OPTION_RLIMIT_MODE
>> ioctl to set per-process memory accounting for
>> iommufd. This prevents ENOMEM errors from the
>> default per-user memory accounting when multiple
>> VMs under the libvirt-qemu user have their pinned
>> memory summed and checked against a per-process
>> RLIMIT_MEMLOCK limit.
>>
>> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
>> ---
>>   meson.build              |   1 +
>>   po/POTFILES              |   1 +
>>   src/libvirt_private.syms |   3 ++
>>   src/util/meson.build     |   1 +
>>   src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
>>   src/util/viriommufd.h    |  25 +++++++++
>>   6 files changed, 142 insertions(+)
>>   create mode 100644 src/util/viriommufd.c
>>   create mode 100644 src/util/viriommufd.h
>>
>> diff --git a/meson.build b/meson.build
>> index 964d1fa4e1..a6db70f13e 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -732,6 +732,7 @@ headers = [
>>     'ifaddrs.h',
>>     'libtasn1.h',
>>     'linux/kvm.h',
>> +  'linux/iommufd.h',
>>     'mntent.h',
>>     'net/ethernet.h',
>>     'net/if.h',
>> diff --git a/po/POTFILES b/po/POTFILES
>> index f0aad35c8c..c78d2b8000 100644
>> --- a/po/POTFILES
>> +++ b/po/POTFILES
>> @@ -303,6 +303,7 @@ src/util/virhostuptime.c
>>   src/util/viridentity.c
>>   src/util/virinhibitor.c
>>   src/util/virinitctl.c
>> +src/util/viriommufd.c
>>   src/util/viriscsi.c
>>   src/util/virjson.c
>>   src/util/virlease.c
>> diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
>> index 6bffd2eb6d..7fa76a1ec3 100644
>> --- a/src/libvirt_private.syms
>> +++ b/src/libvirt_private.syms
>> @@ -2646,6 +2646,9 @@ virInhibitorRelease;
>>   virInitctlFifos;
>>   virInitctlSetRunLevel;
>>   
>> +# util/viriommufd.h
>> +virIOMMUFDSetRLimitMode;
>> +
>>   # util/viriscsi.h
>>   virISCSIConnectionLogin;
>>   virISCSIConnectionLogout;
>> diff --git a/src/util/meson.build b/src/util/meson.build
>> index 4950a795cc..9fb0aa0fe7 100644
>> --- a/src/util/meson.build
>> +++ b/src/util/meson.build
>> @@ -46,6 +46,7 @@ util_sources = [
>>     'viridentity.c',
>>     'virinhibitor.c',
>>     'virinitctl.c',
>> +  'viriommufd.c',
>>     'viriscsi.c',
>>     'virjson.c',
>>     'virkeycode.c',
>> diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
>> new file mode 100644
>> index 0000000000..225c76f4b2
>> --- /dev/null
>> +++ b/src/util/viriommufd.c
>> @@ -0,0 +1,111 @@
>> +#include <config.h>
>> +
>> +#include "viriommufd.h"
>> +#include "virlog.h"
>> +#include "virerror.h"
>> +#include "virfile.h"
>> +
>> +#ifdef __linux__
>> +
>> +# include <sys/ioctl.h>
>> +# include <linux/types.h>
>> +
>> +# ifdef HAVE_LINUX_IOMMUFD_H
>> +#  include <linux/iommufd.h>
>> +# endif
>> +
>> +# define VIR_FROM_THIS VIR_FROM_NONE
>> +
>> +VIR_LOG_INIT("util.iommufd");
> Move these two before #ifdef __linux__ as they don't depend on Linux, and
> not having VIR_FROM_THIS defined breaks compilation on non-Linux systems
> because the else branch calls virReportError().

OK, that makes sense. I will move these out in the next revision.

Nathan

[PATCH v5 1/7] qemu: Implement support for associating iommufd to hostdev
[PATCH v5 2/7] qemu: Introduce privateData for hostdevs
[PATCH v5 3/7] qemu: Support per-process memory accounting for iommufd
[PATCH v5 4/7] qemu: open VFIO FDs from libvirt backend
[PATCH v5 5/7] qemu: open iommufd FD from libvirt backend
[PATCH v5 6/7] qemu: Update Cgroup, namespace, and seclabel for iommufd
[PATCH v5 7/7] tests: qemuxmlconfdata: provide iommufd sample XML and CLI args