qga: implement 'guest-get-nvidia-smi' command

[PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by João Vilaça 1 day, 11 hours ago

---
 qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
 qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
 qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 837be51c40..631a8a9ee6 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error **errp)
     return ret;
 }
 #endif
+
+GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
+{
+    const gchar *argv[] = {
+        "nvidia-smi",
+        "--query-gpu=index,name,driver_version,"
+            "temperature.gpu,utilization.gpu,utilization.memory,"
+            "memory.total,memory.free,memory.used",
+        "--format=csv,noheader,nounits",
+        NULL
+    };
+    g_autofree gchar *stdout_buf = NULL;
+    g_autofree gchar *stderr_buf = NULL;
+    gint exit_status;
+    GError *gerr = NULL;
+    GuestNvidiaGpuList *head = NULL, **tail = &head;
+
+    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
+                      G_SPAWN_SEARCH_PATH,
+                      NULL, NULL,
+                      &stdout_buf, &stderr_buf,
+                      &exit_status, &gerr)) {
+        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
+        g_error_free(gerr);
+        return NULL;
+    }
+
+    if (exit_status != 0) {
+        error_setg(errp, "nvidia-smi failed (exit %d): %s",
+                   exit_status, stderr_buf ? stderr_buf : "unknown error");
+        return NULL;
+    }
+
+    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
+    for (int i = 0; lines[i] != NULL; i++) {
+        gchar *line = g_strstrip(lines[i]);
+        if (*line == '\0') {
+            continue;
+        }
+
+        gchar **f = g_strsplit(line, ", ", 9);
+        if (g_strv_length(f) < 9) {
+            g_strfreev(f);
+            continue;
+        }
+
+        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
+        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
+        gpu->name               = g_strdup(g_strstrip(f[1]));
+        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
+        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
+        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
+        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
+        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
+        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
+        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
+
+        QAPI_LIST_APPEND(tail, gpu);
+        g_strfreev(f);
+    }
+    g_strfreev(lines);
+
+    return head;
+}
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index c0bf3467bd..a78d5b71f5 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -2764,3 +2764,67 @@ GuestNetworkRouteList *qmp_guest_network_get_route(Error **errp)
     g_hash_table_destroy(interface_metric_cache);
     return head;
 }
+
+GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
+{
+    const gchar *argv[] = {
+        "nvidia-smi",
+        "--query-gpu=index,name,driver_version,"
+            "temperature.gpu,utilization.gpu,utilization.memory,"
+            "memory.total,memory.free,memory.used",
+        "--format=csv,noheader,nounits",
+        NULL
+    };
+    g_autofree gchar *stdout_buf = NULL;
+    g_autofree gchar *stderr_buf = NULL;
+    gint exit_status;
+    GError *gerr = NULL;
+    GuestNvidiaGpuList *head = NULL, **tail = &head;
+
+    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
+                      G_SPAWN_SEARCH_PATH,
+                      NULL, NULL,
+                      &stdout_buf, &stderr_buf,
+                      &exit_status, &gerr)) {
+        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
+        g_error_free(gerr);
+        return NULL;
+    }
+
+    if (exit_status != 0) {
+        error_setg(errp, "nvidia-smi failed (exit %d): %s",
+                   exit_status, stderr_buf ? stderr_buf : "unknown error");
+        return NULL;
+    }
+
+    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
+    for (int i = 0; lines[i] != NULL; i++) {
+        gchar *line = g_strstrip(lines[i]);
+        if (*line == '\0') {
+            continue;
+        }
+
+        gchar **f = g_strsplit(line, ", ", 9);
+        if (g_strv_length(f) < 9) {
+            g_strfreev(f);
+            continue;
+        }
+
+        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
+        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
+        gpu->name               = g_strdup(g_strstrip(f[1]));
+        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
+        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
+        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
+        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
+        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
+        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
+        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
+
+        QAPI_LIST_APPEND(tail, gpu);
+        g_strfreev(f);
+    }
+    g_strfreev(lines);
+
+    return head;
+}
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index c57bc9a02f..8abbf71131 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1876,6 +1876,65 @@
   'if': { 'any': ['CONFIG_WIN32', 'CONFIG_GETLOADAVG'] }
 }
 
+##
+# @GuestNvidiaGpu:
+#
+# Information about a single NVIDIA GPU as reported by nvidia-smi.
+#
+# @index: GPU index (0-based), stable across reboots for a given
+#         hardware slot
+#
+# @name: GPU product name (e.g. "NVIDIA A100-SXM4-80GB")
+#
+# @driver-version: version string of the installed NVIDIA driver
+#
+# @temperature: GPU die temperature in degrees Celsius
+#
+# @gpu-utilization: GPU compute engine utilization in percent (0-100)
+#
+# @memory-utilization: GPU memory controller utilization in percent
+#                      (0-100)
+#
+# @memory-total: total framebuffer memory in MiB
+#
+# @memory-free: free framebuffer memory in MiB
+#
+# @memory-used: used framebuffer memory in MiB
+#
+# Since: 10.1
+##
+{ 'struct': 'GuestNvidiaGpu',
+  'data': {
+      'index':              'int',
+      'name':               'str',
+      'driver-version':     'str',
+      'temperature':        'int',
+      'gpu-utilization':    'int',
+      'memory-utilization': 'int',
+      'memory-total':       'int',
+      'memory-free':        'int',
+      'memory-used':        'int'
+  }
+}
+
+##
+# @guest-get-nvidia-smi:
+#
+# Query NVIDIA GPU information via nvidia-smi inside the guest.
+#
+# Returns one @GuestNvidiaGpu entry per physical GPU (or MIG instance)
+# detected by the NVIDIA driver.
+#
+# Errors:
+#   - If nvidia-smi is not installed or not found in $PATH
+#   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
+#     device)
+#
+# Since: 10.1
+##
+{ 'command': 'guest-get-nvidia-smi',
+  'returns': ['GuestNvidiaGpu'] }
+
 ##
 # @GuestNetworkRoute:
 #
-- 
2.53.0

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by Markus Armbruster 10 hours ago

You neglected to cc: me.  We recommend using scripts/get_maintainer.pl
to find all the maintainers, then using common sense to trim.

João Vilaça <machadovilaca@gmail.com> writes:

The commit message needs to explain why and how the patch is useful.

For a patch adding a command to qemu-ga, like this one, it needs to
state the command's anticipated use cases.

> ---
>  qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>  qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>  qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 187 insertions(+)
>
> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> index 837be51c40..631a8a9ee6 100644
> --- a/qga/commands-posix.c
> +++ b/qga/commands-posix.c
> @@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error **errp)
>      return ret;
>  }
>  #endif
> +
> +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> +{
> +    const gchar *argv[] = {
> +        "nvidia-smi",
> +        "--query-gpu=index,name,driver_version,"
> +            "temperature.gpu,utilization.gpu,utilization.memory,"
> +            "memory.total,memory.free,memory.used",
> +        "--format=csv,noheader,nounits",
> +        NULL
> +    };
> +    g_autofree gchar *stdout_buf = NULL;
> +    g_autofree gchar *stderr_buf = NULL;
> +    gint exit_status;
> +    GError *gerr = NULL;
> +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> +
> +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> +                      G_SPAWN_SEARCH_PATH,
> +                      NULL, NULL,
> +                      &stdout_buf, &stderr_buf,
> +                      &exit_status, &gerr)) {

Why not ga_run_command()?  Hmm, it throws away the command's output on
success.

Kostiantyn, should ga_run_command() be rewritten on top of
g_spawn_sync()?

> +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
> +        g_error_free(gerr);
> +        return NULL;
> +    }
> +
> +    if (exit_status != 0) {
> +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
> +                   exit_status, stderr_buf ? stderr_buf : "unknown error");

This is wrong if @stderr_buf can contain newlines.  qapi/error.h:

 * The resulting message should be a single phrase, with no newline or
 * trailing punctuation.

However, there's similar misuse elsewhere in this file.  Oh well, carry
on.

> +        return NULL;
> +    }
> +

I figure the command's output is some form of CSV.  Can you point to its
documentation?

> +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
> +    for (int i = 0; lines[i] != NULL; i++) {
> +        gchar *line = g_strstrip(lines[i]);
> +        if (*line == '\0') {
> +            continue;

Silently ignore empty lines.  Okay.

> +        }
> +
> +        gchar **f = g_strsplit(line, ", ", 9);

If the line has more than 9 values, they are squashed into the last one.

> +        if (g_strv_length(f) < 9) {
> +            g_strfreev(f);
> +            continue;

Silently ignore lines with less than 9 values.

> +        }
> +
> +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
> +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);

If the value doesn't parse as decimal signed integer, we silently assume
zero.

If it parses, we silently ignore any text following it.

If it parses a value outside 64 bit signed range, we silently assume its
largest or smallest value.

> +        gpu->name               = g_strdup(g_strstrip(f[1]));
> +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
> +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
> +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
> +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
> +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
> +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
> +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
> +
> +        QAPI_LIST_APPEND(tail, gpu);
> +        g_strfreev(f);
> +    }
> +    g_strfreev(lines);

Are you *sure* this is robust enough?

Please consider a bog-standard LL(1) parser.

> +
> +    return head;
> +}
> diff --git a/qga/commands-win32.c b/qga/commands-win32.c
> index c0bf3467bd..a78d5b71f5 100644
> --- a/qga/commands-win32.c
> +++ b/qga/commands-win32.c
> @@ -2764,3 +2764,67 @@ GuestNetworkRouteList *qmp_guest_network_get_route(Error **errp)
>      g_hash_table_destroy(interface_metric_cache);
>      return head;
>  }
> +
> +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> +{
> +    const gchar *argv[] = {
> +        "nvidia-smi",
> +        "--query-gpu=index,name,driver_version,"
> +            "temperature.gpu,utilization.gpu,utilization.memory,"
> +            "memory.total,memory.free,memory.used",
> +        "--format=csv,noheader,nounits",
> +        NULL
> +    };
> +    g_autofree gchar *stdout_buf = NULL;
> +    g_autofree gchar *stderr_buf = NULL;
> +    gint exit_status;
> +    GError *gerr = NULL;
> +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> +
> +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> +                      G_SPAWN_SEARCH_PATH,
> +                      NULL, NULL,
> +                      &stdout_buf, &stderr_buf,
> +                      &exit_status, &gerr)) {
> +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
> +        g_error_free(gerr);
> +        return NULL;
> +    }
> +
> +    if (exit_status != 0) {
> +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
> +                   exit_status, stderr_buf ? stderr_buf : "unknown error");
> +        return NULL;
> +    }
> +
> +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
> +    for (int i = 0; lines[i] != NULL; i++) {
> +        gchar *line = g_strstrip(lines[i]);
> +        if (*line == '\0') {
> +            continue;
> +        }
> +
> +        gchar **f = g_strsplit(line, ", ", 9);
> +        if (g_strv_length(f) < 9) {
> +            g_strfreev(f);
> +            continue;
> +        }
> +
> +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
> +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
> +        gpu->name               = g_strdup(g_strstrip(f[1]));
> +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
> +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
> +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
> +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
> +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
> +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
> +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
> +
> +        QAPI_LIST_APPEND(tail, gpu);
> +        g_strfreev(f);
> +    }
> +    g_strfreev(lines);
> +
> +    return head;
> +}

Duplicates the output parser.  Can you avoid that?

> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> index c57bc9a02f..8abbf71131 100644
> --- a/qga/qapi-schema.json
> +++ b/qga/qapi-schema.json
> @@ -1876,6 +1876,65 @@
>    'if': { 'any': ['CONFIG_WIN32', 'CONFIG_GETLOADAVG'] }
>  }
>  
> +##
> +# @GuestNvidiaGpu:
> +#
> +# Information about a single NVIDIA GPU as reported by nvidia-smi.
> +#
> +# @index: GPU index (0-based), stable across reboots for a given
> +#         hardware slot

Please format like

   # @index: GPU index (0-based), stable across reboots for a given
   #     hardware slot

> +#
> +# @name: GPU product name (e.g. "NVIDIA A100-SXM4-80GB")
> +#
> +# @driver-version: version string of the installed NVIDIA driver
> +#
> +# @temperature: GPU die temperature in degrees Celsius
> +#
> +# @gpu-utilization: GPU compute engine utilization in percent (0-100)

(0-100) feels redundant.

> +#
> +# @memory-utilization: GPU memory controller utilization in percent
> +#                      (0-100)

Likewise.

> +#
> +# @memory-total: total framebuffer memory in MiB
> +#
> +# @memory-free: free framebuffer memory in MiB
> +#
> +# @memory-used: used framebuffer memory in MiB
> +#
> +# Since: 10.1

11.1 most likely.

> +##
> +{ 'struct': 'GuestNvidiaGpu',
> +  'data': {
> +      'index':              'int',
> +      'name':               'str',
> +      'driver-version':     'str',
> +      'temperature':        'int',
> +      'gpu-utilization':    'int',
> +      'memory-utilization': 'int',
> +      'memory-total':       'int',
> +      'memory-free':        'int',
> +      'memory-used':        'int'
> +  }
> +}
> +
> +##
> +# @guest-get-nvidia-smi:
> +#
> +# Query NVIDIA GPU information via nvidia-smi inside the guest.
> +#
> +# Returns one @GuestNvidiaGpu entry per physical GPU (or MIG instance)
> +# detected by the NVIDIA driver.
> +#
> +# Errors:
> +#   - If nvidia-smi is not installed or not found in $PATH
> +#   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
> +#     device)

We commonly mention the error kind like this:

   #   - If nvidia-smi is not installed or not found in $PATH,
   #     GenericError
   #   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
   #     device), GenericError

> +#
> +# Since: 10.1
> +##
> +{ 'command': 'guest-get-nvidia-smi',
> +  'returns': ['GuestNvidiaGpu'] }
> +
>  ##
>  # @GuestNetworkRoute:
>  #

Why not use existing guest-exec, and leave the parsing to the client?

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by Kostiantyn Kostiuk 10 hours ago

On Wed, Apr 1, 2026 at 2:25 PM Markus Armbruster <armbru@redhat.com> wrote:

> You neglected to cc: me.  We recommend to use scripts/get_maintainer.pl
> to find all the maintainers, then use common sense to trim.
>
> João Vilaça <machadovilaca@gmail.com> writes:
>
> The commit message needs to explain why and how the patch is useful.
>
> For a patch adding a command to qemu-ga, like this one, it needs to
> state the command's anticipated use cases.
>
> > ---
> >  qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
> >  qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
> >  qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 187 insertions(+)
> >
> > diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> > index 837be51c40..631a8a9ee6 100644
> > --- a/qga/commands-posix.c
> > +++ b/qga/commands-posix.c
> > @@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error **errp)
> >      return ret;
> >  }
> >  #endif
> > +
> > +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> > +{
> > +    const gchar *argv[] = {
> > +        "nvidia-smi",
> > +        "--query-gpu=index,name,driver_version,"
> > +            "temperature.gpu,utilization.gpu,utilization.memory,"
> > +            "memory.total,memory.free,memory.used",
> > +        "--format=csv,noheader,nounits",
> > +        NULL
> > +    };
> > +    g_autofree gchar *stdout_buf = NULL;
> > +    g_autofree gchar *stderr_buf = NULL;
> > +    gint exit_status;
> > +    GError *gerr = NULL;
> > +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> > +
> > +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> > +                      G_SPAWN_SEARCH_PATH,
> > +                      NULL, NULL,
> > +                      &stdout_buf, &stderr_buf,
> > +                      &exit_status, &gerr)) {
>
> Why not ga_run_command()?  Hmm, it throws away the command's output on
> success.
>
> Kostiantyn, should ga_run_command() be rewritten on top of
> g_spawn_sync()?
>

The idea of ga_run_command was different.
Now, we can improve it if we see new use cases.


>
> > +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
> > +        g_error_free(gerr);
> > +        return NULL;
> > +    }
> > +
> > +    if (exit_status != 0) {
> > +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
> > +                   exit_status, stderr_buf ? stderr_buf : "unknown
> error");
>
> This is wrong if @stderr_buf can contain newlines.  qapi/error.h:
>
>  * The resulting message should be a single phrase, with no newline or
>  * trailing punctuation.
>
> However, there's similar misuse elsewhere in this file.  Oh well, carry
> on.
>
> > +        return NULL;
> > +    }
> > +
>
> I figure the command's output is some form of CSV.  Can you point to its
> documentation?
>
> > +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
> > +    for (int i = 0; lines[i] != NULL; i++) {
> > +        gchar *line = g_strstrip(lines[i]);
> > +        if (*line == '\0') {
> > +            continue;
>
> Silently ignore empty lines.  Okay.
>
> > +        }
> > +
> > +        gchar **f = g_strsplit(line, ", ", 9);
>
> If the line has more than 9 values, they are squashed into the last one.
>
> > +        if (g_strv_length(f) < 9) {
> > +            g_strfreev(f);
> > +            continue;
>
> Silently ignore lines with less than 9 values.
>
> > +        }
> > +
> > +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
> > +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
>
> If the value doesn't parse as decimal signed integer, we silently assume
> zero.
>
> If it parses, we silently ignore any text following it.
>
> If it parses a value outside 64 bit signed range, we silently assume its
> largest or smallest value.
>
> > +        gpu->name               = g_strdup(g_strstrip(f[1]));
> > +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
> > +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
> > +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
> > +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
> > +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
> > +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
> > +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
> > +
> > +        QAPI_LIST_APPEND(tail, gpu);
> > +        g_strfreev(f);
> > +    }
> > +    g_strfreev(lines);
>
> Are you *sure* this is robust enough?
>
> Please consider a bog-standard LL(1) parser.
>
> > +
> > +    return head;
> > +}
> > diff --git a/qga/commands-win32.c b/qga/commands-win32.c
> > index c0bf3467bd..a78d5b71f5 100644
> > --- a/qga/commands-win32.c
> > +++ b/qga/commands-win32.c
> > @@ -2764,3 +2764,67 @@ GuestNetworkRouteList
> *qmp_guest_network_get_route(Error **errp)
> >      g_hash_table_destroy(interface_metric_cache);
> >      return head;
> >  }
> > +
> > +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> > +{
> > +    const gchar *argv[] = {
> > +        "nvidia-smi",
> > +        "--query-gpu=index,name,driver_version,"
> > +            "temperature.gpu,utilization.gpu,utilization.memory,"
> > +            "memory.total,memory.free,memory.used",
> > +        "--format=csv,noheader,nounits",
> > +        NULL
> > +    };
> > +    g_autofree gchar *stdout_buf = NULL;
> > +    g_autofree gchar *stderr_buf = NULL;
> > +    gint exit_status;
> > +    GError *gerr = NULL;
> > +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> > +
> > +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> > +                      G_SPAWN_SEARCH_PATH,
> > +                      NULL, NULL,
> > +                      &stdout_buf, &stderr_buf,
> > +                      &exit_status, &gerr)) {
> > +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
> > +        g_error_free(gerr);
> > +        return NULL;
> > +    }
> > +
> > +    if (exit_status != 0) {
> > +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
> > +                   exit_status, stderr_buf ? stderr_buf : "unknown
> error");
> > +        return NULL;
> > +    }
> > +
> > +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
> > +    for (int i = 0; lines[i] != NULL; i++) {
> > +        gchar *line = g_strstrip(lines[i]);
> > +        if (*line == '\0') {
> > +            continue;
> > +        }
> > +
> > +        gchar **f = g_strsplit(line, ", ", 9);
> > +        if (g_strv_length(f) < 9) {
> > +            g_strfreev(f);
> > +            continue;
> > +        }
> > +
> > +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
> > +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
> > +        gpu->name               = g_strdup(g_strstrip(f[1]));
> > +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
> > +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
> > +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
> > +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
> > +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
> > +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
> > +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
> > +
> > +        QAPI_LIST_APPEND(tail, gpu);
> > +        g_strfreev(f);
> > +    }
> > +    g_strfreev(lines);
> > +
> > +    return head;
> > +}
>
> Duplicates the output parser.  Can you avoid that?
>

There is an open question: who should be responsible for locating the
nvidia-smi command?
In the case of Windows, we know for certain that nvidia-smi will be in
Program Files or in the DriverStore, so it will be missing from PATH by
default.
If QGA should use this knowledge, then the Windows part will be different
(except for the parsing logic) because we will be looking for the
nvidia-smi path. If not, and the user must update PATH manually,
then this can be moved to commands.c


>
> > diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> > index c57bc9a02f..8abbf71131 100644
> > --- a/qga/qapi-schema.json
> > +++ b/qga/qapi-schema.json
> > @@ -1876,6 +1876,65 @@
> >    'if': { 'any': ['CONFIG_WIN32', 'CONFIG_GETLOADAVG'] }
> >  }
> >
> > +##
> > +# @GuestNvidiaGpu:
> > +#
> > +# Information about a single NVIDIA GPU as reported by nvidia-smi.
> > +#
> > +# @index: GPU index (0-based), stable across reboots for a given
> > +#         hardware slot
>
> Please format like
>
>    # @index: GPU index (0-based), stable across reboots for a given
>    #     hardware slot
>
> > +#
> > +# @name: GPU product name (e.g. "NVIDIA A100-SXM4-80GB")
> > +#
> > +# @driver-version: version string of the installed NVIDIA driver
> > +#
> > +# @temperature: GPU die temperature in degrees Celsius
> > +#
> > +# @gpu-utilization: GPU compute engine utilization in percent (0-100)
>
> (0-100) feels redundant.
>
> > +#
> > +# @memory-utilization: GPU memory controller utilization in percent
> > +#                      (0-100)
>
> Likewise.
>
> > +#
> > +# @memory-total: total framebuffer memory in MiB
> > +#
> > +# @memory-free: free framebuffer memory in MiB
> > +#
> > +# @memory-used: used framebuffer memory in MiB
> > +#
> > +# Since: 10.1
>
> 11.1 most likely.
>
> > +##
> > +{ 'struct': 'GuestNvidiaGpu',
> > +  'data': {
> > +      'index':              'int',
> > +      'name':               'str',
> > +      'driver-version':     'str',
> > +      'temperature':        'int',
> > +      'gpu-utilization':    'int',
> > +      'memory-utilization': 'int',
> > +      'memory-total':       'int',
> > +      'memory-free':        'int',
> > +      'memory-used':        'int'
> > +  }
> > +}
> > +
> > +##
> > +# @guest-get-nvidia-smi:
> > +#
> > +# Query NVIDIA GPU information via nvidia-smi inside the guest.
> > +#
> > +# Returns one @GuestNvidiaGpu entry per physical GPU (or MIG instance)
> > +# detected by the NVIDIA driver.
> > +#
> > +# Errors:
> > +#   - If nvidia-smi is not installed or not found in $PATH
> > +#   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
> > +#     device)
>
> We commonly mention the error kind like this:
>
>    #   - If nvidia-smi is not installed or not found in $PATH,
>    #     GenericError
>    #   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
>    #     device), GenericError
>
> > +#
> > +# Since: 10.1
> > +##
> > +{ 'command': 'guest-get-nvidia-smi',
> > +  'returns': ['GuestNvidiaGpu'] }
> > +
> >  ##
> >  # @GuestNetworkRoute:
> >  #
>
> Why not use existing guest-exec, and leave the parsing to the client?
>

I agree with Daniel that guest-exec is a command that should not exist.
For example, in RHEL/CentOS, this command is disabled by default, and
we recommend using SSH over VSock to run any command.
We have seen the problem several times with guest-exec. Also, we have an
upstream patch
for guest-exec + SELinux support to try to work around a security issue.
Maybe we should discuss marking guest-exec as deprecated.

So, I agree with the author to add a separate command instead.

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by Daniel P. Berrangé 10 hours ago

On Wed, Apr 01, 2026 at 02:50:31PM +0300, Kostiantyn Kostiuk wrote:
> On Wed, Apr 1, 2026 at 2:25 PM Markus Armbruster <armbru@redhat.com> wrote:
> 
> > You neglected to cc: me.  We recommend to use scripts/get_maintainer.pl
> > to find all the maintainers, then use common sense to trim.
> >
> > João Vilaça <machadovilaca@gmail.com> writes:
> >
> > The commit message needs to explain why and how the patch is useful.
> >
> > For a patch adding a command to qemu-ga, like this one, it needs to
> > state the command's anticipated use cases.
> >
> > > ---
> > >  qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
> > >  qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
> > >  qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
> > >  3 files changed, 187 insertions(+)
> > >
> > > diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> > > index 837be51c40..631a8a9ee6 100644
> > > --- a/qga/commands-posix.c
> > > +++ b/qga/commands-posix.c
> > > @@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error **errp)
> > >      return ret;
> > >  }
> > >  #endif
> > > +
> > > +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> > > +{
> > > +    const gchar *argv[] = {
> > > +        "nvidia-smi",
> > > +        "--query-gpu=index,name,driver_version,"
> > > +            "temperature.gpu,utilization.gpu,utilization.memory,"
> > > +            "memory.total,memory.free,memory.used",
> > > +        "--format=csv,noheader,nounits",
> > > +        NULL
> > > +    };
> > > +    g_autofree gchar *stdout_buf = NULL;
> > > +    g_autofree gchar *stderr_buf = NULL;
> > > +    gint exit_status;
> > > +    GError *gerr = NULL;
> > > +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> > > +
> > > +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> > > +                      G_SPAWN_SEARCH_PATH,
> > > +                      NULL, NULL,
> > > +                      &stdout_buf, &stderr_buf,
> > > +                      &exit_status, &gerr)) {
> >
> > Why not ga_run_command()?  Hmm, it throws away the command's output on
> > success.
> >
> > Kostiantyn, should ga_run_command() be rewritten on top of
> > g_spawn_sync()?
> >
> 
> The idea of ga_run_command was different.
> Now, we can improve it if we see new use cases.

It looks pretty simple to modify the existing ga_run_command to
return stdout &stderr as buffers, rather than discarding on
success.

Re-writing to use g_spawn_sync would not be required for that,
though it might be a nice thing to do at some point for platform
portability.


> > > +#
> > > +# Since: 10.1
> > > +##
> > > +{ 'command': 'guest-get-nvidia-smi',
> > > +  'returns': ['GuestNvidiaGpu'] }
> > > +
> > >  ##
> > >  # @GuestNetworkRoute:
> > >  #
> >
> > Why not use existing guest-exec, and leave the parsing to the client?
> >
> 
> I agree with Daniel that guest-exec is a command that should not exist.
> For example, in RHEL/CentOS, this command is disabled by default, and
> we recommend using SSH over VSock to run any command.
> We have seen the problem several times with guest-exec. Also, we have an
> upstream patch
> for guest-exec + SELinux support to try to workaround a security issue.
> Maybe we should discuss marking guest-exec as deprecated.
> 
> So, I agree with the author to add a separate command instead.

I'm somewhat sceptical that any of this should be in scope for the
QEMU agent, as opposed to being left as a job for an existing
general purpose monitoring agent.

With regards,
Daniel
-- 
|: https://berrange.com       ~~        https://hachyderm.io/@berrange :|
|: https://libvirt.org          ~~          https://entangle-photo.org :|
|: https://pixelfed.art/berrange   ~~    https://fstop138.berrange.com :|

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by João Vilaça 10 hours ago

I tested initially with guest-exec but had issues with SELinux.
I was able to create specific rules to allow virt_qemu_ga_t to access
NVIDIA devices.
But while investigating, I saw a bug discussing guest-exec being disabled
in RHEL/CentOS and whether or not Fedora should do the same. So, I decided
to pursue this alternative.


On Wed, 1 Apr 2026 at 12:50, Kostiantyn Kostiuk <kkostiuk@redhat.com> wrote:

>
>
>
>
> On Wed, Apr 1, 2026 at 2:25 PM Markus Armbruster <armbru@redhat.com>
> wrote:
>
>> You neglected to cc: me.  We recommend to use scripts/get_maintainer.pl
>> to find all the maintainers, then use common sense to trim.
>>
>> João Vilaça <machadovilaca@gmail.com> writes:
>>
>> The commit message needs to explain why and how the patch is useful.
>>
>> For a patch adding a command to qemu-ga, like this one, it needs to
>> state the command's anticipated use cases.
>>
>> > ---
>> >  qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>> >  qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>> >  qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
>> >  3 files changed, 187 insertions(+)
>> >
>> > diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> > index 837be51c40..631a8a9ee6 100644
>> > --- a/qga/commands-posix.c
>> > +++ b/qga/commands-posix.c
>> > @@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error
>> **errp)
>> >      return ret;
>> >  }
>> >  #endif
>> > +
>> > +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
>> > +{
>> > +    const gchar *argv[] = {
>> > +        "nvidia-smi",
>> > +        "--query-gpu=index,name,driver_version,"
>> > +            "temperature.gpu,utilization.gpu,utilization.memory,"
>> > +            "memory.total,memory.free,memory.used",
>> > +        "--format=csv,noheader,nounits",
>> > +        NULL
>> > +    };
>> > +    g_autofree gchar *stdout_buf = NULL;
>> > +    g_autofree gchar *stderr_buf = NULL;
>> > +    gint exit_status;
>> > +    GError *gerr = NULL;
>> > +    GuestNvidiaGpuList *head = NULL, **tail = &head;
>> > +
>> > +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
>> > +                      G_SPAWN_SEARCH_PATH,
>> > +                      NULL, NULL,
>> > +                      &stdout_buf, &stderr_buf,
>> > +                      &exit_status, &gerr)) {
>>
>> Why not ga_run_command()?  Hmm, it throws away the command's output on
>> success.
>>
>> Kostiantyn, should ga_run_command() be rewritten on top of
>> g_spawn_sync()?
>>
>
> The idea of ga_run_command was different.
> Now, we can improve it if we see new use cases.
>
>
>>
>> > +        error_setg(errp, "failed to run nvidia-smi: %s",
>> gerr->message);
>> > +        g_error_free(gerr);
>> > +        return NULL;
>> > +    }
>> > +
>> > +    if (exit_status != 0) {
>> > +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
>> > +                   exit_status, stderr_buf ? stderr_buf : "unknown
>> error");
>>
>> This is wrong if @stderr_buf can contain newlines.  qapi/error.h:
>>
>>  * The resulting message should be a single phrase, with no newline or
>>  * trailing punctuation.
>>
>> However, there's similar misuse elsewhere in this file.  Oh well, carry
>> on.
>>
>> > +        return NULL;
>> > +    }
>> > +
>>
>> I figure the command's output is some form of CSV.  Can you point to its
>> documentation?
>>
>> > +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
>> > +    for (int i = 0; lines[i] != NULL; i++) {
>> > +        gchar *line = g_strstrip(lines[i]);
>> > +        if (*line == '\0') {
>> > +            continue;
>>
>> Silently ignore empty lines.  Okay.
>>
>> > +        }
>> > +
>> > +        gchar **f = g_strsplit(line, ", ", 9);
>>
>> If the line has more than 9 values, they are squashed into the last one.
>>
>> > +        if (g_strv_length(f) < 9) {
>> > +            g_strfreev(f);
>> > +            continue;
>>
>> Silently ignore lines with less than 9 values.
>>
>> > +        }
>> > +
>> > +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
>> > +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
>>
>> If the value doesn't parse as decimal signed integer, we silently assume
>> zero.
>>
>> If it parses, we silently ignore any text following it.
>>
>> If it parses a value outside 64 bit signed range, we silently assume its
>> largest or smallest value.
>>
>> > +        gpu->name               = g_strdup(g_strstrip(f[1]));
>> > +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
>> > +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
>> > +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
>> > +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
>> > +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
>> > +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
>> > +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
>> > +
>> > +        QAPI_LIST_APPEND(tail, gpu);
>> > +        g_strfreev(f);
>> > +    }
>> > +    g_strfreev(lines);
>>
>> Are you *sure* this is robust enough?
>>
>> Please consider a bog-standard LL(1) parser.
>>
>> > +
>> > +    return head;
>> > +}
>> > diff --git a/qga/commands-win32.c b/qga/commands-win32.c
>> > index c0bf3467bd..a78d5b71f5 100644
>> > --- a/qga/commands-win32.c
>> > +++ b/qga/commands-win32.c
>> > @@ -2764,3 +2764,67 @@ GuestNetworkRouteList
>> *qmp_guest_network_get_route(Error **errp)
>> >      g_hash_table_destroy(interface_metric_cache);
>> >      return head;
>> >  }
>> > +
>> > +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
>> > +{
>> > +    const gchar *argv[] = {
>> > +        "nvidia-smi",
>> > +        "--query-gpu=index,name,driver_version,"
>> > +            "temperature.gpu,utilization.gpu,utilization.memory,"
>> > +            "memory.total,memory.free,memory.used",
>> > +        "--format=csv,noheader,nounits",
>> > +        NULL
>> > +    };
>> > +    g_autofree gchar *stdout_buf = NULL;
>> > +    g_autofree gchar *stderr_buf = NULL;
>> > +    gint exit_status;
>> > +    GError *gerr = NULL;
>> > +    GuestNvidiaGpuList *head = NULL, **tail = &head;
>> > +
>> > +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
>> > +                      G_SPAWN_SEARCH_PATH,
>> > +                      NULL, NULL,
>> > +                      &stdout_buf, &stderr_buf,
>> > +                      &exit_status, &gerr)) {
>> > +        error_setg(errp, "failed to run nvidia-smi: %s",
>> gerr->message);
>> > +        g_error_free(gerr);
>> > +        return NULL;
>> > +    }
>> > +
>> > +    if (exit_status != 0) {
>> > +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
>> > +                   exit_status, stderr_buf ? stderr_buf : "unknown
>> error");
>> > +        return NULL;
>> > +    }
>> > +
>> > +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
>> > +    for (int i = 0; lines[i] != NULL; i++) {
>> > +        gchar *line = g_strstrip(lines[i]);
>> > +        if (*line == '\0') {
>> > +            continue;
>> > +        }
>> > +
>> > +        gchar **f = g_strsplit(line, ", ", 9);
>> > +        if (g_strv_length(f) < 9) {
>> > +            g_strfreev(f);
>> > +            continue;
>> > +        }
>> > +
>> > +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
>> > +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
>> > +        gpu->name               = g_strdup(g_strstrip(f[1]));
>> > +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
>> > +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
>> > +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
>> > +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
>> > +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
>> > +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
>> > +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
>> > +
>> > +        QAPI_LIST_APPEND(tail, gpu);
>> > +        g_strfreev(f);
>> > +    }
>> > +    g_strfreev(lines);
>> > +
>> > +    return head;
>> > +}
>>
>> Duplicates the output parser.  Can you avoid that?
>>
>
> There is an open question of who should be looking for the nvidia-smi
> command?
> In the case of Windows, we definitely know that nvidia-smi will be in
> Program Files or
> in the DriverStore, so it will be missing in PATH by default.
> If QGA should use this knowledge, then the Windows part will be different
> (except for the parsing logic)
> because we will be looking for nvidia-smi path. If no, and the user must
> update PATH manually,
> then this can be moved to commands.c
>
>
>>
>> > diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>> > index c57bc9a02f..8abbf71131 100644
>> > --- a/qga/qapi-schema.json
>> > +++ b/qga/qapi-schema.json
>> > @@ -1876,6 +1876,65 @@
>> >    'if': { 'any': ['CONFIG_WIN32', 'CONFIG_GETLOADAVG'] }
>> >  }
>> >
>> > +##
>> > +# @GuestNvidiaGpu:
>> > +#
>> > +# Information about a single NVIDIA GPU as reported by nvidia-smi.
>> > +#
>> > +# @index: GPU index (0-based), stable across reboots for a given
>> > +#         hardware slot
>>
>> Please format like
>>
>>    # @index: GPU index (0-based), stable across reboots for a given
>>    #     hardware slot
>>
>> > +#
>> > +# @name: GPU product name (e.g. "NVIDIA A100-SXM4-80GB")
>> > +#
>> > +# @driver-version: version string of the installed NVIDIA driver
>> > +#
>> > +# @temperature: GPU die temperature in degrees Celsius
>> > +#
>> > +# @gpu-utilization: GPU compute engine utilization in percent (0-100)
>>
>> (0-100) feels redundant.
>>
>> > +#
>> > +# @memory-utilization: GPU memory controller utilization in percent
>> > +#                      (0-100)
>>
>> Likewise.
>>
>> > +#
>> > +# @memory-total: total framebuffer memory in MiB
>> > +#
>> > +# @memory-free: free framebuffer memory in MiB
>> > +#
>> > +# @memory-used: used framebuffer memory in MiB
>> > +#
>> > +# Since: 10.1
>>
>> 11.1 most likely.
>>
>> > +##
>> > +{ 'struct': 'GuestNvidiaGpu',
>> > +  'data': {
>> > +      'index':              'int',
>> > +      'name':               'str',
>> > +      'driver-version':     'str',
>> > +      'temperature':        'int',
>> > +      'gpu-utilization':    'int',
>> > +      'memory-utilization': 'int',
>> > +      'memory-total':       'int',
>> > +      'memory-free':        'int',
>> > +      'memory-used':        'int'
>> > +  }
>> > +}
>> > +
>> > +##
>> > +# @guest-get-nvidia-smi:
>> > +#
>> > +# Query NVIDIA GPU information via nvidia-smi inside the guest.
>> > +#
>> > +# Returns one @GuestNvidiaGpu entry per physical GPU (or MIG instance)
>> > +# detected by the NVIDIA driver.
>> > +#
>> > +# Errors:
>> > +#   - If nvidia-smi is not installed or not found in $PATH
>> > +#   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
>> > +#     device)
>>
>> We commonly mention the error kind like this:
>>
>>    #   - If nvidia-smi is not installed or not found in $PATH,
>>    #     GenericError
>>    #   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
>>    #     device), GenericError
>>
>> > +#
>> > +# Since: 10.1
>> > +##
>> > +{ 'command': 'guest-get-nvidia-smi',
>> > +  'returns': ['GuestNvidiaGpu'] }
>> > +
>> >  ##
>> >  # @GuestNetworkRoute:
>> >  #
>>
>> Why not use existing guest-exec, and leave the parsing to the client?
>>
>
> I agree with Daniel that guest-exec is a command that should not exist.
> For example, in RHEL/CentOS, this command is disabled by default, and
> we recommend using SSH over VSock to run any command.
> We have seen the problem several times with guest-exec. Also, we have an
> upstream patch
> for guest-exec + SELinux support to try to workaround a security issue.
> Maybe we should discuss marking guest-exec as deprecated.
>
> So, I agree with the author to add a separate command instead.
>
>
>
>
>

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by Kostiantyn Kostiuk 1 day, 9 hours ago

Hi João,

Thanks for your patch. Can you explain the use case of this command?

Best Regards,
Kostiantyn Kostiuk.


On Tue, Mar 31, 2026 at 2:02 PM João Vilaça <machadovilaca@gmail.com> wrote:

> ---
>  qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>  qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>  qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 187 insertions(+)
>
> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> index 837be51c40..631a8a9ee6 100644
> --- a/qga/commands-posix.c
> +++ b/qga/commands-posix.c
> @@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error **errp)
>      return ret;
>  }
>  #endif
> +
> +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> +{
> +    const gchar *argv[] = {
> +        "nvidia-smi",
> +        "--query-gpu=index,name,driver_version,"
> +            "temperature.gpu,utilization.gpu,utilization.memory,"
> +            "memory.total,memory.free,memory.used",
> +        "--format=csv,noheader,nounits",
> +        NULL
> +    };
> +    g_autofree gchar *stdout_buf = NULL;
> +    g_autofree gchar *stderr_buf = NULL;
> +    gint exit_status;
> +    GError *gerr = NULL;
> +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> +
> +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> +                      G_SPAWN_SEARCH_PATH,
> +                      NULL, NULL,
> +                      &stdout_buf, &stderr_buf,
> +                      &exit_status, &gerr)) {
> +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
> +        g_error_free(gerr);
> +        return NULL;
> +    }
> +
> +    if (exit_status != 0) {
> +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
> +                   exit_status, stderr_buf ? stderr_buf : "unknown
> error");
> +        return NULL;
> +    }
> +
> +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
> +    for (int i = 0; lines[i] != NULL; i++) {
> +        gchar *line = g_strstrip(lines[i]);
> +        if (*line == '\0') {
> +            continue;
> +        }
> +
> +        gchar **f = g_strsplit(line, ", ", 9);
> +        if (g_strv_length(f) < 9) {
> +            g_strfreev(f);
> +            continue;
> +        }
> +
> +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
> +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
> +        gpu->name               = g_strdup(g_strstrip(f[1]));
> +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
> +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
> +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
> +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
> +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
> +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
> +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
> +
> +        QAPI_LIST_APPEND(tail, gpu);
> +        g_strfreev(f);
> +    }
> +    g_strfreev(lines);
> +
> +    return head;
> +}
> diff --git a/qga/commands-win32.c b/qga/commands-win32.c
> index c0bf3467bd..a78d5b71f5 100644
> --- a/qga/commands-win32.c
> +++ b/qga/commands-win32.c
> @@ -2764,3 +2764,67 @@ GuestNetworkRouteList
> *qmp_guest_network_get_route(Error **errp)
>      g_hash_table_destroy(interface_metric_cache);
>      return head;
>  }
> +
> +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
> +{
> +    const gchar *argv[] = {
> +        "nvidia-smi",
> +        "--query-gpu=index,name,driver_version,"
> +            "temperature.gpu,utilization.gpu,utilization.memory,"
> +            "memory.total,memory.free,memory.used",
> +        "--format=csv,noheader,nounits",
> +        NULL
> +    };
> +    g_autofree gchar *stdout_buf = NULL;
> +    g_autofree gchar *stderr_buf = NULL;
> +    gint exit_status;
> +    GError *gerr = NULL;
> +    GuestNvidiaGpuList *head = NULL, **tail = &head;
> +
> +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
> +                      G_SPAWN_SEARCH_PATH,
> +                      NULL, NULL,
> +                      &stdout_buf, &stderr_buf,
> +                      &exit_status, &gerr)) {
> +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
> +        g_error_free(gerr);
> +        return NULL;
> +    }
> +
> +    if (exit_status != 0) {
> +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
> +                   exit_status, stderr_buf ? stderr_buf : "unknown
> error");
> +        return NULL;
> +    }
> +
> +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
> +    for (int i = 0; lines[i] != NULL; i++) {
> +        gchar *line = g_strstrip(lines[i]);
> +        if (*line == '\0') {
> +            continue;
> +        }
> +
> +        gchar **f = g_strsplit(line, ", ", 9);
> +        if (g_strv_length(f) < 9) {
> +            g_strfreev(f);
> +            continue;
> +        }
> +
> +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
> +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
> +        gpu->name               = g_strdup(g_strstrip(f[1]));
> +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
> +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
> +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
> +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
> +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
> +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
> +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
> +
> +        QAPI_LIST_APPEND(tail, gpu);
> +        g_strfreev(f);
> +    }
> +    g_strfreev(lines);
> +
> +    return head;
> +}
> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> index c57bc9a02f..8abbf71131 100644
> --- a/qga/qapi-schema.json
> +++ b/qga/qapi-schema.json
> @@ -1876,6 +1876,65 @@
>    'if': { 'any': ['CONFIG_WIN32', 'CONFIG_GETLOADAVG'] }
>  }
>
> +##
> +# @GuestNvidiaGpu:
> +#
> +# Information about a single NVIDIA GPU as reported by nvidia-smi.
> +#
> +# @index: GPU index (0-based), stable across reboots for a given
> +#         hardware slot
> +#
> +# @name: GPU product name (e.g. "NVIDIA A100-SXM4-80GB")
> +#
> +# @driver-version: version string of the installed NVIDIA driver
> +#
> +# @temperature: GPU die temperature in degrees Celsius
> +#
> +# @gpu-utilization: GPU compute engine utilization in percent (0-100)
> +#
> +# @memory-utilization: GPU memory controller utilization in percent
> +#                      (0-100)
> +#
> +# @memory-total: total framebuffer memory in MiB
> +#
> +# @memory-free: free framebuffer memory in MiB
> +#
> +# @memory-used: used framebuffer memory in MiB
> +#
> +# Since: 10.1
> +##
> +{ 'struct': 'GuestNvidiaGpu',
> +  'data': {
> +      'index':              'int',
> +      'name':               'str',
> +      'driver-version':     'str',
> +      'temperature':        'int',
> +      'gpu-utilization':    'int',
> +      'memory-utilization': 'int',
> +      'memory-total':       'int',
> +      'memory-free':        'int',
> +      'memory-used':        'int'
> +  }
> +}
> +
> +##
> +# @guest-get-nvidia-smi:
> +#
> +# Query NVIDIA GPU information via nvidia-smi inside the guest.
> +#
> +# Returns one @GuestNvidiaGpu entry per physical GPU (or MIG instance)
> +# detected by the NVIDIA driver.
> +#
> +# Errors:
> +#   - If nvidia-smi is not installed or not found in $PATH
> +#   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
> +#     device)
> +#
> +# Since: 10.1
> +##
> +{ 'command': 'guest-get-nvidia-smi',
> +  'returns': ['GuestNvidiaGpu'] }
> +
>  ##
>  # @GuestNetworkRoute:
>  #
> --
> 2.53.0
>
>

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by João Vilaça 1 day, 9 hours ago

Monitoring, on the hypervisor side, of GPU workloads running in guests with
NVIDIA GPU passthrough (VFIO) or vGPU (NVIDIA GRID).
Operators who want GPU metrics from a fleet of VMs must either SSH into
each guest or deploy a separate monitoring agent inside the guest itself
(e.g. DCGM exporter).

On Tue, 31 Mar 2026 at 13:57, Kostiantyn Kostiuk <kkostiuk@redhat.com>
wrote:

> Hi João,
>
> Thanks for your patch. Can you explain the use case of this command?
>
> Best Regards,
> Kostiantyn Kostiuk.
>
>
> On Tue, Mar 31, 2026 at 2:02 PM João Vilaça <machadovilaca@gmail.com>
> wrote:
>
>> ---
>>  qga/commands-posix.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>>  qga/commands-win32.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
>>  qga/qapi-schema.json | 59 ++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 187 insertions(+)
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index 837be51c40..631a8a9ee6 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -1415,3 +1415,67 @@ GuestLoadAverage *qmp_guest_get_load(Error **errp)
>>      return ret;
>>  }
>>  #endif
>> +
>> +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
>> +{
>> +    const gchar *argv[] = {
>> +        "nvidia-smi",
>> +        "--query-gpu=index,name,driver_version,"
>> +            "temperature.gpu,utilization.gpu,utilization.memory,"
>> +            "memory.total,memory.free,memory.used",
>> +        "--format=csv,noheader,nounits",
>> +        NULL
>> +    };
>> +    g_autofree gchar *stdout_buf = NULL;
>> +    g_autofree gchar *stderr_buf = NULL;
>> +    gint exit_status;
>> +    GError *gerr = NULL;
>> +    GuestNvidiaGpuList *head = NULL, **tail = &head;
>> +
>> +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
>> +                      G_SPAWN_SEARCH_PATH,
>> +                      NULL, NULL,
>> +                      &stdout_buf, &stderr_buf,
>> +                      &exit_status, &gerr)) {
>> +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
>> +        g_error_free(gerr);
>> +        return NULL;
>> +    }
>> +
>> +    if (exit_status != 0) {
>> +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
>> +                   exit_status, stderr_buf ? stderr_buf : "unknown
>> error");
>> +        return NULL;
>> +    }
>> +
>> +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
>> +    for (int i = 0; lines[i] != NULL; i++) {
>> +        gchar *line = g_strstrip(lines[i]);
>> +        if (*line == '\0') {
>> +            continue;
>> +        }
>> +
>> +        gchar **f = g_strsplit(line, ", ", 9);
>> +        if (g_strv_length(f) < 9) {
>> +            g_strfreev(f);
>> +            continue;
>> +        }
>> +
>> +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
>> +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
>> +        gpu->name               = g_strdup(g_strstrip(f[1]));
>> +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
>> +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
>> +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
>> +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
>> +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
>> +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
>> +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
>> +
>> +        QAPI_LIST_APPEND(tail, gpu);
>> +        g_strfreev(f);
>> +    }
>> +    g_strfreev(lines);
>> +
>> +    return head;
>> +}
>> diff --git a/qga/commands-win32.c b/qga/commands-win32.c
>> index c0bf3467bd..a78d5b71f5 100644
>> --- a/qga/commands-win32.c
>> +++ b/qga/commands-win32.c
>> @@ -2764,3 +2764,67 @@ GuestNetworkRouteList
>> *qmp_guest_network_get_route(Error **errp)
>>      g_hash_table_destroy(interface_metric_cache);
>>      return head;
>>  }
>> +
>> +GuestNvidiaGpuList *qmp_guest_get_nvidia_smi(Error **errp)
>> +{
>> +    const gchar *argv[] = {
>> +        "nvidia-smi",
>> +        "--query-gpu=index,name,driver_version,"
>> +            "temperature.gpu,utilization.gpu,utilization.memory,"
>> +            "memory.total,memory.free,memory.used",
>> +        "--format=csv,noheader,nounits",
>> +        NULL
>> +    };
>> +    g_autofree gchar *stdout_buf = NULL;
>> +    g_autofree gchar *stderr_buf = NULL;
>> +    gint exit_status;
>> +    GError *gerr = NULL;
>> +    GuestNvidiaGpuList *head = NULL, **tail = &head;
>> +
>> +    if (!g_spawn_sync(NULL, (gchar **)argv, NULL,
>> +                      G_SPAWN_SEARCH_PATH,
>> +                      NULL, NULL,
>> +                      &stdout_buf, &stderr_buf,
>> +                      &exit_status, &gerr)) {
>> +        error_setg(errp, "failed to run nvidia-smi: %s", gerr->message);
>> +        g_error_free(gerr);
>> +        return NULL;
>> +    }
>> +
>> +    if (exit_status != 0) {
>> +        error_setg(errp, "nvidia-smi failed (exit %d): %s",
>> +                   exit_status, stderr_buf ? stderr_buf : "unknown
>> error");
>> +        return NULL;
>> +    }
>> +
>> +    gchar **lines = g_strsplit(stdout_buf, "\n", -1);
>> +    for (int i = 0; lines[i] != NULL; i++) {
>> +        gchar *line = g_strstrip(lines[i]);
>> +        if (*line == '\0') {
>> +            continue;
>> +        }
>> +
>> +        gchar **f = g_strsplit(line, ", ", 9);
>> +        if (g_strv_length(f) < 9) {
>> +            g_strfreev(f);
>> +            continue;
>> +        }
>> +
>> +        GuestNvidiaGpu *gpu     = g_new0(GuestNvidiaGpu, 1);
>> +        gpu->index              = (int)g_ascii_strtoll(f[0], NULL, 10);
>> +        gpu->name               = g_strdup(g_strstrip(f[1]));
>> +        gpu->driver_version     = g_strdup(g_strstrip(f[2]));
>> +        gpu->temperature        = (int)g_ascii_strtoll(f[3], NULL, 10);
>> +        gpu->gpu_utilization    = (int)g_ascii_strtoll(f[4], NULL, 10);
>> +        gpu->memory_utilization = (int)g_ascii_strtoll(f[5], NULL, 10);
>> +        gpu->memory_total       = (int)g_ascii_strtoll(f[6], NULL, 10);
>> +        gpu->memory_free        = (int)g_ascii_strtoll(f[7], NULL, 10);
>> +        gpu->memory_used        = (int)g_ascii_strtoll(f[8], NULL, 10);
>> +
>> +        QAPI_LIST_APPEND(tail, gpu);
>> +        g_strfreev(f);
>> +    }
>> +    g_strfreev(lines);
>> +
>> +    return head;
>> +}
>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>> index c57bc9a02f..8abbf71131 100644
>> --- a/qga/qapi-schema.json
>> +++ b/qga/qapi-schema.json
>> @@ -1876,6 +1876,65 @@
>>    'if': { 'any': ['CONFIG_WIN32', 'CONFIG_GETLOADAVG'] }
>>  }
>>
>> +##
>> +# @GuestNvidiaGpu:
>> +#
>> +# Information about a single NVIDIA GPU as reported by nvidia-smi.
>> +#
>> +# @index: GPU index (0-based), stable across reboots for a given
>> +#         hardware slot
>> +#
>> +# @name: GPU product name (e.g. "NVIDIA A100-SXM4-80GB")
>> +#
>> +# @driver-version: version string of the installed NVIDIA driver
>> +#
>> +# @temperature: GPU die temperature in degrees Celsius
>> +#
>> +# @gpu-utilization: GPU compute engine utilization in percent (0-100)
>> +#
>> +# @memory-utilization: GPU memory controller utilization in percent
>> +#                      (0-100)
>> +#
>> +# @memory-total: total framebuffer memory in MiB
>> +#
>> +# @memory-free: free framebuffer memory in MiB
>> +#
>> +# @memory-used: used framebuffer memory in MiB
>> +#
>> +# Since: 10.1
>> +##
>> +{ 'struct': 'GuestNvidiaGpu',
>> +  'data': {
>> +      'index':              'int',
>> +      'name':               'str',
>> +      'driver-version':     'str',
>> +      'temperature':        'int',
>> +      'gpu-utilization':    'int',
>> +      'memory-utilization': 'int',
>> +      'memory-total':       'int',
>> +      'memory-free':        'int',
>> +      'memory-used':        'int'
>> +  }
>> +}
>> +
>> +##
>> +# @guest-get-nvidia-smi:
>> +#
>> +# Query NVIDIA GPU information via nvidia-smi inside the guest.
>> +#
>> +# Returns one @GuestNvidiaGpu entry per physical GPU (or MIG instance)
>> +# detected by the NVIDIA driver.
>> +#
>> +# Errors:
>> +#   - If nvidia-smi is not installed or not found in $PATH
>> +#   - If nvidia-smi exits with a non-zero status (e.g. no NVIDIA
>> +#     device)
>> +#
>> +# Since: 10.1
>> +##
>> +{ 'command': 'guest-get-nvidia-smi',
>> +  'returns': ['GuestNvidiaGpu'] }
>> +
>>  ##
>>  # @GuestNetworkRoute:
>>  #
>> --
>> 2.53.0
>>
>>

Re: [PATCH] qga: implement 'guest-get-nvidia-smi' command

Posted by Daniel P. Berrangé 10 hours ago

On Tue, Mar 31, 2026 at 02:07:27PM +0100, João Vilaça wrote:
> Monitoring, on the hypervisor side, of GPU workloads running in guests with
> NVIDIA GPU passthrough (VFIO) or vGPU (NVIDIA GRID).
> Operators who want GPU metrics from a fleet of VMs must either SSH into
> each guest or deploy a separate monitoring agent inside the guest itself
> (e.g. DCGM exporter).

IMHO, directing people to deploy an existing monitoring agent solution
in the guest is likely the right thing to do.

We don't really want the QEMU guest agent expanding arbitrarily such
that it effectively re-invents what's already offered by general
purpose monitoring agents.

If the problem is that those existing monitoring agents need network
support, then IMHO the answer is likely to enhance the agent to use
vsock, rather than add more monitoring commands to QEMU.

With regards,
Daniel
-- 
|: https://berrange.com       ~~        https://hachyderm.io/@berrange :|
|: https://libvirt.org          ~~          https://entangle-photo.org :|
|: https://pixelfed.art/berrange   ~~    https://fstop138.berrange.com :|