[PATCH v2 1/4] qemu.conf changes to support multiple memory backend

Michael Galaxy via Devel posted 4 patches 4 months, 2 weeks ago
There is a newer version of this series
[PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Michael Galaxy via Devel 4 months, 2 weeks ago
From: Michael Galaxy <mgalaxy@akamai.com>

We start by introducing a backwards-compatible, comma-separated
specification that will not break existing installations, such
as in the following example:

$ cat qemu.conf | grep memory_backing_dir
memory_backing_dir = ["/path/to/pmem/0", "/path/to/pmem/1"]

(The old syntax with a single string is also still supported)
memory_backing_dir = "/path/to/dir"

In our case, we almost always have two NUMA nodes, so in that
example, we have two PMEM regions which are created on the
Linux kernel command line that get mounted into those two
locations for libvirt to use.

Signed-off-by: Michael Galaxy <mgalaxy@akamai.com>
---
 src/qemu/qemu_command.c |   8 ++-
 src/qemu/qemu_conf.c    | 141 +++++++++++++++++++++++++++++++++++-----
 src/qemu/qemu_conf.h    |  14 ++--
 src/qemu/qemu_driver.c  |  29 +++++----
 src/qemu/qemu_hotplug.c |   6 +-
 src/qemu/qemu_process.c |  44 +++++++------
 src/qemu/qemu_process.h |   7 +-
 7 files changed, 188 insertions(+), 61 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 2d0eddc79e..9a25fdb0ed 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -3420,7 +3420,9 @@ qemuBuildMemoryBackendProps(virJSONValue **backendProps,
         } else {
             /* We can have both pagesize and mem source. If that's the case,
              * prefer hugepages as those are more specific. */
-            if (qemuGetMemoryBackingPath(priv->driver, def, mem->info.alias, &memPath) < 0)
+
+            virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv;
+            if (qemuGetMemoryBackingPath(def, privateData, mem->targetNode, mem->info.alias, &memPath) < 0)
                 return -1;
         }
 
@@ -7295,7 +7297,9 @@ qemuBuildMemPathStr(const virDomainDef *def,
             return -1;
         prealloc = true;
     } else if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
-        if (qemuGetMemoryBackingPath(priv->driver, def, "ram", &mem_path) < 0)
+        // This path should not be reached if NUMA is requested
+        virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv;
+        if (qemuGetMemoryBackingPath(def, privateData, 0, "ram", &mem_path) < 0)
             return -1;
     }
 
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c
index 4050a82341..b808e33b77 100644
--- a/src/qemu/qemu_conf.c
+++ b/src/qemu/qemu_conf.c
@@ -43,6 +43,7 @@
 #include "virfile.h"
 #include "virstring.h"
 #include "virutil.h"
+#include "virnuma.h"
 #include "configmake.h"
 #include "security/security_util.h"
 
@@ -137,6 +138,9 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
 
     cfg->cgroupControllers = -1; /* -1 == auto-detect */
 
+    cfg->memoryBackingDirs = g_new0(char *, 1);
+    cfg->nb_memoryBackingDirs = 1;
+
     if (root != NULL) {
         cfg->logDir = g_strdup_printf("%s/log/qemu", root);
         cfg->swtpmLogDir = g_strdup_printf("%s/log/swtpm", root);
@@ -153,7 +157,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
         cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir);
         cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir);
         cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir);
-        cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir);
+        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir);
     } else if (privileged) {
         cfg->logDir = g_strdup_printf("%s/log/libvirt/qemu", LOCALSTATEDIR);
 
@@ -174,7 +178,8 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
         cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir);
         cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir);
         cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir);
-        cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir);
+
+        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir);
         cfg->swtpmStorageDir = g_strdup_printf("%s/lib/libvirt/swtpm",
                                                LOCALSTATEDIR);
     } else {
@@ -201,7 +206,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
                                              cfg->configBaseDir);
         cfg->autoDumpPath = g_strdup_printf("%s/qemu/dump", cfg->configBaseDir);
         cfg->nvramDir = g_strdup_printf("%s/qemu/nvram", cfg->configBaseDir);
-        cfg->memoryBackingDir = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir);
+        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir);
         cfg->swtpmStorageDir = g_strdup_printf("%s/qemu/swtpm",
                                                cfg->configBaseDir);
     }
@@ -294,6 +299,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
 static void virQEMUDriverConfigDispose(void *obj)
 {
     virQEMUDriverConfig *cfg = obj;
+    size_t i;
 
     virBitmapFree(cfg->namespaces);
 
@@ -369,7 +375,12 @@ static void virQEMUDriverConfigDispose(void *obj)
 
     virFirmwareFreeList(cfg->firmwares, cfg->nfirmwares);
 
-    g_free(cfg->memoryBackingDir);
+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
+        g_free(cfg->memoryBackingDirs[i]);
+    }
+
+    g_free(cfg->memoryBackingDirs);
+
     g_free(cfg->swtpmStorageDir);
 
     g_strfreev(cfg->capabilityfilters);
@@ -1018,15 +1029,22 @@ static int
 virQEMUDriverConfigLoadMemoryEntry(virQEMUDriverConfig *cfg,
                                    virConf *conf)
 {
-    g_autofree char *dir = NULL;
+    char **memoryBackingDirs = NULL;
+    g_auto(GStrv) params = NULL;
     int rc;
 
-    if ((rc = virConfGetValueString(conf, "memory_backing_dir", &dir)) < 0)
+    if ((rc = virConfGetValueStringList(conf, "memory_backing_dir", true, &memoryBackingDirs) < 0))
         return -1;
 
-    if (rc > 0) {
-        VIR_FREE(cfg->memoryBackingDir);
-        cfg->memoryBackingDir = g_strdup_printf("%s/libvirt/qemu", dir);
+    if (rc == 0) {
+        size_t i;
+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++)
+            g_free(cfg->memoryBackingDirs[i]);
+
+        cfg->nb_memoryBackingDirs = g_strv_length(memoryBackingDirs);
+        cfg->memoryBackingDirs = g_new0(char *, cfg->nb_memoryBackingDirs);
+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++)
+            cfg->memoryBackingDirs[i] = g_strdup_printf("%s/libvirt/qemu", memoryBackingDirs[i]);
     }
 
     return 0;
@@ -1595,22 +1613,110 @@ qemuGetDomainHupageMemPath(virQEMUDriver *driver,
 
 
 int
-qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
-                               const virDomainDef *def,
+qemuGetMemoryBackingDomainPath(const virDomainDef *def,
+                               virDomainXMLPrivateDataCallbacks *priv,
+                               const size_t targetNode,
                                char **path)
 {
+    qemuDomainObjPrivate *privateData = (qemuDomainObjPrivate *) priv;
+    virQEMUDriver *driver = privateData->driver;
     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
     const char *root = driver->embeddedRoot;
     g_autofree char *shortName = NULL;
+    size_t path_index = 0; // original behavior, described below
 
     if (!(shortName = virDomainDefGetShortName(def)))
         return -1;
 
-    if (root && !STRPREFIX(cfg->memoryBackingDir, root)) {
+    /*
+     * We have three use cases:
+     *
+     * 1. Domain has multiple NUMA nodes, but they have only specified
+     *    a single directory path in qemu.conf. (Original default behavior).
+     *
+     *    In this case, we already placed the memory backing path for each NUMA node
+     *    into the same path location. Preserve the established default behavior.
+     *
+     * 2. Domain has multiple NUMA nodes, but we have asked for multiple directory
+     *    paths as well.
+     *
+     *    In this case, we will have a one-to-one relationship between the number
+     *    of NUMA nodes and the order in which the paths are provided.
+     *    If the user does not specify enough paths, then we need to throw an error.
+     *    NOTE: This is open to comment. The "ordering" of the paths here is not intially
+     *    configurable to preserve backwards compatibility with the original qemu.conf syntax.
+     *    If controlling the ordering is desired, we would need to revise the syntax in
+     *    qemu.conf to make that possible. That hasn't been needed so far.
+     *
+     *    NOTE A): We must check with numatune here, if requested. The number of NUMA nodes
+     *             may be less than or equal to the number of provided paths. If it is less,
+     *             we have to respect the choices made by numatune. In this case, we will map the
+     *             physical NUMA nodes (0, 1, 2...) in the order in which they appear in qemu.conf
+     *
+     * 3. Domain has a single NUMA node, but we have asked for multiple directory paths.
+     *
+     *       In this case we also need to check if numatune is requested. If so,
+     *       we want to pick the path indicated by numatune.
+     *
+     * NOTE B): In both cases 2 and 3, if numatune is requested, the path obviously cannot
+     *         be changed on the fly, like it normally would be in "restrictive" mode
+     *         during runtime. So, we will only do this is the mode requested is "strict".
+     *
+     * NOTE C): Furthermore, in both cases 2 and 3, if the number of directory paths provided
+     *          is more than one, and one of either: a) no numatune is provided at all or
+     *          b) numatune is in fact provided, but the mode is not strict,
+     *          then we must thrown error. This is because we cannot know which backing
+     *          directory path to choose without the user's input.
+     *
+     * NOTE D): If one or more directory paths is requested in any of the cases 1, 2, or 3,
+     *          the numatune cannot specifiy more than one NUMA node, because the only mode
+     *          possible with directory paths is "strict" (e.g. automatic numa balancing of
+     *          memory will not work). Only one numa node can be requested by numatune, else
+     *          we must throw an error.
+     */
+
+    if (cfg->nb_memoryBackingDirs > 1) {
+        virDomainNuma *numatune = def->numa;
+        virBitmap *numaBitmap = virDomainNumatuneGetNodeset(numatune, privateData->autoNodeset, targetNode);
+        size_t numa_node_count = virDomainNumaGetNodeCount(def->numa);
+        virDomainNumatuneMemMode mode;
+
+        if ((numatune && numaBitmap && virNumaNodesetIsAvailable(numaBitmap)) &&
+            virDomainNumatuneGetMode(def->numa, -1, &mode) == 0 &&
+            mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+            virBitmapCountBits(numaBitmap) == 1) {
+
+            // Is numatune provided?
+            // Is it strict?
+            // Does it only specify a single pinning for this target?
+            // Yes to all 3? then good to go.
+
+            if (cfg->nb_memoryBackingDirs < numa_node_count) {
+                virReportError(VIR_ERR_INTERNAL_ERROR,
+                               _("Domain requesting configuration for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) directory paths available. Either reduce this to one directory or provide more paths to use."),
+                               numa_node_count,
+                               cfg->nb_memoryBackingDirs);
+                return -1;
+            }
+
+            path_index = virBitmapNextSetBit(numaBitmap, -1);
+        } else if (numa_node_count > 1 && numa_node_count == cfg->nb_memoryBackingDirs) {
+            // Be nice. A valid numatune and pinning has not been specified, but the number
+            // of paths matches up exactly, so just assign them one-to-one.
+            path_index = targetNode;
+        } else {
+            virReportError(VIR_ERR_INTERNAL_ERROR,
+                           _("There are (%1$lu) memory directory directories configured. Domain must use a 'strict' numatune as well as an associated pinning configuration for each NUMA node before proceeding. An individual NUMA node can only be pinned to a single backing directory. Please correct the domain configuration or remove the memory backing directories and try again."),
+                cfg->nb_memoryBackingDirs);
+                return -1;
+        }
+    }
+
+    if (root && !STRPREFIX(cfg->memoryBackingDirs[path_index], root)) {
         g_autofree char * hash = virDomainDriverGenerateRootHash("qemu", root);
-        *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDir, hash, shortName);
+        *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDirs[path_index], hash, shortName);
     } else {
-        *path = g_strdup_printf("%s/%s", cfg->memoryBackingDir, shortName);
+        *path = g_strdup_printf("%s/%s", cfg->memoryBackingDirs[path_index], shortName);
     }
 
     return 0;
@@ -1630,8 +1736,9 @@ qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
  *          -1 otherwise (with error reported).
  */
 int
-qemuGetMemoryBackingPath(virQEMUDriver *driver,
-                         const virDomainDef *def,
+qemuGetMemoryBackingPath(const virDomainDef *def,
+                         virDomainXMLPrivateDataCallbacks *priv,
+                         const size_t targetNode,
                          const char *alias,
                          char **memPath)
 {
@@ -1644,7 +1751,7 @@ qemuGetMemoryBackingPath(virQEMUDriver *driver,
         return -1;
     }
 
-    if (qemuGetMemoryBackingDomainPath(driver, def, &domainPath) < 0)
+    if (qemuGetMemoryBackingDomainPath(def, priv, targetNode, &domainPath) < 0)
         return -1;
 
     *memPath = g_strdup_printf("%s/%s", domainPath, alias);
diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h
index 36049b4bfa..dabf4d19a4 100644
--- a/src/qemu/qemu_conf.h
+++ b/src/qemu/qemu_conf.h
@@ -221,7 +221,8 @@ struct _virQEMUDriverConfig {
     unsigned int glusterDebugLevel;
     bool virtiofsdDebug;
 
-    char *memoryBackingDir;
+    char **memoryBackingDirs;
+    size_t nb_memoryBackingDirs;
 
     uid_t swtpm_user;
     gid_t swtpm_group;
@@ -369,11 +370,14 @@ int qemuGetDomainHupageMemPath(virQEMUDriver *driver,
                                unsigned long long pagesize,
                                char **memPath);
 
-int qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
-                                   const virDomainDef *def,
+int qemuGetMemoryBackingDomainPath(const virDomainDef *def,
+                                   virDomainXMLPrivateDataCallbacks *priv,
+                                   const size_t targetNode,
                                    char **path);
-int qemuGetMemoryBackingPath(virQEMUDriver *driver,
-                             const virDomainDef *def,
+
+int qemuGetMemoryBackingPath(const virDomainDef *def,
+                             virDomainXMLPrivateDataCallbacks *priv,
+                             const size_t targetNode,
                              const char *alias,
                              char **memPath);
 
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index e2698c7924..51ca7dca78 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -653,11 +653,14 @@ qemuStateInitialize(bool privileged,
                              cfg->nvramDir);
         goto error;
     }
-    if (g_mkdir_with_parents(cfg->memoryBackingDir, 0777) < 0) {
-        virReportSystemError(errno, _("Failed to create memory backing dir %1$s"),
-                             cfg->memoryBackingDir);
-        goto error;
+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
+        if (g_mkdir_with_parents(cfg->memoryBackingDirs[i], 0777) < 0) {
+            virReportSystemError(errno, _("Failed to create memory backing dir # %1$lu @ %2$s, total: %3$lu"),
+            i, cfg->memoryBackingDirs[i], cfg->nb_memoryBackingDirs);
+            goto error;
+        }
     }
+
     if (g_mkdir_with_parents(cfg->slirpStateDir, 0777) < 0) {
         virReportSystemError(errno, _("Failed to create slirp state dir %1$s"),
                              cfg->slirpStateDir);
@@ -802,12 +805,14 @@ qemuStateInitialize(bool privileged,
                                  (int)cfg->group);
             goto error;
         }
-        if (chown(cfg->memoryBackingDir, cfg->user, cfg->group) < 0) {
-            virReportSystemError(errno,
+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
+            if (chown(cfg->memoryBackingDirs[i], cfg->user, cfg->group) < 0) {
+                virReportSystemError(errno,
                                  _("unable to set ownership of '%1$s' to %2$d:%3$d"),
-                                 cfg->memoryBackingDir, (int)cfg->user,
+                                 cfg->memoryBackingDirs[i], (int)cfg->user,
                                  (int)cfg->group);
-            goto error;
+                goto error;
+            }
         }
         if (chown(cfg->slirpStateDir, cfg->user, cfg->group) < 0) {
             virReportSystemError(errno,
@@ -855,10 +860,12 @@ qemuStateInitialize(bool privileged,
             goto error;
     }
 
-    if (privileged &&
-        virFileUpdatePerm(cfg->memoryBackingDir,
+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
+        if (privileged &&
+            virFileUpdatePerm(cfg->memoryBackingDirs[i],
                           0, S_IXGRP | S_IXOTH) < 0)
-        goto error;
+            goto error;
+    }
 
     /* Get all the running persistent or transient configs first */
     if (virDomainObjListLoadAllConfigs(qemu_driver->domains,
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index 4a3f4f657e..59db29aada 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -2276,7 +2276,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver,
                                     priv, vm->def, mem, true, false, NULL) < 0)
         goto cleanup;
 
-    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, mem, true) < 0)
+    if (qemuProcessBuildDestroyMemoryPaths(vm, mem, true) < 0)
         goto cleanup;
 
     if (qemuDomainNamespaceSetupMemory(vm, mem, &teardowndevice) < 0)
@@ -2351,7 +2351,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver,
     qemuDomainObjExitMonitor(vm);
 
     if (objAdded && mem)
-        ignore_value(qemuProcessDestroyMemoryBackingPath(driver, vm, mem));
+        ignore_value(qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem));
 
     virErrorRestore(&orig_err);
     if (!mem)
@@ -4634,7 +4634,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriver *driver,
     if (qemuDomainNamespaceTeardownMemory(vm, mem) <  0)
         VIR_WARN("Unable to remove memory device from /dev");
 
-    if (qemuProcessDestroyMemoryBackingPath(driver, vm, mem) < 0)
+    if (qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem) < 0)
         VIR_WARN("Unable to destroy memory backing path");
 
     qemuDomainReleaseMemoryDeviceSlot(vm, mem);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 7ef7040a85..a9af8fe353 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -4046,12 +4046,12 @@ qemuProcessBuildDestroyMemoryPathsImpl(virQEMUDriver *driver,
 
 
 int
-qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
-                                   virDomainObj *vm,
+qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
                                    virDomainMemoryDef *mem,
                                    bool build)
 {
-
+    qemuDomainObjPrivate *priv = vm->privateData;
+    virQEMUDriver *driver = priv->driver;
     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
     size_t i;
     bool shouldBuildHP = false;
@@ -4081,13 +4081,14 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
     }
 
     if (!build || shouldBuildMB) {
-        g_autofree char *path = NULL;
-        if (qemuGetMemoryBackingDomainPath(driver, vm->def, &path) < 0)
-            return -1;
+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
+            g_autofree char *path = NULL;
+            if (qemuGetMemoryBackingDomainPath(vm->def, vm->privateData, i, &path) < 0)
+                return -1;
 
-        if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm,
-                                                   path, build) < 0)
-            return -1;
+            if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, path, build) < 0)
+                return -1;
+        }
     }
 
     return 0;
@@ -4095,19 +4096,24 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
 
 
 int
-qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
-                                    virDomainObj *vm,
+qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
+                                    qemuDomainObjPrivate *priv,
                                     virDomainMemoryDef *mem)
 {
+    virQEMUDriver *driver = priv->driver;
     g_autofree char *path = NULL;
+    g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
+    size_t i;
 
-    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias, &path) < 0)
-        return -1;
+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
+        virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv;
+        if (qemuGetMemoryBackingPath(def, privateData, i, mem->info.alias, &path) < 0)
+            return -1;
 
-    if (unlink(path) < 0 &&
-        errno != ENOENT) {
-        virReportSystemError(errno, _("Unable to remove %1$s"), path);
-        return -1;
+        if (unlink(path) < 0 && errno != ENOENT) {
+            virReportSystemError(errno, _("Unable to remove %1$s"), path);
+            return -1;
+        }
     }
 
     return 0;
@@ -7268,7 +7274,7 @@ qemuProcessPrepareHost(virQEMUDriver *driver,
     if (qemuProcessPrepareHostBackendChardev(vm) < 0)
         return -1;
 
-    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, true) < 0)
+    if (qemuProcessBuildDestroyMemoryPaths(vm, NULL, true) < 0)
         return -1;
 
     /* Ensure no historical cgroup for this VM is lying around bogus
@@ -8482,7 +8488,7 @@ void qemuProcessStop(virQEMUDriver *driver,
         goto endjob;
     }
 
-    qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false);
+    qemuProcessBuildDestroyMemoryPaths(vm, NULL, false);
 
     if (!!g_atomic_int_dec_and_test(&driver->nactive) && driver->inhibitCallback)
         driver->inhibitCallback(false, driver->inhibitOpaque);
diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h
index c1ea949215..eebd0a4d1b 100644
--- a/src/qemu/qemu_process.h
+++ b/src/qemu/qemu_process.h
@@ -38,13 +38,12 @@ int qemuProcessStopCPUs(virQEMUDriver *driver,
                         virDomainPausedReason reason,
                         virDomainAsyncJob asyncJob);
 
-int qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
-                                       virDomainObj *vm,
+int qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
                                        virDomainMemoryDef *mem,
                                        bool build);
 
-int qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
-                                        virDomainObj *vm,
+int qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
+                                        qemuDomainObjPrivate *priv,
                                         virDomainMemoryDef *mem);
 
 void qemuProcessReconnectAll(virQEMUDriver *driver);
-- 
2.34.1
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Martin Kletzander 2 months, 1 week ago
On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>From: Michael Galaxy <mgalaxy@akamai.com>
>

Hi,

sorry for such a late reply and thank you for the patches.  There are
few things that need tweaking and the series does not apply cleanly,
but fortunately there is only one conflict.  Once needed adjustments are
incorporated I see no reason why this should not be supported.  What I
found is provided inline.

>We start by introducing a backwards-compatible, comma-separated
>specification that will not break existing installations, such
>as in the following example:
>
>$ cat qemu.conf | grep memory_backing_dir
>memory_backing_dir = ["/path/to/pmem/0", "/path/to/pmem/1"]
>
>(The old syntax with a single string is also still supported)
>memory_backing_dir = "/path/to/dir"
>
>In our case, we almost always have two NUMA nodes, so in that
>example, we have two PMEM regions which are created on the
>Linux kernel command line that get mounted into those two
>locations for libvirt to use.
>
>Signed-off-by: Michael Galaxy <mgalaxy@akamai.com>
>---
> src/qemu/qemu_command.c |   8 ++-
> src/qemu/qemu_conf.c    | 141 +++++++++++++++++++++++++++++++++++-----
> src/qemu/qemu_conf.h    |  14 ++--
> src/qemu/qemu_driver.c  |  29 +++++----
> src/qemu/qemu_hotplug.c |   6 +-
> src/qemu/qemu_process.c |  44 +++++++------
> src/qemu/qemu_process.h |   7 +-
> 7 files changed, 188 insertions(+), 61 deletions(-)
>
>diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>index 2d0eddc79e..9a25fdb0ed 100644
>--- a/src/qemu/qemu_command.c
>+++ b/src/qemu/qemu_command.c
>@@ -3420,7 +3420,9 @@ qemuBuildMemoryBackendProps(virJSONValue **backendProps,
>         } else {
>             /* We can have both pagesize and mem source. If that's the case,
>              * prefer hugepages as those are more specific. */
>-            if (qemuGetMemoryBackingPath(priv->driver, def, mem->info.alias, &memPath) < 0)
>+
>+            virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv;

I guess this is some leftover from previous versions.  The pointer is
not to virDomainXMLPrivateDataCallbacks and should not be cast to it.
Especially when you cast it back in the function you pass it to.

>+            if (qemuGetMemoryBackingPath(def, privateData, mem->targetNode, mem->info.alias, &memPath) < 0)
>                 return -1;
>         }
>
>@@ -7295,7 +7297,9 @@ qemuBuildMemPathStr(const virDomainDef *def,
>             return -1;
>         prealloc = true;
>     } else if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
>-        if (qemuGetMemoryBackingPath(priv->driver, def, "ram", &mem_path) < 0)
>+        // This path should not be reached if NUMA is requested
>+        virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv;

Same here.

>+        if (qemuGetMemoryBackingPath(def, privateData, 0, "ram", &mem_path) < 0)
>             return -1;
>     }
>
>diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c
>index 4050a82341..b808e33b77 100644
>--- a/src/qemu/qemu_conf.c
>+++ b/src/qemu/qemu_conf.c
>@@ -43,6 +43,7 @@
> #include "virfile.h"
> #include "virstring.h"
> #include "virutil.h"
>+#include "virnuma.h"
> #include "configmake.h"
> #include "security/security_util.h"
>
>@@ -137,6 +138,9 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
>
>     cfg->cgroupControllers = -1; /* -1 == auto-detect */
>
>+    cfg->memoryBackingDirs = g_new0(char *, 1);
>+    cfg->nb_memoryBackingDirs = 1;
>+
>     if (root != NULL) {
>         cfg->logDir = g_strdup_printf("%s/log/qemu", root);
>         cfg->swtpmLogDir = g_strdup_printf("%s/log/swtpm", root);
>@@ -153,7 +157,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
>         cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir);
>         cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir);
>         cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir);
>-        cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir);
>+        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir);
>     } else if (privileged) {
>         cfg->logDir = g_strdup_printf("%s/log/libvirt/qemu", LOCALSTATEDIR);
>
>@@ -174,7 +178,8 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
>         cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir);
>         cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir);
>         cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir);
>-        cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir);
>+
>+        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir);
>         cfg->swtpmStorageDir = g_strdup_printf("%s/lib/libvirt/swtpm",
>                                                LOCALSTATEDIR);
>     } else {
>@@ -201,7 +206,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
>                                              cfg->configBaseDir);
>         cfg->autoDumpPath = g_strdup_printf("%s/qemu/dump", cfg->configBaseDir);
>         cfg->nvramDir = g_strdup_printf("%s/qemu/nvram", cfg->configBaseDir);
>-        cfg->memoryBackingDir = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir);
>+        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir);
>         cfg->swtpmStorageDir = g_strdup_printf("%s/qemu/swtpm",
>                                                cfg->configBaseDir);
>     }
>@@ -294,6 +299,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged,
> static void virQEMUDriverConfigDispose(void *obj)
> {
>     virQEMUDriverConfig *cfg = obj;
>+    size_t i;
>
>     virBitmapFree(cfg->namespaces);
>
>@@ -369,7 +375,12 @@ static void virQEMUDriverConfigDispose(void *obj)
>
>     virFirmwareFreeList(cfg->firmwares, cfg->nfirmwares);
>
>-    g_free(cfg->memoryBackingDir);
>+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>+        g_free(cfg->memoryBackingDirs[i]);
>+    }
>+
>+    g_free(cfg->memoryBackingDirs);
>+
>     g_free(cfg->swtpmStorageDir);
>
>     g_strfreev(cfg->capabilityfilters);
>@@ -1018,15 +1029,22 @@ static int
> virQEMUDriverConfigLoadMemoryEntry(virQEMUDriverConfig *cfg,
>                                    virConf *conf)
> {
>-    g_autofree char *dir = NULL;
>+    char **memoryBackingDirs = NULL;
>+    g_auto(GStrv) params = NULL;

Unused variable.

>     int rc;
>
>-    if ((rc = virConfGetValueString(conf, "memory_backing_dir", &dir)) < 0)
>+    if ((rc = virConfGetValueStringList(conf, "memory_backing_dir", true, &memoryBackingDirs) < 0))
>         return -1;
>
>-    if (rc > 0) {
>-        VIR_FREE(cfg->memoryBackingDir);
>-        cfg->memoryBackingDir = g_strdup_printf("%s/libvirt/qemu", dir);
>+    if (rc == 0) {
>+        size_t i;
>+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++)
>+            g_free(cfg->memoryBackingDirs[i]);
>+
>+        cfg->nb_memoryBackingDirs = g_strv_length(memoryBackingDirs);
>+        cfg->memoryBackingDirs = g_new0(char *, cfg->nb_memoryBackingDirs);
>+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++)
>+            cfg->memoryBackingDirs[i] = g_strdup_printf("%s/libvirt/qemu", memoryBackingDirs[i]);
>     }
>
>     return 0;
>@@ -1595,22 +1613,110 @@ qemuGetDomainHupageMemPath(virQEMUDriver *driver,
>
>
> int
>-qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
>-                               const virDomainDef *def,
>+qemuGetMemoryBackingDomainPath(const virDomainDef *def,
>+                               virDomainXMLPrivateDataCallbacks *priv,

Change this to qemuDomainObjPrivate and ...

>+                               const size_t targetNode,
>                                char **path)
> {
>+    qemuDomainObjPrivate *privateData = (qemuDomainObjPrivate *) priv;

... remove this cast.

>+    virQEMUDriver *driver = privateData->driver;
>     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
>     const char *root = driver->embeddedRoot;
>     g_autofree char *shortName = NULL;
>+    size_t path_index = 0; // original behavior, described below
>
>     if (!(shortName = virDomainDefGetShortName(def)))
>         return -1;
>
>-    if (root && !STRPREFIX(cfg->memoryBackingDir, root)) {
>+    /*
>+     * We have three use cases:
>+     *
>+     * 1. Domain has multiple NUMA nodes, but they have only specified
>+     *    a single directory path in qemu.conf. (Original default behavior).
>+     *
>+     *    In this case, we already placed the memory backing path for each NUMA node
>+     *    into the same path location. Preserve the established default behavior.
>+     *
>+     * 2. Domain has multiple NUMA nodes, but we have asked for multiple directory
>+     *    paths as well.
>+     *
>+     *    In this case, we will have a one-to-one relationship between the number
>+     *    of NUMA nodes and the order in which the paths are provided.
>+     *    If the user does not specify enough paths, then we need to throw an error.
>+     *    NOTE: This is open to comment. The "ordering" of the paths here is not intially
>+     *    configurable to preserve backwards compatibility with the original qemu.conf syntax.
>+     *    If controlling the ordering is desired, we would need to revise the syntax in
>+     *    qemu.conf to make that possible. That hasn't been needed so far.
>+     *
>+     *    NOTE A): We must check with numatune here, if requested. The number of NUMA nodes
>+     *             may be less than or equal to the number of provided paths. If it is less,
>+     *             we have to respect the choices made by numatune. In this case, we will map the
>+     *             physical NUMA nodes (0, 1, 2...) in the order in which they appear in qemu.conf
>+     *
>+     * 3. Domain has a single NUMA node, but we have asked for multiple directory paths.
>+     *
>+     *       In this case we also need to check if numatune is requested. If so,
>+     *       we want to pick the path indicated by numatune.
>+     *
>+     * NOTE B): In both cases 2 and 3, if numatune is requested, the path obviously cannot
>+     *         be changed on the fly, like it normally would be in "restrictive" mode
>+     *         during runtime. So, we will only do this is the mode requested is "strict".
>+     *
>+     * NOTE C): Furthermore, in both cases 2 and 3, if the number of directory paths provided
>+     *          is more than one, and one of either: a) no numatune is provided at all or
>+     *          b) numatune is in fact provided, but the mode is not strict,
>+     *          then we must thrown error. This is because we cannot know which backing

"we must throw an error"

>+     *          directory path to choose without the user's input.
>+     *
>+     * NOTE D): If one or more directory paths is requested in any of the cases 1, 2, or 3,
>+     *          the numatune cannot specifiy more than one NUMA node, because the only mode
>+     *          possible with directory paths is "strict" (e.g. automatic numa balancing of
>+     *          memory will not work). Only one numa node can be requested by numatune, else
>+     *          we must throw an error.
>+     */
>+
>+    if (cfg->nb_memoryBackingDirs > 1) {
>+        virDomainNuma *numatune = def->numa;

Just use def->numa, the temporary variable name is not used always below
and makes the code harder to read.

>+        virBitmap *numaBitmap = virDomainNumatuneGetNodeset(numatune, privateData->autoNodeset, targetNode);
>+        size_t numa_node_count = virDomainNumaGetNodeCount(def->numa);
>+        virDomainNumatuneMemMode mode;
>+
>+        if ((numatune && numaBitmap && virNumaNodesetIsAvailable(numaBitmap)) &&

The parentheses are not needed here.

>+            virDomainNumatuneGetMode(def->numa, -1, &mode) == 0 &&
>+            mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
>+            virBitmapCountBits(numaBitmap) == 1) {
>+
>+            // Is numatune provided?
>+            // Is it strict?
>+            // Does it only specify a single pinning for this target?
>+            // Yes to all 3? then good to go.
>+
>+            if (cfg->nb_memoryBackingDirs < numa_node_count) {
>+                virReportError(VIR_ERR_INTERNAL_ERROR,

I would not say this is an internal error, but maybe VIR_ERR_CONFIG_UNSUPPORTED.

>+                               _("Domain requesting configuration for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) directory paths available. Either reduce this to one directory or provide more paths to use."),
>+                               numa_node_count,
>+                               cfg->nb_memoryBackingDirs);
>+                return -1;
>+            }
>+
>+            path_index = virBitmapNextSetBit(numaBitmap, -1);

What if the single pinning is to the third host node, but there are only
two memoryBackingDirs supplied in the config?

>+        } else if (numa_node_count > 1 && numa_node_count == cfg->nb_memoryBackingDirs) {
>+            // Be nice. A valid numatune and pinning has not been specified, but the number
>+            // of paths matches up exactly, so just assign them one-to-one.
>+            path_index = targetNode;
>+        } else {
>+            virReportError(VIR_ERR_INTERNAL_ERROR,

Again, not an internal error.

>+                           _("There are (%1$lu) memory directory directories configured. Domain must use a 'strict' numatune as well as an associated pinning configuration for each NUMA node before proceeding. An individual NUMA node can only be pinned to a single backing directory. Please correct the domain configuration or remove the memory backing directories and try again."),
>+                cfg->nb_memoryBackingDirs);
>+                return -1;
>+        }
>+    }
>+
>+    if (root && !STRPREFIX(cfg->memoryBackingDirs[path_index], root)) {
>         g_autofree char * hash = virDomainDriverGenerateRootHash("qemu", root);
>-        *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDir, hash, shortName);
>+        *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDirs[path_index], hash, shortName);
>     } else {
>-        *path = g_strdup_printf("%s/%s", cfg->memoryBackingDir, shortName);
>+        *path = g_strdup_printf("%s/%s", cfg->memoryBackingDirs[path_index], shortName);
>     }
>
>     return 0;
>@@ -1630,8 +1736,9 @@ qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
>  *          -1 otherwise (with error reported).
>  */
> int
>-qemuGetMemoryBackingPath(virQEMUDriver *driver,
>-                         const virDomainDef *def,
>+qemuGetMemoryBackingPath(const virDomainDef *def,
>+                         virDomainXMLPrivateDataCallbacks *priv,

Change this to qemuDomainObjPrivate *.

>+                         const size_t targetNode,
>                          const char *alias,
>                          char **memPath)
> {
>@@ -1644,7 +1751,7 @@ qemuGetMemoryBackingPath(virQEMUDriver *driver,
>         return -1;
>     }
>
>-    if (qemuGetMemoryBackingDomainPath(driver, def, &domainPath) < 0)
>+    if (qemuGetMemoryBackingDomainPath(def, priv, targetNode, &domainPath) < 0)
>         return -1;
>
>     *memPath = g_strdup_printf("%s/%s", domainPath, alias);
>diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h
>index 36049b4bfa..dabf4d19a4 100644
>--- a/src/qemu/qemu_conf.h
>+++ b/src/qemu/qemu_conf.h
>@@ -221,7 +221,8 @@ struct _virQEMUDriverConfig {
>     unsigned int glusterDebugLevel;
>     bool virtiofsdDebug;
>
>-    char *memoryBackingDir;
>+    char **memoryBackingDirs;
>+    size_t nb_memoryBackingDirs;

We usually just prefix with "n" for the counters, even though it is not
always more readable, but to stay consistent please use "nmemoryBackingDirs".

>
>     uid_t swtpm_user;
>     gid_t swtpm_group;
>@@ -369,11 +370,14 @@ int qemuGetDomainHupageMemPath(virQEMUDriver *driver,
>                                unsigned long long pagesize,
>                                char **memPath);
>
>-int qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
>-                                   const virDomainDef *def,
>+int qemuGetMemoryBackingDomainPath(const virDomainDef *def,
>+                                   virDomainXMLPrivateDataCallbacks *priv,
>+                                   const size_t targetNode,
>                                    char **path);
>-int qemuGetMemoryBackingPath(virQEMUDriver *driver,
>-                             const virDomainDef *def,
>+
>+int qemuGetMemoryBackingPath(const virDomainDef *def,
>+                             virDomainXMLPrivateDataCallbacks *priv,
>+                             const size_t targetNode,
>                              const char *alias,
>                              char **memPath);
>
>diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
>index e2698c7924..51ca7dca78 100644
>--- a/src/qemu/qemu_driver.c
>+++ b/src/qemu/qemu_driver.c
>@@ -653,11 +653,14 @@ qemuStateInitialize(bool privileged,
>                              cfg->nvramDir);
>         goto error;
>     }
>-    if (g_mkdir_with_parents(cfg->memoryBackingDir, 0777) < 0) {
>-        virReportSystemError(errno, _("Failed to create memory backing dir %1$s"),
>-                             cfg->memoryBackingDir);
>-        goto error;
>+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>+        if (g_mkdir_with_parents(cfg->memoryBackingDirs[i], 0777) < 0) {
>+            virReportSystemError(errno, _("Failed to create memory backing dir # %1$lu @ %2$s, total: %3$lu"),
>+            i, cfg->memoryBackingDirs[i], cfg->nb_memoryBackingDirs);

This line should be indented with the first parameter, if it gets too
long feel free to split each parameter to its own line.

>+            goto error;
>+        }
>     }
>+
>     if (g_mkdir_with_parents(cfg->slirpStateDir, 0777) < 0) {
>         virReportSystemError(errno, _("Failed to create slirp state dir %1$s"),
>                              cfg->slirpStateDir);
>@@ -802,12 +805,14 @@ qemuStateInitialize(bool privileged,
>                                  (int)cfg->group);
>             goto error;
>         }
>-        if (chown(cfg->memoryBackingDir, cfg->user, cfg->group) < 0) {
>-            virReportSystemError(errno,
>+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>+            if (chown(cfg->memoryBackingDirs[i], cfg->user, cfg->group) < 0) {
>+                virReportSystemError(errno,
>                                  _("unable to set ownership of '%1$s' to %2$d:%3$d"),
>-                                 cfg->memoryBackingDir, (int)cfg->user,
>+                                 cfg->memoryBackingDirs[i], (int)cfg->user,
>                                  (int)cfg->group);

The indentation needs fixing here as well.

>-            goto error;
>+                goto error;
>+            }
>         }
>         if (chown(cfg->slirpStateDir, cfg->user, cfg->group) < 0) {
>             virReportSystemError(errno,
>@@ -855,10 +860,12 @@ qemuStateInitialize(bool privileged,
>             goto error;
>     }
>
>-    if (privileged &&
>-        virFileUpdatePerm(cfg->memoryBackingDir,
>+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>+        if (privileged &&
>+            virFileUpdatePerm(cfg->memoryBackingDirs[i],
>                           0, S_IXGRP | S_IXOTH) < 0)

Indentation.

>-        goto error;
>+            goto error;
>+    }
>
>     /* Get all the running persistent or transient configs first */
>     if (virDomainObjListLoadAllConfigs(qemu_driver->domains,
>diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
>index 4a3f4f657e..59db29aada 100644
>--- a/src/qemu/qemu_hotplug.c
>+++ b/src/qemu/qemu_hotplug.c
>@@ -2276,7 +2276,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver,
>                                     priv, vm->def, mem, true, false, NULL) < 0)
>         goto cleanup;
>
>-    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, mem, true) < 0)
>+    if (qemuProcessBuildDestroyMemoryPaths(vm, mem, true) < 0)
>         goto cleanup;
>
>     if (qemuDomainNamespaceSetupMemory(vm, mem, &teardowndevice) < 0)
>@@ -2351,7 +2351,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver,
>     qemuDomainObjExitMonitor(vm);
>
>     if (objAdded && mem)
>-        ignore_value(qemuProcessDestroyMemoryBackingPath(driver, vm, mem));
>+        ignore_value(qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem));
>
>     virErrorRestore(&orig_err);
>     if (!mem)
>@@ -4634,7 +4634,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriver *driver,
>     if (qemuDomainNamespaceTeardownMemory(vm, mem) <  0)
>         VIR_WARN("Unable to remove memory device from /dev");
>
>-    if (qemuProcessDestroyMemoryBackingPath(driver, vm, mem) < 0)
>+    if (qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem) < 0)
>         VIR_WARN("Unable to destroy memory backing path");
>
>     qemuDomainReleaseMemoryDeviceSlot(vm, mem);
>diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>index 7ef7040a85..a9af8fe353 100644
>--- a/src/qemu/qemu_process.c
>+++ b/src/qemu/qemu_process.c
>@@ -4046,12 +4046,12 @@ qemuProcessBuildDestroyMemoryPathsImpl(virQEMUDriver *driver,
>
>
> int
>-qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>-                                   virDomainObj *vm,
>+qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
>                                    virDomainMemoryDef *mem,
>                                    bool build)
> {
>-
>+    qemuDomainObjPrivate *priv = vm->privateData;
>+    virQEMUDriver *driver = priv->driver;

It's nice that you are removing the parameter `driver` which is not
needed since it can be extracted from another parameter `vm`, but this
really should be its own separate patch doing just that one thing.

The following patch, which changes functionality, will look much nicer
after that.  Not only for review, but also for the future if someone
needs to figure out what changed.

>     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
>     size_t i;
>     bool shouldBuildHP = false;
>@@ -4081,13 +4081,14 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>     }
>
>     if (!build || shouldBuildMB) {
>-        g_autofree char *path = NULL;
>-        if (qemuGetMemoryBackingDomainPath(driver, vm->def, &path) < 0)
>-            return -1;
>+        for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>+            g_autofree char *path = NULL;
>+            if (qemuGetMemoryBackingDomainPath(vm->def, vm->privateData, i, &path) < 0)
>+                return -1;
>
>-        if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm,
>-                                                   path, build) < 0)
>-            return -1;
>+            if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, path, build) < 0)
>+                return -1;
>+        }
>     }
>
>     return 0;
>@@ -4095,19 +4096,24 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>
>
> int
>-qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
>-                                    virDomainObj *vm,
>+qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
>+                                    qemuDomainObjPrivate *priv,
>                                     virDomainMemoryDef *mem)

This function needs access to the driver, the domain definition and its
private data.  All those can be extracted from the domain object `vm`,
but more importantly all of them are accessible in all the callers
modified above.  Why make it more complicated and change the parameters
while changing the logic?  Either a) keep it the same or b) supply
everything so that this function is easier to read or c) supply the
minimum (just `vm`) so that this function is easier to call from various
places.  In this case probably not the last option (c) since it is
called from just two functions that have access to the same variables.

> {
>+    virQEMUDriver *driver = priv->driver;
>     g_autofree char *path = NULL;
>+    g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
>+    size_t i;
>
>-    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias, &path) < 0)
>-        return -1;
>+    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>+        virDomainXMLPrivateDataCallbacks *privateData = (virDomainXMLPrivateDataCallbacks *) priv;
>+        if (qemuGetMemoryBackingPath(def, privateData, i, mem->info.alias, &path) < 0)
>+            return -1;
>

I wonder why are the paths not saved in the status XML of the running
domain.  Since they can be changed during the VM's lifetime (while
restarting libvirt) it could cause issues during clean-up.  Sure,
unlink() will set errno to ENOENT, but the proper paths will not be
cleaned up.  It'd also make this part of the code easier and safer to
use from the callers.  But that's pre-existing.

>-    if (unlink(path) < 0 &&
>-        errno != ENOENT) {
>-        virReportSystemError(errno, _("Unable to remove %1$s"), path);
>-        return -1;
>+        if (unlink(path) < 0 && errno != ENOENT) {
>+            virReportSystemError(errno, _("Unable to remove %1$s"), path);
>+            return -1;
>+        }
>     }
>
>     return 0;
>@@ -7268,7 +7274,7 @@ qemuProcessPrepareHost(virQEMUDriver *driver,
>     if (qemuProcessPrepareHostBackendChardev(vm) < 0)
>         return -1;
>
>-    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, true) < 0)
>+    if (qemuProcessBuildDestroyMemoryPaths(vm, NULL, true) < 0)
>         return -1;
>
>     /* Ensure no historical cgroup for this VM is lying around bogus
>@@ -8482,7 +8488,7 @@ void qemuProcessStop(virQEMUDriver *driver,
>         goto endjob;
>     }
>
>-    qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false);
>+    qemuProcessBuildDestroyMemoryPaths(vm, NULL, false);
>
>     if (!!g_atomic_int_dec_and_test(&driver->nactive) && driver->inhibitCallback)
>         driver->inhibitCallback(false, driver->inhibitOpaque);
>diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h
>index c1ea949215..eebd0a4d1b 100644
>--- a/src/qemu/qemu_process.h
>+++ b/src/qemu/qemu_process.h
>@@ -38,13 +38,12 @@ int qemuProcessStopCPUs(virQEMUDriver *driver,
>                         virDomainPausedReason reason,
>                         virDomainAsyncJob asyncJob);
>
>-int qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>-                                       virDomainObj *vm,
>+int qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
>                                        virDomainMemoryDef *mem,
>                                        bool build);
>
>-int qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
>-                                        virDomainObj *vm,
>+int qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
>+                                        qemuDomainObjPrivate *priv,
>                                         virDomainMemoryDef *mem);
>
> void qemuProcessReconnectAll(virQEMUDriver *driver);
>-- 
>2.34.1
>

This patch changes the configuration, so it should also adjust the
documentation and since configuration is used everywhere it should also
add a few unit tests with the changed configuration and few edge cases,
just so we can be sure it really works the way it is supposed to.
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Michael Galaxy via Devel 2 months ago
Ok, I tested one of your concerns below, and I caught that already.

Just following up......

On 8/6/24 07:35, Martin Kletzander wrote:
> On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>> From: Michael Galaxy <mgalaxy@akamai.com>
>> +                               _("Domain requesting configuration 
>> for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) 
>> directory paths available. Either reduce this to one directory or 
>> provide more paths to use."),
>> +                               numa_node_count,
>> +                               cfg->nb_memoryBackingDirs);
>> +                return -1;
>> +            }
>> +
>> +            path_index = virBitmapNextSetBit(numaBitmap, -1);
>
> What if the single pinning is to the third host node, but there are only
> two memoryBackingDirs supplied in the config?
>
I tested this. This case was already covered by this conditional check
that I included in the v2 patch:

+            if (cfg->nb_memoryBackingDirs < numa_node_count) {
+                virReportError(VIR_ERR_INTERNAL_ERROR,
+                               _("Domain requesting configuration for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) directory paths available. Either reduce this to one directory or provide more paths to use."),
+                               numa_node_count,
+                               cfg->nb_memoryBackingDirs);
+                return -1;
+            }

- Michael
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Michael Galaxy via Devel 2 months, 1 week ago
By the way, it took me a few hours to notice that you reviewed this. 
Thanks again.

Please disregard my RESEND. Sorry for the noise!

- Michael

On 8/6/24 07:35, Martin Kletzander wrote:
> On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>> From: Michael Galaxy <mgalaxy@akamai.com>
>>
>
> Hi,
>
> sorry for such a late reply and thank you for the patches.  There are
> few things that need tweaking and the series does not apply cleanly,
> but fortunately there is only one conflict.  Once needed adjustments are
> incorporated I see no reason why this should not be supported. What I
> found is provided inline.
>
>> We start by introducing a backwards-compatible, comma-separated
>> specification that will not break existing installations, such
>> as in the following example:
>>
>> $ cat qemu.conf | grep memory_backing_dir
>> memory_backing_dir = ["/path/to/pmem/0", "/path/to/pmem/1"]
>>
>> (The old syntax with a single string is also still supported)
>> memory_backing_dir = "/path/to/dir"
>>
>> In our case, we almost always have two NUMA nodes, so in that
>> example, we have two PMEM regions which are created on the
>> Linux kernel command line that get mounted into those two
>> locations for libvirt to use.
>>
>> Signed-off-by: Michael Galaxy <mgalaxy@akamai.com>
>> ---
>> src/qemu/qemu_command.c |   8 ++-
>> src/qemu/qemu_conf.c    | 141 +++++++++++++++++++++++++++++++++++-----
>> src/qemu/qemu_conf.h    |  14 ++--
>> src/qemu/qemu_driver.c  |  29 +++++----
>> src/qemu/qemu_hotplug.c |   6 +-
>> src/qemu/qemu_process.c |  44 +++++++------
>> src/qemu/qemu_process.h |   7 +-
>> 7 files changed, 188 insertions(+), 61 deletions(-)
>>
>> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
>> index 2d0eddc79e..9a25fdb0ed 100644
>> --- a/src/qemu/qemu_command.c
>> +++ b/src/qemu/qemu_command.c
>> @@ -3420,7 +3420,9 @@ qemuBuildMemoryBackendProps(virJSONValue 
>> **backendProps,
>>         } else {
>>             /* We can have both pagesize and mem source. If that's 
>> the case,
>>              * prefer hugepages as those are more specific. */
>> -            if (qemuGetMemoryBackingPath(priv->driver, def, 
>> mem->info.alias, &memPath) < 0)
>> +
>> +            virDomainXMLPrivateDataCallbacks *privateData = 
>> (virDomainXMLPrivateDataCallbacks *) priv;
>
> I guess this is some leftover from previous versions.  The pointer is
> not to virDomainXMLPrivateDataCallbacks and should not be cast to it.
> Especially when you cast it back in the function you pass it to.
>
>> +            if (qemuGetMemoryBackingPath(def, privateData, 
>> mem->targetNode, mem->info.alias, &memPath) < 0)
>>                 return -1;
>>         }
>>
>> @@ -7295,7 +7297,9 @@ qemuBuildMemPathStr(const virDomainDef *def,
>>             return -1;
>>         prealloc = true;
>>     } else if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) {
>> -        if (qemuGetMemoryBackingPath(priv->driver, def, "ram", 
>> &mem_path) < 0)
>> +        // This path should not be reached if NUMA is requested
>> +        virDomainXMLPrivateDataCallbacks *privateData = 
>> (virDomainXMLPrivateDataCallbacks *) priv;
>
> Same here.
>
>> +        if (qemuGetMemoryBackingPath(def, privateData, 0, "ram", 
>> &mem_path) < 0)
>>             return -1;
>>     }
>>
>> diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c
>> index 4050a82341..b808e33b77 100644
>> --- a/src/qemu/qemu_conf.c
>> +++ b/src/qemu/qemu_conf.c
>> @@ -43,6 +43,7 @@
>> #include "virfile.h"
>> #include "virstring.h"
>> #include "virutil.h"
>> +#include "virnuma.h"
>> #include "configmake.h"
>> #include "security/security_util.h"
>>
>> @@ -137,6 +138,9 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool 
>> privileged,
>>
>>     cfg->cgroupControllers = -1; /* -1 == auto-detect */
>>
>> +    cfg->memoryBackingDirs = g_new0(char *, 1);
>> +    cfg->nb_memoryBackingDirs = 1;
>> +
>>     if (root != NULL) {
>>         cfg->logDir = g_strdup_printf("%s/log/qemu", root);
>>         cfg->swtpmLogDir = g_strdup_printf("%s/log/swtpm", root);
>> @@ -153,7 +157,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool 
>> privileged,
>>         cfg->checkpointDir = g_strdup_printf("%s/checkpoint", 
>> cfg->libDir);
>>         cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir);
>>         cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir);
>> -        cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir);
>> +        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", 
>> cfg->libDir);
>>     } else if (privileged) {
>>         cfg->logDir = g_strdup_printf("%s/log/libvirt/qemu", 
>> LOCALSTATEDIR);
>>
>> @@ -174,7 +178,8 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool 
>> privileged,
>>         cfg->checkpointDir = g_strdup_printf("%s/checkpoint", 
>> cfg->libDir);
>>         cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir);
>>         cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir);
>> -        cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir);
>> +
>> +        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", 
>> cfg->libDir);
>>         cfg->swtpmStorageDir = g_strdup_printf("%s/lib/libvirt/swtpm",
>>                                                LOCALSTATEDIR);
>>     } else {
>> @@ -201,7 +206,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool 
>> privileged,
>> cfg->configBaseDir);
>>         cfg->autoDumpPath = g_strdup_printf("%s/qemu/dump", 
>> cfg->configBaseDir);
>>         cfg->nvramDir = g_strdup_printf("%s/qemu/nvram", 
>> cfg->configBaseDir);
>> -        cfg->memoryBackingDir = g_strdup_printf("%s/qemu/ram", 
>> cfg->configBaseDir);
>> +        cfg->memoryBackingDirs[0] = g_strdup_printf("%s/qemu/ram", 
>> cfg->configBaseDir);
>>         cfg->swtpmStorageDir = g_strdup_printf("%s/qemu/swtpm",
>> cfg->configBaseDir);
>>     }
>> @@ -294,6 +299,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool 
>> privileged,
>> static void virQEMUDriverConfigDispose(void *obj)
>> {
>>     virQEMUDriverConfig *cfg = obj;
>> +    size_t i;
>>
>>     virBitmapFree(cfg->namespaces);
>>
>> @@ -369,7 +375,12 @@ static void virQEMUDriverConfigDispose(void *obj)
>>
>>     virFirmwareFreeList(cfg->firmwares, cfg->nfirmwares);
>>
>> -    g_free(cfg->memoryBackingDir);
>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +        g_free(cfg->memoryBackingDirs[i]);
>> +    }
>> +
>> +    g_free(cfg->memoryBackingDirs);
>> +
>>     g_free(cfg->swtpmStorageDir);
>>
>>     g_strfreev(cfg->capabilityfilters);
>> @@ -1018,15 +1029,22 @@ static int
>> virQEMUDriverConfigLoadMemoryEntry(virQEMUDriverConfig *cfg,
>>                                    virConf *conf)
>> {
>> -    g_autofree char *dir = NULL;
>> +    char **memoryBackingDirs = NULL;
>> +    g_auto(GStrv) params = NULL;
>
> Unused variable.
>
>>     int rc;
>>
>> -    if ((rc = virConfGetValueString(conf, "memory_backing_dir", 
>> &dir)) < 0)
>> +    if ((rc = virConfGetValueStringList(conf, "memory_backing_dir", 
>> true, &memoryBackingDirs) < 0))
>>         return -1;
>>
>> -    if (rc > 0) {
>> -        VIR_FREE(cfg->memoryBackingDir);
>> -        cfg->memoryBackingDir = g_strdup_printf("%s/libvirt/qemu", 
>> dir);
>> +    if (rc == 0) {
>> +        size_t i;
>> +        for (i = 0; i < cfg->nb_memoryBackingDirs; i++)
>> +            g_free(cfg->memoryBackingDirs[i]);
>> +
>> +        cfg->nb_memoryBackingDirs = g_strv_length(memoryBackingDirs);
>> +        cfg->memoryBackingDirs = g_new0(char *, 
>> cfg->nb_memoryBackingDirs);
>> +        for (i = 0; i < cfg->nb_memoryBackingDirs; i++)
>> +            cfg->memoryBackingDirs[i] = 
>> g_strdup_printf("%s/libvirt/qemu", memoryBackingDirs[i]);
>>     }
>>
>>     return 0;
>> @@ -1595,22 +1613,110 @@ qemuGetDomainHupageMemPath(virQEMUDriver 
>> *driver,
>>
>>
>> int
>> -qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
>> -                               const virDomainDef *def,
>> +qemuGetMemoryBackingDomainPath(const virDomainDef *def,
>> +                               virDomainXMLPrivateDataCallbacks *priv,
>
> Change this to qemuDomainObjPrivate and ...
>
>> +                               const size_t targetNode,
>>                                char **path)
>> {
>> +    qemuDomainObjPrivate *privateData = (qemuDomainObjPrivate *) priv;
>
> ... remove this cast.
>
>> +    virQEMUDriver *driver = privateData->driver;
>>     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
>>     const char *root = driver->embeddedRoot;
>>     g_autofree char *shortName = NULL;
>> +    size_t path_index = 0; // original behavior, described below
>>
>>     if (!(shortName = virDomainDefGetShortName(def)))
>>         return -1;
>>
>> -    if (root && !STRPREFIX(cfg->memoryBackingDir, root)) {
>> +    /*
>> +     * We have three use cases:
>> +     *
>> +     * 1. Domain has multiple NUMA nodes, but they have only specified
>> +     *    a single directory path in qemu.conf. (Original default 
>> behavior).
>> +     *
>> +     *    In this case, we already placed the memory backing path 
>> for each NUMA node
>> +     *    into the same path location. Preserve the established 
>> default behavior.
>> +     *
>> +     * 2. Domain has multiple NUMA nodes, but we have asked for 
>> multiple directory
>> +     *    paths as well.
>> +     *
>> +     *    In this case, we will have a one-to-one relationship 
>> between the number
>> +     *    of NUMA nodes and the order in which the paths are provided.
>> +     *    If the user does not specify enough paths, then we need to 
>> throw an error.
>> +     *    NOTE: This is open to comment. The "ordering" of the paths 
>> here is not intially
>> +     *    configurable to preserve backwards compatibility with the 
>> original qemu.conf syntax.
>> +     *    If controlling the ordering is desired, we would need to 
>> revise the syntax in
>> +     *    qemu.conf to make that possible. That hasn't been needed 
>> so far.
>> +     *
>> +     *    NOTE A): We must check with numatune here, if requested. 
>> The number of NUMA nodes
>> +     *             may be less than or equal to the number of 
>> provided paths. If it is less,
>> +     *             we have to respect the choices made by numatune. 
>> In this case, we will map the
>> +     *             physical NUMA nodes (0, 1, 2...) in the order in 
>> which they appear in qemu.conf
>> +     *
>> +     * 3. Domain has a single NUMA node, but we have asked for 
>> multiple directory paths.
>> +     *
>> +     *       In this case we also need to check if numatune is 
>> requested. If so,
>> +     *       we want to pick the path indicated by numatune.
>> +     *
>> +     * NOTE B): In both cases 2 and 3, if numatune is requested, the 
>> path obviously cannot
>> +     *         be changed on the fly, like it normally would be in 
>> "restrictive" mode
>> +     *         during runtime. So, we will only do this is the mode 
>> requested is "strict".
>> +     *
>> +     * NOTE C): Furthermore, in both cases 2 and 3, if the number of 
>> directory paths provided
>> +     *          is more than one, and one of either: a) no numatune 
>> is provided at all or
>> +     *          b) numatune is in fact provided, but the mode is not 
>> strict,
>> +     *          then we must thrown error. This is because we cannot 
>> know which backing
>
> "we must throw an error"
>
>> +     *          directory path to choose without the user's input.
>> +     *
>> +     * NOTE D): If one or more directory paths is requested in any 
>> of the cases 1, 2, or 3,
>> +     *          the numatune cannot specifiy more than one NUMA 
>> node, because the only mode
>> +     *          possible with directory paths is "strict" (e.g. 
>> automatic numa balancing of
>> +     *          memory will not work). Only one numa node can be 
>> requested by numatune, else
>> +     *          we must throw an error.
>> +     */
>> +
>> +    if (cfg->nb_memoryBackingDirs > 1) {
>> +        virDomainNuma *numatune = def->numa;
>
> Just use def->numa, the temporary variable name is not used always below
> and makes the code harder to read.
>
>> +        virBitmap *numaBitmap = 
>> virDomainNumatuneGetNodeset(numatune, privateData->autoNodeset, 
>> targetNode);
>> +        size_t numa_node_count = virDomainNumaGetNodeCount(def->numa);
>> +        virDomainNumatuneMemMode mode;
>> +
>> +        if ((numatune && numaBitmap && 
>> virNumaNodesetIsAvailable(numaBitmap)) &&
>
> The parentheses are not needed here.
>
>> + virDomainNumatuneGetMode(def->numa, -1, &mode) == 0 &&
>> +            mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
>> +            virBitmapCountBits(numaBitmap) == 1) {
>> +
>> +            // Is numatune provided?
>> +            // Is it strict?
>> +            // Does it only specify a single pinning for this target?
>> +            // Yes to all 3? then good to go.
>> +
>> +            if (cfg->nb_memoryBackingDirs < numa_node_count) {
>> +                virReportError(VIR_ERR_INTERNAL_ERROR,
>
> I would not say this is an internal error, but maybe 
> VIR_ERR_CONFIG_UNSUPPORTED.
>
>> +                               _("Domain requesting configuration 
>> for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) 
>> directory paths available. Either reduce this to one directory or 
>> provide more paths to use."),
>> +                               numa_node_count,
>> +                               cfg->nb_memoryBackingDirs);
>> +                return -1;
>> +            }
>> +
>> +            path_index = virBitmapNextSetBit(numaBitmap, -1);
>
> What if the single pinning is to the third host node, but there are only
> two memoryBackingDirs supplied in the config?
>
>> +        } else if (numa_node_count > 1 && numa_node_count == 
>> cfg->nb_memoryBackingDirs) {
>> +            // Be nice. A valid numatune and pinning has not been 
>> specified, but the number
>> +            // of paths matches up exactly, so just assign them 
>> one-to-one.
>> +            path_index = targetNode;
>> +        } else {
>> +            virReportError(VIR_ERR_INTERNAL_ERROR,
>
> Again, not an internal error.
>
>> +                           _("There are (%1$lu) memory directory 
>> directories configured. Domain must use a 'strict' numatune as well 
>> as an associated pinning configuration for each NUMA node before 
>> proceeding. An individual NUMA node can only be pinned to a single 
>> backing directory. Please correct the domain configuration or remove 
>> the memory backing directories and try again."),
>> +                cfg->nb_memoryBackingDirs);
>> +                return -1;
>> +        }
>> +    }
>> +
>> +    if (root && !STRPREFIX(cfg->memoryBackingDirs[path_index], root)) {
>>         g_autofree char * hash = 
>> virDomainDriverGenerateRootHash("qemu", root);
>> -        *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDir, 
>> hash, shortName);
>> +        *path = g_strdup_printf("%s/%s-%s", 
>> cfg->memoryBackingDirs[path_index], hash, shortName);
>>     } else {
>> -        *path = g_strdup_printf("%s/%s", cfg->memoryBackingDir, 
>> shortName);
>> +        *path = g_strdup_printf("%s/%s", 
>> cfg->memoryBackingDirs[path_index], shortName);
>>     }
>>
>>     return 0;
>> @@ -1630,8 +1736,9 @@ qemuGetMemoryBackingDomainPath(virQEMUDriver 
>> *driver,
>>  *          -1 otherwise (with error reported).
>>  */
>> int
>> -qemuGetMemoryBackingPath(virQEMUDriver *driver,
>> -                         const virDomainDef *def,
>> +qemuGetMemoryBackingPath(const virDomainDef *def,
>> +                         virDomainXMLPrivateDataCallbacks *priv,
>
> Change this to qemuDomainObjPrivate *.
>
>> +                         const size_t targetNode,
>>                          const char *alias,
>>                          char **memPath)
>> {
>> @@ -1644,7 +1751,7 @@ qemuGetMemoryBackingPath(virQEMUDriver *driver,
>>         return -1;
>>     }
>>
>> -    if (qemuGetMemoryBackingDomainPath(driver, def, &domainPath) < 0)
>> +    if (qemuGetMemoryBackingDomainPath(def, priv, targetNode, 
>> &domainPath) < 0)
>>         return -1;
>>
>>     *memPath = g_strdup_printf("%s/%s", domainPath, alias);
>> diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h
>> index 36049b4bfa..dabf4d19a4 100644
>> --- a/src/qemu/qemu_conf.h
>> +++ b/src/qemu/qemu_conf.h
>> @@ -221,7 +221,8 @@ struct _virQEMUDriverConfig {
>>     unsigned int glusterDebugLevel;
>>     bool virtiofsdDebug;
>>
>> -    char *memoryBackingDir;
>> +    char **memoryBackingDirs;
>> +    size_t nb_memoryBackingDirs;
>
> We usually just prefix with "n" for the counters, even though it is not
> always more readable, but to stay consistent please use 
> "nmemoryBackingDirs".
>
>>
>>     uid_t swtpm_user;
>>     gid_t swtpm_group;
>> @@ -369,11 +370,14 @@ int qemuGetDomainHupageMemPath(virQEMUDriver 
>> *driver,
>>                                unsigned long long pagesize,
>>                                char **memPath);
>>
>> -int qemuGetMemoryBackingDomainPath(virQEMUDriver *driver,
>> -                                   const virDomainDef *def,
>> +int qemuGetMemoryBackingDomainPath(const virDomainDef *def,
>> + virDomainXMLPrivateDataCallbacks *priv,
>> +                                   const size_t targetNode,
>>                                    char **path);
>> -int qemuGetMemoryBackingPath(virQEMUDriver *driver,
>> -                             const virDomainDef *def,
>> +
>> +int qemuGetMemoryBackingPath(const virDomainDef *def,
>> +                             virDomainXMLPrivateDataCallbacks *priv,
>> +                             const size_t targetNode,
>>                              const char *alias,
>>                              char **memPath);
>>
>> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
>> index e2698c7924..51ca7dca78 100644
>> --- a/src/qemu/qemu_driver.c
>> +++ b/src/qemu/qemu_driver.c
>> @@ -653,11 +653,14 @@ qemuStateInitialize(bool privileged,
>>                              cfg->nvramDir);
>>         goto error;
>>     }
>> -    if (g_mkdir_with_parents(cfg->memoryBackingDir, 0777) < 0) {
>> -        virReportSystemError(errno, _("Failed to create memory 
>> backing dir %1$s"),
>> -                             cfg->memoryBackingDir);
>> -        goto error;
>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +        if (g_mkdir_with_parents(cfg->memoryBackingDirs[i], 0777) < 
>> 0) {
>> +            virReportSystemError(errno, _("Failed to create memory 
>> backing dir # %1$lu @ %2$s, total: %3$lu"),
>> +            i, cfg->memoryBackingDirs[i], cfg->nb_memoryBackingDirs);
>
> This line should be indented with the first parameter, if it gets too
> long feel free to split each parameter to its own line.
>
>> +            goto error;
>> +        }
>>     }
>> +
>>     if (g_mkdir_with_parents(cfg->slirpStateDir, 0777) < 0) {
>>         virReportSystemError(errno, _("Failed to create slirp state 
>> dir %1$s"),
>>                              cfg->slirpStateDir);
>> @@ -802,12 +805,14 @@ qemuStateInitialize(bool privileged,
>>                                  (int)cfg->group);
>>             goto error;
>>         }
>> -        if (chown(cfg->memoryBackingDir, cfg->user, cfg->group) < 0) {
>> -            virReportSystemError(errno,
>> +        for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +            if (chown(cfg->memoryBackingDirs[i], cfg->user, 
>> cfg->group) < 0) {
>> +                virReportSystemError(errno,
>>                                  _("unable to set ownership of '%1$s' 
>> to %2$d:%3$d"),
>> -                                 cfg->memoryBackingDir, (int)cfg->user,
>> +                                 cfg->memoryBackingDirs[i], 
>> (int)cfg->user,
>>                                  (int)cfg->group);
>
> The indentation needs fixing here as well.
>
>> -            goto error;
>> +                goto error;
>> +            }
>>         }
>>         if (chown(cfg->slirpStateDir, cfg->user, cfg->group) < 0) {
>>             virReportSystemError(errno,
>> @@ -855,10 +860,12 @@ qemuStateInitialize(bool privileged,
>>             goto error;
>>     }
>>
>> -    if (privileged &&
>> -        virFileUpdatePerm(cfg->memoryBackingDir,
>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +        if (privileged &&
>> +            virFileUpdatePerm(cfg->memoryBackingDirs[i],
>>                           0, S_IXGRP | S_IXOTH) < 0)
>
> Indentation.
>
>> -        goto error;
>> +            goto error;
>> +    }
>>
>>     /* Get all the running persistent or transient configs first */
>>     if (virDomainObjListLoadAllConfigs(qemu_driver->domains,
>> diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
>> index 4a3f4f657e..59db29aada 100644
>> --- a/src/qemu/qemu_hotplug.c
>> +++ b/src/qemu/qemu_hotplug.c
>> @@ -2276,7 +2276,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver,
>>                                     priv, vm->def, mem, true, false, 
>> NULL) < 0)
>>         goto cleanup;
>>
>> -    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, mem, true) < 0)
>> +    if (qemuProcessBuildDestroyMemoryPaths(vm, mem, true) < 0)
>>         goto cleanup;
>>
>>     if (qemuDomainNamespaceSetupMemory(vm, mem, &teardowndevice) < 0)
>> @@ -2351,7 +2351,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver,
>>     qemuDomainObjExitMonitor(vm);
>>
>>     if (objAdded && mem)
>> - ignore_value(qemuProcessDestroyMemoryBackingPath(driver, vm, mem));
>> + ignore_value(qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem));
>>
>>     virErrorRestore(&orig_err);
>>     if (!mem)
>> @@ -4634,7 +4634,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriver 
>> *driver,
>>     if (qemuDomainNamespaceTeardownMemory(vm, mem) <  0)
>>         VIR_WARN("Unable to remove memory device from /dev");
>>
>> -    if (qemuProcessDestroyMemoryBackingPath(driver, vm, mem) < 0)
>> +    if (qemuProcessDestroyMemoryBackingPath(vm->def, priv, mem) < 0)
>>         VIR_WARN("Unable to destroy memory backing path");
>>
>>     qemuDomainReleaseMemoryDeviceSlot(vm, mem);
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 7ef7040a85..a9af8fe353 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -4046,12 +4046,12 @@ 
>> qemuProcessBuildDestroyMemoryPathsImpl(virQEMUDriver *driver,
>>
>>
>> int
>> -qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>> -                                   virDomainObj *vm,
>> +qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
>>                                    virDomainMemoryDef *mem,
>>                                    bool build)
>> {
>> -
>> +    qemuDomainObjPrivate *priv = vm->privateData;
>> +    virQEMUDriver *driver = priv->driver;
>
> It's nice that you are removing the parameter `driver` which is not
> needed since it can be extracted from another parameter `vm`, but this
> really should be its own separate patch doing just that one thing.
>
> The following patch, which changes functionality, will look much nicer
> after that.  Not only for review, but also for the future if someone
> needs to figure out what changed.
>
>>     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
>>     size_t i;
>>     bool shouldBuildHP = false;
>> @@ -4081,13 +4081,14 @@ 
>> qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>>     }
>>
>>     if (!build || shouldBuildMB) {
>> -        g_autofree char *path = NULL;
>> -        if (qemuGetMemoryBackingDomainPath(driver, vm->def, &path) < 0)
>> -            return -1;
>> +        for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +            g_autofree char *path = NULL;
>> +            if (qemuGetMemoryBackingDomainPath(vm->def, 
>> vm->privateData, i, &path) < 0)
>> +                return -1;
>>
>> -        if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm,
>> -                                                   path, build) < 0)
>> -            return -1;
>> +            if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, 
>> path, build) < 0)
>> +                return -1;
>> +        }
>>     }
>>
>>     return 0;
>> @@ -4095,19 +4096,24 @@ 
>> qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>>
>>
>> int
>> -qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
>> -                                    virDomainObj *vm,
>> +qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
>> +                                    qemuDomainObjPrivate *priv,
>>                                     virDomainMemoryDef *mem)
>
> This function needs access to the driver, the domain definition and its
> private data.  All those can be extracted from the domain object `vm`,
> but more importantly all of them are accessible in all the callers
> modified above.  Why make it more complicated and change the parameters
> while changing the logic?  Either a) keep it the same or b) supply
> everything so that this function is easier to read or c) supply the
> minimum (just `vm`) so that this function is easier to call from various
> places.  In this case probably not the last option (c) since it is
> called from just two functions that have access to the same variables.
>
>> {
>> +    virQEMUDriver *driver = priv->driver;
>>     g_autofree char *path = NULL;
>> +    g_autoptr(virQEMUDriverConfig) cfg = 
>> virQEMUDriverGetConfig(driver);
>> +    size_t i;
>>
>> -    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias, 
>> &path) < 0)
>> -        return -1;
>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +        virDomainXMLPrivateDataCallbacks *privateData = 
>> (virDomainXMLPrivateDataCallbacks *) priv;
>> +        if (qemuGetMemoryBackingPath(def, privateData, i, 
>> mem->info.alias, &path) < 0)
>> +            return -1;
>>
>
> I wonder why are the paths not saved in the status XML of the running
> domain.  Since they can be changed during the VM's lifetime (while
> restarting libvirt) it could cause issues during clean-up.  Sure,
> unlink() will set errno to ENOENT, but the proper paths will not be
> cleaned up.  It'd also make this part of the code easier and safer to
> use from the callers.  But that's pre-existing.
>
>> -    if (unlink(path) < 0 &&
>> -        errno != ENOENT) {
>> -        virReportSystemError(errno, _("Unable to remove %1$s"), path);
>> -        return -1;
>> +        if (unlink(path) < 0 && errno != ENOENT) {
>> +            virReportSystemError(errno, _("Unable to remove %1$s"), 
>> path);
>> +            return -1;
>> +        }
>>     }
>>
>>     return 0;
>> @@ -7268,7 +7274,7 @@ qemuProcessPrepareHost(virQEMUDriver *driver,
>>     if (qemuProcessPrepareHostBackendChardev(vm) < 0)
>>         return -1;
>>
>> -    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, true) < 0)
>> +    if (qemuProcessBuildDestroyMemoryPaths(vm, NULL, true) < 0)
>>         return -1;
>>
>>     /* Ensure no historical cgroup for this VM is lying around bogus
>> @@ -8482,7 +8488,7 @@ void qemuProcessStop(virQEMUDriver *driver,
>>         goto endjob;
>>     }
>>
>> -    qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false);
>> +    qemuProcessBuildDestroyMemoryPaths(vm, NULL, false);
>>
>>     if (!!g_atomic_int_dec_and_test(&driver->nactive) && 
>> driver->inhibitCallback)
>>         driver->inhibitCallback(false, driver->inhibitOpaque);
>> diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h
>> index c1ea949215..eebd0a4d1b 100644
>> --- a/src/qemu/qemu_process.h
>> +++ b/src/qemu/qemu_process.h
>> @@ -38,13 +38,12 @@ int qemuProcessStopCPUs(virQEMUDriver *driver,
>>                         virDomainPausedReason reason,
>>                         virDomainAsyncJob asyncJob);
>>
>> -int qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>> -                                       virDomainObj *vm,
>> +int qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
>>                                        virDomainMemoryDef *mem,
>>                                        bool build);
>>
>> -int qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
>> -                                        virDomainObj *vm,
>> +int qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
>> +                                        qemuDomainObjPrivate *priv,
>>                                         virDomainMemoryDef *mem);
>>
>> void qemuProcessReconnectAll(virQEMUDriver *driver);
>> -- 
>> 2.34.1
>>
>
> This patch changes the configuration, so it should also adjust the
> documentation and since configuration is used everywhere it should also
> add a few unit tests with the changed configuration and few edge cases,
> just so we can be sure it really works the way it is supposed to.
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Michael Galaxy via Devel 2 months, 1 week ago
Thank you so much for the detailed comments. I'll work on those.

I believe you had a couple questions below on this one......inlined .....

On 8/6/24 07:35, Martin Kletzander wrote:
> On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>> From: Michael Galaxy <mgalaxy@akamai.com>
>>
>> +                               _("Domain requesting configuration 
>> for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) 
>> directory paths available. Either reduce this to one directory or 
>> provide more paths to use."),
>> +                               numa_node_count,
>> +                               cfg->nb_memoryBackingDirs);
>> +                return -1;
>> +            }
>> +
>> +            path_index = virBitmapNextSetBit(numaBitmap, -1);
>
> What if the single pinning is to the third host node, but there are only
> two memoryBackingDirs supplied in the config?

That case is totally supported (and expected). For example a *non* vNUMA 
guests (a single virtual NUMA node)
running on a standard 2-socket (2-NUMA-node) host in a typical pizza box.

If I understood that correctly, that is supported without a problem. The 
error above is describing the case that
a virtual machine has *more* virtual NUMA nodes configured than the host 
has available to offer. In such a case,
either the user needs to provide more host nodes or turn this feature 
off or use a single NUMA node (in which
case all the guest nodes would be placed in a single host node).

>> {
>> +    virQEMUDriver *driver = priv->driver;
>>     g_autofree char *path = NULL;
>> +    g_autoptr(virQEMUDriverConfig) cfg = 
>> virQEMUDriverGetConfig(driver);
>> +    size_t i;
>>
>> -    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias, 
>> &path) < 0)
>> -        return -1;
>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>> +        virDomainXMLPrivateDataCallbacks *privateData = 
>> (virDomainXMLPrivateDataCallbacks *) priv;
>> +        if (qemuGetMemoryBackingPath(def, privateData, i, 
>> mem->info.alias, &path) < 0)
>> +            return -1;
>>
>
> I wonder why are the paths not saved in the status XML of the running
> domain.  Since they can be changed during the VM's lifetime (while
> restarting libvirt) it could cause issues during clean-up.  Sure,
> unlink() will set errno to ENOENT, but the proper paths will not be
> cleaned up.  It'd also make this part of the code easier and safer to
> use from the callers.  But that's pre-existing.

I'm not sure I'm following the question on this one. I do not think we 
want the paths to be changing
during the VM's lifetime. Are you asking if we *want*  to support that? 
The current
libvirt XML schema does not support this. The actual path is kept hidden 
from the guest VM
and is only known internally to libvirt.

Can you clarify?


>
>> -    if (unlink(path) < 0 &&
>> -        errno != ENOENT) {
>> -        virReportSystemError(errno, _("Unable to remove %1$s"), path);
>> -        return -1;
>> +        if (unlink(path) < 0 && errno != ENOENT) {
>> +            virReportSystemError(errno, _("Unable to remove %1$s"), 
>> path);
>> +            return -1;
>> +        }
>>     }
>>
>>     return 0;
>> @@ -7268,7 +7274,7 @@ qemuProcessPrepareHost(virQEMUDriver *driver,
>>     if (qemuProcessPrepareHostBackendChardev(vm) < 0)
>>         return -1;
>>
>> -    if (qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, true) < 0)
>> +    if (qemuProcessBuildDestroyMemoryPaths(vm, NULL, true) < 0)
>>         return -1;
>>
>>     /* Ensure no historical cgroup for this VM is lying around bogus
>> @@ -8482,7 +8488,7 @@ void qemuProcessStop(virQEMUDriver *driver,
>>         goto endjob;
>>     }
>>
>> -    qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false);
>> +    qemuProcessBuildDestroyMemoryPaths(vm, NULL, false);
>>
>>     if (!!g_atomic_int_dec_and_test(&driver->nactive) && 
>> driver->inhibitCallback)
>>         driver->inhibitCallback(false, driver->inhibitOpaque);
>> diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h
>> index c1ea949215..eebd0a4d1b 100644
>> --- a/src/qemu/qemu_process.h
>> +++ b/src/qemu/qemu_process.h
>> @@ -38,13 +38,12 @@ int qemuProcessStopCPUs(virQEMUDriver *driver,
>>                         virDomainPausedReason reason,
>>                         virDomainAsyncJob asyncJob);
>>
>> -int qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver,
>> -                                       virDomainObj *vm,
>> +int qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm,
>>                                        virDomainMemoryDef *mem,
>>                                        bool build);
>>
>> -int qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver,
>> -                                        virDomainObj *vm,
>> +int qemuProcessDestroyMemoryBackingPath(virDomainDef *def,
>> +                                        qemuDomainObjPrivate *priv,
>>                                         virDomainMemoryDef *mem);
>>
>> void qemuProcessReconnectAll(virQEMUDriver *driver);
>> -- 
>> 2.34.1
>>
>
> This patch changes the configuration, so it should also adjust the
> documentation and since configuration is used everywhere it should also
> add a few unit tests with the changed configuration and few edge cases,
> just so we can be sure it really works the way it is supposed to.
>

Acknowledged. I did update the documentation already but I did not add
a new unit test. I can work on that......

- Michael
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Martin Kletzander 2 months, 1 week ago
On Tue, Aug 06, 2024 at 03:45:54PM -0500, Michael Galaxy wrote:
>Thank you so much for the detailed comments. I'll work on those.
>
>I believe you had a couple questions below on this one......inlined .....
>
>On 8/6/24 07:35, Martin Kletzander wrote:
>> On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>>> From: Michael Galaxy <mgalaxy@akamai.com>
>>>
>>> +                               _("Domain requesting configuration
>>> for %1$lu NUMA nodes, but memory backing directory only has (%2$lu)
>>> directory paths available. Either reduce this to one directory or
>>> provide more paths to use."),
>>> +                               numa_node_count,
>>> +                               cfg->nb_memoryBackingDirs);
>>> +                return -1;
>>> +            }
>>> +
>>> +            path_index = virBitmapNextSetBit(numaBitmap, -1);
>>
>> What if the single pinning is to the third host node, but there are only
>> two memoryBackingDirs supplied in the config?
>
>That case is totally supported (and expected). For example a *non* vNUMA
>guests (a single virtual NUMA node)
>running on a standard 2-socket (2-NUMA-node) host in a typical pizza box.
>
>If I understood that correctly, that is supported without a problem. The
>error above is describing the case that
>a virtual machine has *more* virtual NUMA nodes configured than the host
>has available to offer. In such a case,
>either the user needs to provide more host nodes or turn this feature
>off or use a single NUMA node (in which
>case all the guest nodes would be placed in a single host node).
>

If the machine is only pinned to the third node and two
memoryBackingDirs are supplied then path_index will be 2 and later
accessing memoryBackingDirs[path_index] will crash.  Or maybe I have
missed something.  That's why I asked since I'm unsure myself, I have
not tested it.

>>> {
>>> +    virQEMUDriver *driver = priv->driver;
>>>     g_autofree char *path = NULL;
>>> +    g_autoptr(virQEMUDriverConfig) cfg =
>>> virQEMUDriverGetConfig(driver);
>>> +    size_t i;
>>>
>>> -    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias,
>>> &path) < 0)
>>> -        return -1;
>>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>>> +        virDomainXMLPrivateDataCallbacks *privateData =
>>> (virDomainXMLPrivateDataCallbacks *) priv;
>>> +        if (qemuGetMemoryBackingPath(def, privateData, i,
>>> mem->info.alias, &path) < 0)
>>> +            return -1;
>>>
>>
>> I wonder why are the paths not saved in the status XML of the running
>> domain.  Since they can be changed during the VM's lifetime (while
>> restarting libvirt) it could cause issues during clean-up.  Sure,
>> unlink() will set errno to ENOENT, but the proper paths will not be
>> cleaned up.  It'd also make this part of the code easier and safer to
>> use from the callers.  But that's pre-existing.
>
>I'm not sure I'm following the question on this one. I do not think we
>want the paths to be changing
>during the VM's lifetime. Are you asking if we *want*  to support that?
>The current
>libvirt XML schema does not support this. The actual path is kept hidden
>from the guest VM
>and is only known internally to libvirt.
>
>Can you clarify?
>

Exactly, we do not want the paths to change.  But if you start the VM,
then stop virtqemud/libvirtd (totally supported), change the config file
to point to a different directory (or even different number of
directories, I haven't even thought of that), then start the daemon back
again, we'll have a problem when the VM is being cleaned up.

For these kinds of situations we are keeping more pieces of information
about running domains in a so called status XML.  This is saved on disk,
but unavailable to the user so saving something there does not
explicitly expose it to the user.  Having the paths saved there would
make it nicer to clean up and it should've probably been done even with
that one path that is supported nowadays.  I can have a look at fixing
that at first and then your patches could be applied on top of that fix
if that's fine with you or you can have a look at tackling that
yourself, it should not be difficult, tests for that should be nice to
write as well, it just needs to be done in a back-compat way, of course.

See qemuDomainObjPrivateXMLFormat() and qemuDomainObjPrivateXMLParse()
for some starting points.
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Michael Galaxy via Devel 2 months, 1 week ago
Hi,

Answers below....

On 8/7/24 08:23, Martin Kletzander wrote:
> On Tue, Aug 06, 2024 at 03:45:54PM -0500, Michael Galaxy wrote:
>> Thank you so much for the detailed comments. I'll work on those.
>>
>> I believe you had a couple questions below on this one......inlined 
>> .....
>>
>> On 8/6/24 07:35, Martin Kletzander wrote:
>>> On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>>>> From: Michael Galaxy <mgalaxy@akamai.com>
>>>>
>>>> +                               _("Domain requesting configuration
>>>> for %1$lu NUMA nodes, but memory backing directory only has (%2$lu)
>>>> directory paths available. Either reduce this to one directory or
>>>> provide more paths to use."),
>>>> +                               numa_node_count,
>>>> + cfg->nb_memoryBackingDirs);
>>>> +                return -1;
>>>> +            }
>>>> +
>>>> +            path_index = virBitmapNextSetBit(numaBitmap, -1);
>>>
>>> What if the single pinning is to the third host node, but there are 
>>> only
>>> two memoryBackingDirs supplied in the config?
>>
>> That case is totally supported (and expected). For example a *non* vNUMA
>> guests (a single virtual NUMA node)
>> running on a standard 2-socket (2-NUMA-node) host in a typical pizza 
>> box.
>>
>> If I understood that correctly, that is supported without a problem. The
>> error above is describing the case that
>> a virtual machine has *more* virtual NUMA nodes configured than the host
>> has available to offer. In such a case,
>> either the user needs to provide more host nodes or turn this feature
>> off or use a single NUMA node (in which
>> case all the guest nodes would be placed in a single host node).
>>
>
> If the machine is only pinned to the third node and two
> memoryBackingDirs are supplied then path_index will be 2 and later
> accessing memoryBackingDirs[path_index] will crash.  Or maybe I have
> missed something.  That's why I asked since I'm unsure myself, I have
> not tested it.
>
Oh ok, now I understand the question. I don't believe I've tested that 
either.

I'll see if libvirt lets the user do that or not and verify. Good catch.


>>>> {
>>>> +    virQEMUDriver *driver = priv->driver;
>>>>     g_autofree char *path = NULL;
>>>> +    g_autoptr(virQEMUDriverConfig) cfg =
>>>> virQEMUDriverGetConfig(driver);
>>>> +    size_t i;
>>>>
>>>> -    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias,
>>>> &path) < 0)
>>>> -        return -1;
>>>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>>>> +        virDomainXMLPrivateDataCallbacks *privateData =
>>>> (virDomainXMLPrivateDataCallbacks *) priv;
>>>> +        if (qemuGetMemoryBackingPath(def, privateData, i,
>>>> mem->info.alias, &path) < 0)
>>>> +            return -1;
>>>>
>>>
>>> I wonder why are the paths not saved in the status XML of the running
>>> domain.  Since they can be changed during the VM's lifetime (while
>>> restarting libvirt) it could cause issues during clean-up. Sure,
>>> unlink() will set errno to ENOENT, but the proper paths will not be
>>> cleaned up.  It'd also make this part of the code easier and safer to
>>> use from the callers.  But that's pre-existing.
>>
>> I'm not sure I'm following the question on this one. I do not think we
>> want the paths to be changing
>> during the VM's lifetime. Are you asking if we *want*  to support that?
>> The current
>> libvirt XML schema does not support this. The actual path is kept hidden
>> from the guest VM
>> and is only known internally to libvirt.
>>
>> Can you clarify?
>>
>
> Exactly, we do not want the paths to change.  But if you start the VM,
> then stop virtqemud/libvirtd (totally supported), change the config file
> to point to a different directory (or even different number of
> directories, I haven't even thought of that), then start the daemon back
> again, we'll have a problem when the VM is being cleaned up.
>
> For these kinds of situations we are keeping more pieces of information
> about running domains in a so called status XML.  This is saved on disk,
> but unavailable to the user so saving something there does not
> explicitly expose it to the user.  Having the paths saved there would
> make it nicer to clean up and it should've probably been done even with
> that one path that is supported nowadays.  I can have a look at fixing
> that at first and then your patches could be applied on top of that fix
> if that's fine with you or you can have a look at tackling that
> yourself, it should not be difficult, tests for that should be nice to
> write as well, it just needs to be done in a back-compat way, of course.
>
> See qemuDomainObjPrivateXMLFormat() and qemuDomainObjPrivateXMLParse()
> for some starting points.


Oooooooh, I had no idea about this "status XML" feature/concept. I did 
not even
know that it was there. Thanks for the heads up. Let me look into it and 
get back to you.

Thanks again,
- Michael
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Michael Galaxy via Devel 2 months ago
Hi Martin,

On 8/7/24 10:10, Michael Galaxy wrote:
> Hi,
>
> Answers below....
>
> On 8/7/24 08:23, Martin Kletzander wrote:
>> On Tue, Aug 06, 2024 at 03:45:54PM -0500, Michael Galaxy wrote:
>>> Thank you so much for the detailed comments. I'll work on those.
>>>
>>> I believe you had a couple questions below on this one......inlined 
>>> .....
>>>
>>> On 8/6/24 07:35, Martin Kletzander wrote:
>>>> On Wed, Jun 05, 2024 at 04:37:35PM -0400, mgalaxy@akamai.com wrote:
>>>>> From: Michael Galaxy <mgalaxy@akamai.com>
>>>>>
>>>>> +                               _("Domain requesting configuration
>>>>> for %1$lu NUMA nodes, but memory backing directory only has (%2$lu)
>>>>> directory paths available. Either reduce this to one directory or
>>>>> provide more paths to use."),
>>>>> +                               numa_node_count,
>>>>> + cfg->nb_memoryBackingDirs);
>>>>> +                return -1;
>>>>> +            }
>>>>> +
>>>>> +            path_index = virBitmapNextSetBit(numaBitmap, -1);
>>>>
>>>> What if the single pinning is to the third host node, but there are 
>>>> only
>>>> two memoryBackingDirs supplied in the config?
>>>
>>> That case is totally supported (and expected). For example a *non* 
>>> vNUMA
>>> guests (a single virtual NUMA node)
>>> running on a standard 2-socket (2-NUMA-node) host in a typical pizza 
>>> box.
>>>
>>> If I understood that correctly, that is supported without a problem. 
>>> The
>>> error above is describing the case that
>>> a virtual machine has *more* virtual NUMA nodes configured than the 
>>> host
>>> has available to offer. In such a case,
>>> either the user needs to provide more host nodes or turn this feature
>>> off or use a single NUMA node (in which
>>> case all the guest nodes would be placed in a single host node).
>>>
>>
>> If the machine is only pinned to the third node and two
>> memoryBackingDirs are supplied then path_index will be 2 and later
>> accessing memoryBackingDirs[path_index] will crash.  Or maybe I have
>> missed something.  That's why I asked since I'm unsure myself, I have
>> not tested it.
>>
> Oh ok, now I understand the question. I don't believe I've tested that 
> either.
>
> I'll see if libvirt lets the user do that or not and verify. Good catch.
>
>
>>>>> {
>>>>> +    virQEMUDriver *driver = priv->driver;
>>>>>     g_autofree char *path = NULL;
>>>>> +    g_autoptr(virQEMUDriverConfig) cfg =
>>>>> virQEMUDriverGetConfig(driver);
>>>>> +    size_t i;
>>>>>
>>>>> -    if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias,
>>>>> &path) < 0)
>>>>> -        return -1;
>>>>> +    for (i = 0; i < cfg->nb_memoryBackingDirs; i++) {
>>>>> +        virDomainXMLPrivateDataCallbacks *privateData =
>>>>> (virDomainXMLPrivateDataCallbacks *) priv;
>>>>> +        if (qemuGetMemoryBackingPath(def, privateData, i,
>>>>> mem->info.alias, &path) < 0)
>>>>> +            return -1;
>>>>>
>>>>
>>>> I wonder why are the paths not saved in the status XML of the running
>>>> domain.  Since they can be changed during the VM's lifetime (while
>>>> restarting libvirt) it could cause issues during clean-up. Sure,
>>>> unlink() will set errno to ENOENT, but the proper paths will not be
>>>> cleaned up.  It'd also make this part of the code easier and safer to
>>>> use from the callers.  But that's pre-existing.
>>>
>>> I'm not sure I'm following the question on this one. I do not think we
>>> want the paths to be changing
>>> during the VM's lifetime. Are you asking if we *want*  to support that?
>>> The current
>>> libvirt XML schema does not support this. The actual path is kept 
>>> hidden
>>> from the guest VM
>>> and is only known internally to libvirt.
>>>
>>> Can you clarify?
>>>
>>
>> Exactly, we do not want the paths to change.  But if you start the VM,
>> then stop virtqemud/libvirtd (totally supported), change the config file
>> to point to a different directory (or even different number of
>> directories, I haven't even thought of that), then start the daemon back
>> again, we'll have a problem when the VM is being cleaned up.
>>
>> For these kinds of situations we are keeping more pieces of information
>> about running domains in a so called status XML.  This is saved on disk,
>> but unavailable to the user so saving something there does not
>> explicitly expose it to the user.  Having the paths saved there would
>> make it nicer to clean up and it should've probably been done even with
>> that one path that is supported nowadays.  I can have a look at fixing
>> that at first and then your patches could be applied on top of that fix
>> if that's fine with you or you can have a look at tackling that
>> yourself, it should not be difficult, tests for that should be nice to
>> write as well, it just needs to be done in a back-compat way, of course.
>>
>> See qemuDomainObjPrivateXMLFormat() and qemuDomainObjPrivateXMLParse()
>> for some starting points.
>
I've finished addressing most of the rest of the review comments here 
and I'll
sent that out, but I'd like to avoid too many more revisions here.

IMHO, the problem above is really orthogonal to this feature and, as you 
said,
applies to the original (single path) case, so I'd like to leave that as 
a homework
assignment to the community.

Realistically, in a well-managed cloud, this is highly unlikely to 
happen, so I'm
not too worried about it.

Would a ticket be helpful somewhere? I don't know if libvirt has a way  
of tracking
these kinds of discoveries, I'd be happy to open a ticket somwhere and put
it in the libvirt backlog.

- Michael
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Martin Kletzander 1 month, 1 week ago
On Thu, Aug 15, 2024 at 01:30:38PM -0500, Michael Galaxy wrote:
>On 8/7/24 10:10, Michael Galaxy wrote:
>> On 8/7/24 08:23, Martin Kletzander wrote:
>>> Exactly, we do not want the paths to change.  But if you start the VM,
>>> then stop virtqemud/libvirtd (totally supported), change the config file
>>> to point to a different directory (or even different number of
>>> directories, I haven't even thought of that), then start the daemon back
>>> again, we'll have a problem when the VM is being cleaned up.
>>>
>>> For these kinds of situations we are keeping more pieces of information
>>> about running domains in a so called status XML.  This is saved on disk,
>>> but unavailable to the user so saving something there does not
>>> explicitly expose it to the user.  Having the paths saved there would
>>> make it nicer to clean up and it should've probably been done even with
>>> that one path that is supported nowadays.  I can have a look at fixing
>>> that at first and then your patches could be applied on top of that fix
>>> if that's fine with you or you can have a look at tackling that
>>> yourself, it should not be difficult, tests for that should be nice to
>>> write as well, it just needs to be done in a back-compat way, of course.
>>>
>>> See qemuDomainObjPrivateXMLFormat() and qemuDomainObjPrivateXMLParse()
>>> for some starting points.
>>
>I've finished addressing most of the rest of the review comments here
>and I'll
>sent that out, but I'd like to avoid too many more revisions here.
>
>IMHO, the problem above is really orthogonal to this feature and, as you
>said,
>applies to the original (single path) case, so I'd like to leave that as
>a homework
>assignment to the community.
>

Yes, but once we start changing things it is harder to fix them.

>Realistically, in a well-managed cloud, this is highly unlikely to
>happen, so I'm
>not too worried about it.
>

I am not talking about accidental changes.  It is fine until someone
"really needs" to change that value and expects libvirt to behave
correctly.

>Would a ticket be helpful somewhere? I don't know if libvirt has a way
>of tracking
>these kinds of discoveries, I'd be happy to open a ticket somwhere and put
>it in the libvirt backlog.
>

Sure, you can file an issue on gitlab:

https://gitlab.com/libvirt/libvirt/-/issues/new

but I'll try to work on and post a quick fix anyway so that it does not
block my already late review of your v3.

>- Michael
>
Re: [PATCH v2 1/4] qemu.conf changes to support multiple memory backend
Posted by Martin Kletzander 2 weeks, 1 day ago
On Sun, Sep 08, 2024 at 08:00:02AM +0200, Martin Kletzander wrote:
>On Thu, Aug 15, 2024 at 01:30:38PM -0500, Michael Galaxy wrote:
>>On 8/7/24 10:10, Michael Galaxy wrote:
>>> On 8/7/24 08:23, Martin Kletzander wrote:
>>>> Exactly, we do not want the paths to change.  But if you start the VM,
>>>> then stop virtqemud/libvirtd (totally supported), change the config file
>>>> to point to a different directory (or even different number of
>>>> directories, I haven't even thought of that), then start the daemon back
>>>> again, we'll have a problem when the VM is being cleaned up.
>>>>
>>>> For these kinds of situations we are keeping more pieces of information
>>>> about running domains in a so called status XML.  This is saved on disk,
>>>> but unavailable to the user so saving something there does not
>>>> explicitly expose it to the user.  Having the paths saved there would
>>>> make it nicer to clean up and it should've probably been done even with
>>>> that one path that is supported nowadays.  I can have a look at fixing
>>>> that at first and then your patches could be applied on top of that fix
>>>> if that's fine with you or you can have a look at tackling that
>>>> yourself, it should not be difficult, tests for that should be nice to
>>>> write as well, it just needs to be done in a back-compat way, of course.
>>>>
>>>> See qemuDomainObjPrivateXMLFormat() and qemuDomainObjPrivateXMLParse()
>>>> for some starting points.
>>>
>>I've finished addressing most of the rest of the review comments here
>>and I'll
>>sent that out, but I'd like to avoid too many more revisions here.
>>
>>IMHO, the problem above is really orthogonal to this feature and, as you
>>said,
>>applies to the original (single path) case, so I'd like to leave that as
>>a homework
>>assignment to the community.
>>
>
>Yes, but once we start changing things it is harder to fix them.
>
>>Realistically, in a well-managed cloud, this is highly unlikely to
>>happen, so I'm
>>not too worried about it.
>>
>
>I am not talking about accidental changes.  It is fine until someone
>"really needs" to change that value and expects libvirt to behave
>correctly.
>
>>Would a ticket be helpful somewhere? I don't know if libvirt has a way
>>of tracking
>>these kinds of discoveries, I'd be happy to open a ticket somwhere and put
>>it in the libvirt backlog.
>>
>
>Sure, you can file an issue on gitlab:
>
>https://gitlab.com/libvirt/libvirt/-/issues/new
>
>but I'll try to work on and post a quick fix anyway so that it does not
>block my already late review of your v3.
>

Sorry, I completely forgot to let you know that the fix is already in
the latest release, last commit of the series in master is 6f0974ca32ce.
Would you mind rebasing your patches on top of current master to reflect
the changes?

Thank you,
Martin

>>- Michael
>>