auto partition guests providing the host NUMA topology

Expand all Fold all
[libvirt] [RFC PATCH auto partition NUMA guest domains v1 1/2] domain: auto partition guests providing the host NUMA topology
Posted by Wim Ten Have 7 years, 4 months ago
From: Wim ten Have <wim.ten.have@oracle.com>

Add a mechanism to auto partition the host NUMA topology under the
guest domain.

This patch adds a framework to automatically partition the host into a
small vNUMA subset, sized by the guest XML <vcpu> and <memory>
descriptions, when <cpu mode="host-passthrough" check="numa".../> is in
effect and the host capabilities indicate that a physical NUMA topology
is present.

The mechanism automatically renders the NUMA architecture provided by
the host capabilities, evenly balances the guest's reserved vcpus and
memory amongst its vNUMA cells, and pins the vcpus allocated to each
cell to the physical cpuset of the matching host NUMA node.  This is
done in such a way that the host NUMA topology is still in effect under
the partitioned guest vNUMA domain.

Signed-off-by: Wim ten Have <wim.ten.have@oracle.com>
---
 docs/formatdomain.html.in |   7 ++
 docs/schemas/cputypes.rng |   1 +
 src/conf/cpu_conf.c       |   3 +-
 src/conf/cpu_conf.h       |   1 +
 src/conf/domain_conf.c    | 166 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
index 1f12ab5b4214..ba073d952545 100644
--- a/docs/formatdomain.html.in
+++ b/docs/formatdomain.html.in
@@ -1500,6 +1500,13 @@
           <dd>The virtual CPU created by the hypervisor will be checked
             against the CPU specification and the domain will not be started
             unless the two CPUs match.</dd>
+
+          <dt><code>numa</code></dt>
+          <dd>When the CPU mode='host-passthrough' check='numa' option
+            combination is set, libvirt auto partitions the guest domain
+            by rendering the host NUMA architecture.  Here the virtual
+            CPUs and memory are evenly balanced across the defined NUMA
+            nodes. The vCPUs are also pinned to their physical CPUs.</dd>
         </dl>
 
         <span class="since">Since 0.9.10</span>, an optional <code>mode</code>
diff --git a/docs/schemas/cputypes.rng b/docs/schemas/cputypes.rng
index 1f1e0e36d59b..d384d161ee7e 100644
--- a/docs/schemas/cputypes.rng
+++ b/docs/schemas/cputypes.rng
@@ -29,6 +29,7 @@
         <value>none</value>
         <value>partial</value>
         <value>full</value>
+        <value>numa</value>
       </choice>
     </attribute>
   </define>
diff --git a/src/conf/cpu_conf.c b/src/conf/cpu_conf.c
index 863413e75eaa..0d52f6aa4813 100644
--- a/src/conf/cpu_conf.c
+++ b/src/conf/cpu_conf.c
@@ -52,7 +52,8 @@ VIR_ENUM_IMPL(virCPUCheck, VIR_CPU_CHECK_LAST,
               "default",
               "none",
               "partial",
-              "full")
+              "full",
+              "numa")
 
 VIR_ENUM_IMPL(virCPUFallback, VIR_CPU_FALLBACK_LAST,
               "allow",
diff --git a/src/conf/cpu_conf.h b/src/conf/cpu_conf.h
index 9f2e7ee2649d..f2e2f0bef3ae 100644
--- a/src/conf/cpu_conf.h
+++ b/src/conf/cpu_conf.h
@@ -68,6 +68,7 @@ typedef enum {
     VIR_CPU_CHECK_NONE,
     VIR_CPU_CHECK_PARTIAL,
     VIR_CPU_CHECK_FULL,
+    VIR_CPU_CHECK_NUMA,
 
     VIR_CPU_CHECK_LAST
 } virCPUCheck;
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 9911d56130a9..c2f9398cfe85 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -1759,6 +1759,168 @@ virDomainDefGetVcpusTopology(const virDomainDef *def,
 }
 
 
+/**
+ * virDomainNumaAutoconfig: rebuild def->numa as a guest vNUMA layout
+ * holding one cell per host NUMA cell reported in @caps.
+ * @def: domain definition; cpu topology, numa and vcpu cpumasks are rewritten
+ * @caps: host capabilities; when NULL the function does nothing
+ *
+ * Acts only for <cpu mode="host-passthrough" check="numa">, i.e. when
+ * def->cpu has VIR_CPU_MODE_HOST_PASSTHROUGH and VIR_CPU_CHECK_NUMA set.
+ * Memory is split evenly over the cells, vcpus are dealt round-robin to
+ * the cells, each vcpu is given its host cell's cpu ids as cpumask and
+ * the host cell sibling distances are copied to the guest cells.
+ *
+ * Returns 0 on success or when nothing was configured, and -1 on error.
+ */
+static int
+virDomainNumaAutoconfig(virDomainDefPtr def,
+                        virCapsPtr caps)
+{
+    int ret = -1;
+
+    if (caps && def->cpu &&
+        def->cpu->mode == VIR_CPU_MODE_HOST_PASSTHROUGH &&
+        def->cpu->check == VIR_CPU_CHECK_NUMA) {
+
+        size_t i, cell;
+        size_t nvcpus = 0;
+        size_t nnumaCell = 0;
+        unsigned long long memsizeCell = 0;
+        virBitmapPtr vnumask = NULL;
+        virCapsHostPtr host = &caps->host;
+
+        nnumaCell = host->nnumaCell;
+        if (!nnumaCell)
+            goto cleanup; /* no host NUMA cells: def is left untouched, 0 is returned */
+
+        /* Count online vcpus.  NOTE(review): the VIR_WARN calls print size_t with %ld */
+        for (i = 0; i < def->maxvcpus; i++)
+            if (def->vcpus[i]->online)
+                nvcpus++;
+
+        if (nvcpus < nnumaCell) {
+            VIR_WARN("vNUMA disabled: %ld vcpus is insufficient "
+                     "to arrange a vNUMA topology for %ld nodes.",
+                      nvcpus, nnumaCell);
+            goto cleanup;
+        }
+
+        /* Even memory share per cell; give up (returning 0) if it rounds to 0.
+         * NOTE(review): the division remainder is not assigned to any cell. */
+        if ((memsizeCell = def->mem.total_memory / nnumaCell) == 0)
+            goto cleanup;
+
+        /* Correct vNUMA can only be accomplished if the number of maxvcpus
+         * is a multiple of the number of host NUMA cells.  Otherwise
+         * sockets, cores and threads are set to 0; per the original
+         * author libvirt then creates a default topology where all vcpus
+         * appear as sockets and cores and threads are set to 1.
+         */
+        if (def->maxvcpus % nnumaCell) {
+            VIR_WARN("vNUMA: configured %ld vcpus do not meet the host "
+                     "%ld NUMA nodes for an evenly balanced cpu topology.",
+                      def->maxvcpus, nnumaCell);
+            def->cpu->sockets = def->cpu->cores = def->cpu->threads = 0;
+        } else {
+            /* sockets = socket_id of the last host cell's first cpu, plus 1;
+             * threads = host->cpu->threads, or 1 when maxvcpus does not divide.
+             * NOTE(review): host->cpu == NULL and threads == 0 are unchecked. */
+            unsigned int sockets = host->numaCell[nnumaCell-1]->cpus->socket_id + 1;
+            unsigned int threads = host->cpu->threads;
+
+            if (def->maxvcpus % (sockets * threads))
+                threads = 1;
+
+            def->cpu->cores = def->maxvcpus / (sockets * threads);
+            def->cpu->threads = threads;
+            def->cpu->sockets = sockets;
+        }
+
+        /* Build the vNUMA topology.  The previous def->numa may have had
+         * different dimensions, so it is freed entirely and a new one
+         * is built from scratch.
+         * NOTE(review): on the error paths below def->numa stays partially
+         * populated; presumably the caller discards def - confirm. */
+        virDomainNumaFree(def->numa);
+        if (!(def->numa = virDomainNumaNew()))
+            goto error;
+
+        if (!virDomainNumaSetNodeCount(def->numa, nnumaCell))
+            goto error;
+
+        for (cell = 0; cell < nnumaCell; cell++) {
+            char *vcpus = NULL;
+            size_t ndistances;
+            virBitmapPtr cpumask = NULL;
+            virCapsHostNUMACell *numaCell = host->numaCell[cell];
+
+            /* every guest cell gets the same memory share */
+            virDomainNumaSetNodeMemorySize(def->numa, cell, memsizeCell);
+
+            /* build the "cell,cell+nnumaCell,..." vcpu list string of this cell */
+            for (i = cell; i < def->maxvcpus; i += nnumaCell) {
+                char *tmp = NULL;
+
+                if ((virAsprintf(&tmp, "%ld%s", i,
+                        ((def->maxvcpus - i) > nnumaCell) ? "," : "") < 0) ||
+                    (virAsprintf(&vcpus, "%s%s",
+                        (vcpus ? vcpus : ""), tmp) < 0)) {
+                    VIR_FREE(tmp);
+                    VIR_FREE(vcpus);
+                    goto error;
+                }
+                VIR_FREE(tmp);
+            } /* NOTE(review): if virAsprintf allocates anew, the old vcpus string leaks per pass */
+
+            if ((virBitmapParse(vcpus, &cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) ||
+                (virDomainNumaSetNodeCpumask(def->numa, cell, cpumask) == NULL)) {
+                VIR_FREE(vcpus);
+                goto error;
+            }
+            VIR_FREE(vcpus);
+
+            /* pin mask of host cell cpu ids.  NOTE(review): an id may exceed the bitmap size */
+            if (!(vnumask = virBitmapNew(nnumaCell * numaCell->ncpus)))
+                goto error;
+
+            for (i = 0; i < numaCell->ncpus; i++) {
+                unsigned int id = numaCell->cpus[i].id;
+
+                if (virBitmapSetBit(vnumask, id) < 0) {
+                    virBitmapFree(vnumask);
+                    goto error;
+                }
+            }
+
+            for (i = 0; i < def->maxvcpus; i++) {
+                if (virBitmapIsBitSet(cpumask, i))
+                    def->vcpus[i]->cpumask = virBitmapNewCopy(vnumask); /* NOTE(review): result unchecked, old mask not freed */
+            }
+            virBitmapFree(vnumask);
+
+            /* copy host distances.  NOTE(review): count is checked against nnumaCell, not ndistances */
+            ndistances = numaCell->nsiblings;
+            if (ndistances &&
+                virDomainNumaSetNodeDistanceCount(def->numa, cell, ndistances) != nnumaCell)
+                goto error;
+
+            for (i = 0; i < ndistances; i++) {
+                unsigned int distance = numaCell->siblings[i].distance;
+
+                if (virDomainNumaSetNodeDistance(def->numa, cell, i, distance) != distance)
+                    goto error;
+            }
+        }
+    }
+ cleanup: /* reached by the early "nothing to do" exits and by normal completion */
+    ret = 0;
+
+ error: /* reached directly only on failure, with ret still -1 */
+    return ret;
+}
+
+
 virDomainDiskDefPtr
 virDomainDiskDefNew(virDomainXMLOptionPtr xmlopt)
 {
@@ -19749,6 +19911,10 @@ virDomainDefParseXML(xmlDocPtr xml,
     if (virDomainNumaDefCPUParseXML(def->numa, ctxt) < 0)
         goto error;
 
+    /* Check and apply auto partition vNUMA topology to the guest if requested */
+    if (virDomainNumaAutoconfig(def, caps))
+        goto error;
+
     if (virDomainNumaGetCPUCountTotal(def->numa) > virDomainDefGetVcpusMax(def)) {
         virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                        _("Number of CPUs in <numa> exceeds the"
-- 
2.17.1

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
[libvirt] [RFC PATCH auto partition NUMA guest domains v1 1/2] domain: auto partition guests providing the host NUMA topology
[libvirt] [RFC PATCH auto partition NUMA guest domains v1 2/2] qemuxml2argv: add tests that exercise vNUMA auto partition topology