[libvirt PATCH 08/13] ch_cgroup: methods for cgroup mgmt in ch driver

Vineeth Pillai posted 13 patches 4 years, 3 months ago
There is a newer version of this series
[libvirt PATCH 08/13] ch_cgroup: methods for cgroup mgmt in ch driver
Posted by Vineeth Pillai 4 years, 3 months ago
Signed-off-by: Vineeth Pillai <viremana@linux.microsoft.com>
Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
---
 po/POTFILES.in      |   1 +
 src/ch/ch_cgroup.c  | 457 ++++++++++++++++++++++++++++++++++++++++++++
 src/ch/ch_cgroup.h  |  45 +++++
 src/ch/ch_conf.c    |   2 +
 src/ch/ch_conf.h    |   4 +-
 src/ch/ch_domain.c  |  33 ++++
 src/ch/ch_domain.h  |   3 +-
 src/ch/ch_monitor.c | 125 ++++++++++--
 src/ch/ch_monitor.h |  54 +++++-
 src/ch/ch_process.c | 288 +++++++++++++++++++++++++++-
 src/ch/ch_process.h |   3 +
 src/ch/meson.build  |   2 +
 12 files changed, 991 insertions(+), 26 deletions(-)
 create mode 100644 src/ch/ch_cgroup.c
 create mode 100644 src/ch/ch_cgroup.h

diff --git a/po/POTFILES.in b/po/POTFILES.in
index b554cf08ca..3a8db501bc 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -19,6 +19,7 @@
 @SRCDIR@src/bhyve/bhyve_parse_command.c
 @SRCDIR@src/bhyve/bhyve_process.c
 @SRCDIR@src/ch/ch_conf.c
+@SRCDIR@src/ch/ch_cgroup.c
 @SRCDIR@src/ch/ch_domain.c
 @SRCDIR@src/ch/ch_driver.c
 @SRCDIR@src/ch/ch_monitor.c
diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c
new file mode 100644
index 0000000000..6be2184cf1
--- /dev/null
+++ b/src/ch/ch_cgroup.c
@@ -0,0 +1,457 @@
+/*
+ * ch_cgroup.c: CH cgroup management
+ *
+ * Copyright Microsoft Corp. 2020-2021
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "ch_cgroup.h"
+#include "ch_domain.h"
+#include "ch_process.h"
+#include "vircgroup.h"
+#include "virlog.h"
+#include "viralloc.h"
+#include "virerror.h"
+#include "domain_audit.h"
+#include "domain_cgroup.h"
+#include "virscsi.h"
+#include "virstring.h"
+#include "virfile.h"
+#include "virtypedparam.h"
+#include "virnuma.h"
+#include "virdevmapper.h"
+#include "virutil.h"
+
+#define VIR_FROM_THIS VIR_FROM_CH
+
+VIR_LOG_INIT("ch.ch_cgroup");
+
+/**
+ * chSetupBlkioCgroup:
+ * @vm: domain object
+ *
+ * Applies the block I/O tuning from the domain definition to the domain
+ * cgroup. A missing blkio controller is only an error when the definition
+ * carries a weight or per-device entries.
+ *
+ * Returns 0 when there is nothing to do, -1 if tuning was requested without
+ * a blkio controller, otherwise the result of virDomainCgroupSetupBlkio().
+ */
+static int
+chSetupBlkioCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    bool tuningRequested = vm->def->blkio.weight || vm->def->blkio.ndevices;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO))
+        return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio);
+
+    if (!tuningRequested)
+        return 0;
+
+    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                   _("Block I/O tuning is not available on this host"));
+    return -1;
+}
+
+
+/**
+ * chSetupMemoryCgroup:
+ * @vm: domain object
+ *
+ * Applies the memory tuning from the domain definition to the domain
+ * cgroup. Without a memory controller this fails only if one of the hard,
+ * soft or swap limits is set in the definition.
+ *
+ * Returns 0 when there is nothing to do, -1 if a limit was requested without
+ * a memory controller, otherwise the result of virDomainCgroupSetupMemtune().
+ */
+static int
+chSetupMemoryCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY))
+        return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem);
+
+    if (!virMemoryLimitIsSet(vm->def->mem.hard_limit) &&
+        !virMemoryLimitIsSet(vm->def->mem.soft_limit) &&
+        !virMemoryLimitIsSet(vm->def->mem.swap_hard_limit))
+        return 0;
+
+    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                   _("Memory cgroup is not available on this host"));
+    return -1;
+}
+
+/**
+ * chSetupCpusetCgroup:
+ * @vm: domain object
+ *
+ * Turns on cpuset memory migration for the domain cgroup when the cpuset
+ * controller is present; otherwise does nothing.
+ *
+ * Returns 0 on success, -1 if enabling memory migration failed.
+ */
+static int
+chSetupCpusetCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) &&
+        virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chSetupCpuCgroup:
+ * @vm: domain object
+ *
+ * Applies the CPU shares from the domain definition to the domain cgroup.
+ * Nothing happens unless shares were explicitly specified; if they were,
+ * the cpu controller must be available.
+ *
+ * Returns 0 on success or when no shares are configured, -1 on error.
+ */
+static int
+chSetupCpuCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (!vm->def->cputune.sharesSpecified)
+        return 0;
+
+    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("CPU tuning is not available on this host"));
+        return -1;
+    }
+
+    if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chInitCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC indexes forwarded to virCgroupNewMachine()
+ *
+ * Creates the machine cgroup for @vm and stores it in the domain private
+ * data. Nothing is done for an unprivileged driver or when cgroups are not
+ * available. A domain without a resource definition is placed in the
+ * "/machine" partition.
+ *
+ * On every return path priv->cgroup is either a valid handle or NULL, so
+ * callers may test it to find out whether a cgroup was set up.
+ *
+ * Returns 0 on success or when cgroup setup is skipped, -1 on error.
+ */
+static int
+chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Release any previous handle and clear the pointer: the early returns
+     * below must not leave a dangling priv->cgroup behind. */
+    g_clear_pointer(&priv->cgroup, virCgroupFree);
+
+    if (!vm->def->resource) {
+        virDomainResourceDef *res;
+
+        res = g_new0(virDomainResourceDef, 1);
+
+        res->partition = g_strdup("/machine");
+
+        vm->def->resource = res;
+    }
+
+    if (vm->def->resource->partition[0] != '/') {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Resource partition '%s' must start with '/'"),
+                       vm->def->resource->partition);
+        return -1;
+    }
+
+    if (virCgroupNewMachine(priv->machineName,
+                            "ch",
+                            vm->def->uuid,
+                            NULL,
+                            vm->pid,
+                            false,
+                            nnicindexes, nicindexes,
+                            vm->def->resource->partition,
+                            cfg->cgroupControllers,
+                            0, /* maxThreadsPerProc */
+                            &priv->cgroup) < 0) {
+        if (virCgroupNewIgnoreError())
+            return 0;
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * chRestoreThreadCpusetMems:
+ * @cgroup: domain cgroup
+ * @nameval: kind of thread cgroup to open
+ * @id: index of the thread cgroup
+ *
+ * Opens one per-thread cgroup, enables cpuset memory migration on it and
+ * writes its current cpuset.mems value back. All resources acquired here
+ * are released before returning.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+chRestoreThreadCpusetMems(virCgroup *cgroup,
+                          virCgroupThreadName nameval,
+                          int id)
+{
+    g_autofree char *nodeset = NULL;
+    virCgroup *thread_cgroup = NULL;
+    int ret = -1;
+
+    if (virCgroupNewThread(cgroup, nameval, id, false, &thread_cgroup) < 0 ||
+        virCgroupSetCpusetMemoryMigrate(thread_cgroup, true) < 0 ||
+        virCgroupGetCpusetMems(thread_cgroup, &nodeset) < 0 ||
+        virCgroupSetCpusetMems(thread_cgroup, nodeset) < 0)
+        goto cleanup;
+
+    ret = 0;
+
+ cleanup:
+    virCgroupFree(thread_cgroup);
+    return ret;
+}
+
+
+/**
+ * chRestoreCgroupState:
+ * @vm: domain object
+ *
+ * Best-effort repair of the cpuset settings of an already running domain.
+ * The domain cgroup is given all host memory nodes, then the vCPU, iothread
+ * and emulator thread cgroups get memory migration enabled and their
+ * cpuset.mems rewritten. Does nothing when NUMA or the cpuset controller is
+ * unavailable. Processing stops at the first thread cgroup that fails.
+ *
+ * Each thread cgroup is handled by chRestoreThreadCpusetMems(), which owns
+ * its own handle and nodeset string; keeping them in loop-spanning
+ * variables previously allowed a double free when a later iteration failed.
+ */
+static void
+chRestoreCgroupState(virDomainObj * vm)
+{
+    g_autofree char *mem_mask = NULL;
+    virCHDomainObjPrivate *priv = vm->privateData;
+    size_t i = 0;
+
+    g_autoptr(virBitmap) all_nodes = NULL;
+
+    if (!virNumaIsAvailable() ||
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return;
+
+    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
+        goto error;
+
+    if (!(mem_mask = virBitmapFormat(all_nodes)))
+        goto error;
+
+    if ((virCgroupHasEmptyTasks(priv->cgroup,
+                                VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
+        goto error;
+
+    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
+        goto error;
+
+    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
+        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (chRestoreThreadCpusetMems(priv->cgroup,
+                                      VIR_CGROUP_THREAD_VCPU, i) < 0)
+            return;
+    }
+
+    for (i = 0; i < vm->def->niothreadids; i++) {
+        if (chRestoreThreadCpusetMems(priv->cgroup,
+                                      VIR_CGROUP_THREAD_IOTHREAD,
+                                      vm->def->iothreadids[i]->iothread_id) < 0)
+            return;
+    }
+
+    if (chRestoreThreadCpusetMems(priv->cgroup,
+                                  VIR_CGROUP_THREAD_EMULATOR, 0) < 0)
+        return;
+
+    return;
+
+ error:
+    virResetLastError();
+    VIR_DEBUG("Couldn't restore cgroups to meaningful state");
+}
+
+/**
+ * chConnectCgroup:
+ * @vm: domain object
+ *
+ * Re-attaches to the cgroup of an already running domain by detecting it
+ * from the domain's pid, then tries to bring its cpuset state back to a
+ * sane configuration. Nothing is done for an unprivileged driver or when
+ * cgroups are not available.
+ *
+ * Returns 0 on success or when skipped, -1 if detection failed (in which
+ * case priv->cgroup is left NULL).
+ */
+int
+chConnectCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
+
+    if (!priv->driver->privileged)
+        return 0;
+
+    if (!virCgroupAvailable())
+        return 0;
+
+    /* Clear the pointer together with the handle so a failed detection
+     * below cannot leave priv->cgroup pointing at freed memory. */
+    g_clear_pointer(&priv->cgroup, virCgroupFree);
+
+    if (virCgroupNewDetectMachine(vm->def->name,
+                                  "ch",
+                                  vm->pid,
+                                  cfg->cgroupControllers,
+                                  priv->machineName, &priv->cgroup) < 0)
+        return -1;
+
+    chRestoreCgroupState(vm);
+    return 0;
+}
+
+/**
+ * chSetupCgroup:
+ * @vm: domain object
+ * @nnicindexes: number of entries in @nicindexes
+ * @nicindexes: NIC indexes handed on to chInitCgroup()
+ *
+ * Creates the machine cgroup for a started domain and configures the
+ * blkio, memory, cpu and cpuset controllers in that order. Requires that
+ * vm->pid is already set.
+ *
+ * Returns 0 on success or when no cgroup ended up being created, -1 on
+ * error.
+ */
+int
+chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    if (vm->pid == 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("Cannot setup cgroups until process is started"));
+        return -1;
+    }
+
+    if (chInitCgroup(vm, nnicindexes, nicindexes) < 0)
+        return -1;
+
+    /* chInitCgroup() may succeed without creating a cgroup */
+    if (priv->cgroup == NULL)
+        return 0;
+
+    if (chSetupBlkioCgroup(vm) < 0 ||
+        chSetupMemoryCgroup(vm) < 0 ||
+        chSetupCpuCgroup(vm) < 0 ||
+        chSetupCpusetCgroup(vm) < 0)
+        return -1;
+
+    return 0;
+}
+
+/**
+ * chSetupCgroupVcpuBW:
+ * @cgroup: cgroup to configure
+ * @period: CPU bandwidth period
+ * @quota: CPU bandwidth quota
+ *
+ * Forwards @period and @quota to virCgroupSetupCpuPeriodQuota().
+ *
+ * Returns the result of virCgroupSetupCpuPeriodQuota().
+ */
+int
+chSetupCgroupVcpuBW(virCgroup * cgroup,
+                    unsigned long long period, long long quota)
+{
+    int rc = virCgroupSetupCpuPeriodQuota(cgroup, period, quota);
+
+    return rc;
+}
+
+
+/**
+ * chSetupCgroupCpusetCpus:
+ * @cgroup: cgroup to configure
+ * @cpumask: CPUs to allow
+ *
+ * Forwards @cpumask to virCgroupSetupCpusetCpus().
+ *
+ * Returns the result of virCgroupSetupCpusetCpus().
+ */
+int
+chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask)
+{
+    int rc = virCgroupSetupCpusetCpus(cgroup, cpumask);
+
+    return rc;
+}
+
+/**
+ * chSetupGlobalCpuCgroup:
+ * @vm: domain object
+ *
+ * Applies the domain-wide CPU bandwidth limit (global_period/global_quota
+ * from the cputune definition) to the domain cgroup. A configured limit
+ * requires the cpu controller.
+ *
+ * Returns 0 on success or when there is nothing to apply, -1 on error.
+ */
+int
+chSetupGlobalCpuCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    unsigned long long period = vm->def->cputune.global_period;
+    long long quota = vm->def->cputune.global_quota;
+    bool wantBandwidth = period || quota;
+    bool hasCpu = virCgroupHasController(priv->cgroup,
+                                         VIR_CGROUP_CONTROLLER_CPU);
+    g_autofree char *nodesetStr = NULL;
+    virDomainNumatuneMemMode numaMode;
+
+    if (wantBandwidth && !hasCpu) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        return -1;
+    }
+
+    /* With neither the cpu nor the cpuset controller there is nothing
+     * this function could configure. */
+    if (!hasCpu &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
+        return 0;
+
+    /* NOTE(review): the formatted nodeset is not used afterwards; only a
+     * formatting failure affects the result. */
+    if (virDomainNumatuneGetMode(vm->def->numa, -1, &numaMode) == 0 &&
+        numaMode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+        virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
+                                            priv->autoNodeset,
+                                            &nodesetStr, -1) < 0)
+        return -1;
+
+    if (wantBandwidth &&
+        chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0)
+        return -1;
+
+    return 0;
+}
+
+
+/**
+ * chRemoveCgroup:
+ * @vm: domain object
+ *
+ * Terminates the machine registered under the domain's machine name and
+ * removes the domain cgroup. A failure to terminate the machine is only
+ * logged. A domain without a cgroup counts as success.
+ *
+ * Returns 0 if there is no cgroup, otherwise the result of
+ * virCgroupRemove().
+ */
+int
+chRemoveCgroup(virDomainObj * vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+
+    /* Not supported, so claim success */
+    if (!priv->cgroup)
+        return 0;
+
+    if (virCgroupTerminateMachine(priv->machineName) < 0 &&
+        !virCgroupNewIgnoreError())
+        VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
+
+    return virCgroupRemove(priv->cgroup);
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesDataFree:
+ * @data: rollback data to release, may be NULL
+ *
+ * Frees @data together with the cgroup handle and the saved memory mask
+ * it holds.
+ */
+static void
+chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data)
+{
+    if (data == NULL)
+        return;
+
+    g_free(data->emulatorMemMask);
+    virCgroupFree(data->emulatorCgroup);
+    g_free(data);
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesAllow:
+ * @cgroup: domain cgroup pointer
+ * @retData: filled with structure used to roll back the operation
+ *
+ * Temporarily lets the cloud hypervisor emulator thread use every host
+ * NUMA memory node. This is needed while hotplugging cpus, which requires
+ * memory allocated in the DMA region. The previous setting is remembered in
+ * @retData and can be put back with chCgroupEmulatorAllNodesRestore.
+ *
+ * When NUMA or the cpuset controller is unavailable the function succeeds
+ * without touching @retData.
+ *
+ * Returns 0 on success -1 on error
+ */
+int
+chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
+                              chCgroupEmulatorAllNodesData ** retData)
+{
+    g_autoptr(virBitmap) hostNodes = NULL;
+    g_autofree char *hostNodesStr = NULL;
+    chCgroupEmulatorAllNodesData *rollback = NULL;
+
+    if (!(virNumaIsAvailable() &&
+          virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET)))
+        return 0;
+
+    hostNodes = virNumaGetHostMemoryNodeset();
+    if (!hostNodes)
+        return -1;
+
+    hostNodesStr = virBitmapFormat(hostNodes);
+    if (!hostNodesStr)
+        return -1;
+
+    rollback = g_new0(chCgroupEmulatorAllNodesData, 1);
+
+    /* Save the current mask before overriding it with all host nodes */
+    if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
+                           false, &rollback->emulatorCgroup) < 0 ||
+        virCgroupGetCpusetMems(rollback->emulatorCgroup,
+                               &rollback->emulatorMemMask) < 0 ||
+        virCgroupSetCpusetMems(rollback->emulatorCgroup, hostNodesStr) < 0) {
+        chCgroupEmulatorAllNodesDataFree(rollback);
+        return -1;
+    }
+
+    *retData = rollback;
+    return 0;
+}
+
+
+/**
+ * chCgroupEmulatorAllNodesRestore:
+ * @data: data structure created by chCgroupEmulatorAllNodesAllow
+ *
+ * Writes the saved memory mask back to the emulator thread cgroup, undoing
+ * chCgroupEmulatorAllNodesAllow, and releases @data. The last reported
+ * error is preserved across the operation. A NULL @data is ignored.
+ */
+void
+chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data)
+{
+    virError *savedErr;
+
+    if (data == NULL)
+        return;
+
+    virErrorPreserveLast(&savedErr);
+    virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask);
+    virErrorRestore(&savedErr);
+
+    chCgroupEmulatorAllNodesDataFree(data);
+}
diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h
new file mode 100644
index 0000000000..0152b5477c
--- /dev/null
+++ b/src/ch/ch_cgroup.h
@@ -0,0 +1,45 @@
+/*
+ * ch_cgroup.h: CH cgroup management
+ *
+ * Copyright Microsoft Corp. 2020-2021
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "virusb.h"
+#include "vircgroup.h"
+#include "domain_conf.h"
+#include "ch_conf.h"
+
+int chConnectCgroup(virDomainObj * vm);
+int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes);
+int chSetupCgroupVcpuBW(virCgroup * cgroup,
+                        unsigned long long period, long long quota);
+int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask);
+int chSetupGlobalCpuCgroup(virDomainObj * vm);
+int chRemoveCgroup(virDomainObj * vm);
+
+typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData;
+
+struct _chCgroupEmulatorAllNodesData { /* rollback state filled by chCgroupEmulatorAllNodesAllow */
+    virCgroup *emulatorCgroup; /* emulator thread cgroup whose cpuset.mems was overridden */
+    char *emulatorMemMask; /* cpuset.mems value read before the override, written back on restore */
+};
+
+int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
+                                  chCgroupEmulatorAllNodesData ** data);
+void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data);
diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c
index ed0fffe5d6..7f70452296 100644
--- a/src/ch/ch_conf.c
+++ b/src/ch/ch_conf.c
@@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged)
     if (!(cfg = virObjectNew(virCHDriverConfigClass)))
         return NULL;
 
+    cfg->cgroupControllers = -1; /* Auto detect */
+
     if (privileged) {
         if (virGetUserID(CH_USER, &cfg->user) < 0)
             return NULL;
diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h
index 49f286f97a..19deb8e568 100644
--- a/src/ch/ch_conf.h
+++ b/src/ch/ch_conf.h
@@ -35,11 +35,13 @@ struct _virCHDriverConfig {
 
     char *stateDir;
     char *logDir;
-
+    int cgroupControllers;
     uid_t user;
     gid_t group;
 };
 
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
+
 struct _virCHDriver
 {
     virMutex lock;
diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c
index e1030800aa..d0aaeed1f4 100644
--- a/src/ch/ch_domain.c
+++ b/src/ch/ch_domain.c
@@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
                        _("Serial can only be enabled for a PTY"));
         return -1;
     }
+    return 0;
+}
+
+/**
+ * virCHDomainRefreshThreadInfo:
+ * @vm: domain object
+ *
+ * Refreshes the monitor's thread list and records the thread id of every
+ * detected vCPU thread in the private data of the matching vCPU definition.
+ * A thread naming a vCPU id that the domain definition cannot resolve is
+ * skipped with a warning. A warning is also logged when the number of vCPU
+ * threads found differs from the maximum vCPU count.
+ *
+ * Returns 0.
+ */
+int
+virCHDomainRefreshThreadInfo(virDomainObj *vm)
+{
+    size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
+    virCHMonitorThreadInfo *info = NULL;
+    size_t nthreads, ncpus = 0;
+    size_t i;
+
+    nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
+                                         true, &info);
+
+    for (i = 0; i < nthreads; i++) {
+        virCHDomainVcpuPrivate *vcpupriv;
+        virDomainVcpuDef *vcpu;
+        virCHMonitorCPUInfo *vcpuInfo;
+
+        if (info[i].type != virCHThreadTypeVcpu)
+            continue;
+
+        /* TODO: hotplug support */
+        vcpuInfo = &info[i].vcpuInfo;
+        vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid);
+        /* The id was parsed from a thread name, so do not trust it to
+         * resolve to a vCPU of this definition. */
+        if (!vcpu) {
+            VIR_WARN("Ignoring thread %d for unknown vcpu %d",
+                     (int)vcpuInfo->tid, vcpuInfo->cpuid);
+            continue;
+        }
+        vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
+        vcpupriv->tid = vcpuInfo->tid;
+        ncpus++;
+    }
+
+    /* TODO: Remove the warning when hotplug is implemented. */
+    if (ncpus != maxvcpus)
+        VIR_WARN("Mismatch in the number of cpus, expected: %zu, actual: %zu",
+                 maxvcpus, ncpus);
 
     return 0;
 }
diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h
index 3ac3421015..2ce3e2cef3 100644
--- a/src/ch/ch_domain.h
+++ b/src/ch/ch_domain.h
@@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job)
 void
 virCHDomainObjEndJob(virDomainObj *obj);
 
-int virCHDomainRefreshVcpuInfo(virDomainObj *vm);
+int virCHDomainRefreshThreadInfo(virDomainObj *vm);
+
 pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid);
 bool virCHDomainHasVcpuPids(virDomainObj *vm);
 
diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c
index c0ae031200..095779cb3f 100644
--- a/src/ch/ch_monitor.c
+++ b/src/ch/ch_monitor.c
@@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
 
 static virClass *virCHMonitorClass;
 static void virCHMonitorDispose(void *obj);
+static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
 
 static int virCHMonitorOnceInit(void)
 {
@@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque)
     virCHMonitor *mon = opaque;
 
     VIR_DEBUG("mon=%p", mon);
+    virCHMonitorThreadInfoFree(mon);
     virObjectUnref(mon->vm);
 }
 
@@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response
     return ret;
 }
 
+/**
+ * virCHMonitorGetInfo:
+ * @mon: Pointer to the monitor
+ * @info: Get VM info
+ *
+ * Retrieve the VM info and store in @info
+ *
+ * Returns 0 on success.
+ */
+int
+virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
+{
+    return virCHMonitorGet(mon, URL_VM_INFO, info);
+}
+
+/**
+ * virCHMonitorThreadInfoFree:
+ * @mon: Pointer to the monitor
+ *
+ * Drops the cached thread list of @mon and resets its thread count.
+ */
+static void
+virCHMonitorThreadInfoFree(virCHMonitor *mon)
+{
+    VIR_FREE(mon->threads);
+    mon->nthreads = 0;
+}
+
+/**
+ * virCHMonitorRefreshThreadInfo:
+ * @mon: Pointer to the monitor
+ *
+ * Rebuilds the cached thread list of @mon. The tasks of the VM process are
+ * listed and each one is classified by the name read from
+ * /proc/<pid>/task/<tid>/comm: names starting with "vcpu" followed by an
+ * index are vCPU threads, names starting with "_disk", "_net" or "_rng" are
+ * I/O threads, and everything else is recorded as an emulator thread.
+ * Tasks whose name cannot be read or whose vCPU index cannot be parsed are
+ * skipped.
+ *
+ * Entries are stored contiguously, so indexes 0 .. mon->nthreads - 1 of
+ * mon->threads are always valid.
+ *
+ * Returns the number of entries stored (0 if the task list is unavailable).
+ */
+static size_t
+virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
+{
+    virCHMonitorThreadInfo *info = NULL;
+    g_autofree pid_t *tids = NULL;
+    virDomainObj *vm = mon->vm;
+    size_t ntids = 0;
+    size_t i;
+
+
+    virCHMonitorThreadInfoFree(mon);
+    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0) {
+        mon->threads = NULL;
+        return 0;
+    }
+
+    info = g_new0(virCHMonitorThreadInfo, ntids);
+    for (i = 0; i < ntids; i++) {
+        /* Fill the next free slot rather than slot i: skipped tasks must
+         * not leave holes, because consumers only walk the first
+         * mon->nthreads entries. */
+        virCHMonitorThreadInfo *cur = &info[mon->nthreads];
+        g_autofree char *proc = NULL;
+        g_autofree char *data = NULL;
+
+        proc = g_strdup_printf("/proc/%d/task/%d/comm",
+                (int)vm->pid, (int)tids[i]);
+
+        if (virFileReadAll(proc, (1<<16), &data) < 0) {
+            continue;
+        }
+
+        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
+                (int)vm->pid, (int)tids[i], data);
+        if (STRPREFIX(data, "vcpu")) {
+            int cpuid;
+            char *tmp;
+            if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
+                VIR_WARN("Index is not specified correctly");
+                continue;
+            }
+            cur->type = virCHThreadTypeVcpu;
+            cur->vcpuInfo.tid = tids[i];
+            cur->vcpuInfo.online = true;
+            cur->vcpuInfo.cpuid = cpuid;
+            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, tids[i]);
+        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
+                   STRPREFIX(data, "_rng")) {
+            /* Prefixes used by cloud-hypervisor for IO Threads are captured at
+             * https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs */
+            cur->type = virCHThreadTypeIO;
+            cur->ioInfo.tid = tids[i];
+            virStrcpy(cur->ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
+        } else {
+            cur->type = virCHThreadTypeEmulator;
+            cur->emuInfo.tid = tids[i];
+            virStrcpy(cur->emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
+        }
+        mon->nthreads++;
+
+    }
+    mon->threads = info;
+
+    return mon->nthreads;
+}
+
+/**
+ * virCHMonitorGetThreadInfo:
+ * @mon: Pointer to the monitor
+ * @refresh: Refresh thread info or not
+ * @threads: filled with the monitor's thread info array
+ *
+ * Retrieve thread info and store to @threads. When @refresh is false the
+ * list cached by the last refresh is handed out. The array stays owned by
+ * @mon; callers must not free it.
+ *
+ * Returns the number of entries in @threads.
+ */
+size_t
+virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
+                          virCHMonitorThreadInfo **threads)
+{
+    if (refresh)
+        virCHMonitorRefreshThreadInfo(mon);
+
+    *threads = mon->threads;
+
+    /* Report the cached count as well, so the array is usable without
+     * forcing a refresh. */
+    return mon->nthreads;
+}
+
 int
 virCHMonitorShutdownVMM(virCHMonitor *mon)
 {
@@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon)
 {
     return virCHMonitorPutNoContent(mon, URL_VM_RESUME);
 }
-
-/**
- * virCHMonitorGetInfo:
- * @mon: Pointer to the monitor
- * @info: Get VM info
- *
- * Retrieve the VM info and store in @info
- *
- * Returns 0 on success.
- */
-int
-virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
-{
-    return virCHMonitorGet(mon, URL_VM_INFO, info);
-}
diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h
index 8ca9e17a9a..f8c3fa75e8 100644
--- a/src/ch/ch_monitor.h
+++ b/src/ch/ch_monitor.h
@@ -37,6 +37,50 @@
 #define URL_VM_RESUME "vm.resume"
 #define URL_VM_INFO "vm.info"
 
+#define VIRCH_THREAD_NAME_LEN   16 /* size in bytes of the thrName buffers below */
+
+typedef enum { /* classification of a VM process thread, derived from its name */
+    virCHThreadTypeEmulator, /* any thread that is neither a vCPU nor an I/O thread */
+    virCHThreadTypeVcpu, /* thread whose name starts with "vcpu" */
+    virCHThreadTypeIO, /* thread whose name starts with "_disk", "_net" or "_rng" */
+    virCHThreadTypeMax
+} virCHThreadType;
+
+typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
+
+struct _virCHMonitorCPUInfo {
+    int cpuid; /* index parsed from the text following "vcpu" in the thread name */
+    pid_t tid; /* thread id of the vCPU thread */
+
+    bool online; /* set to true for every vCPU thread that was found */
+};
+
+typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
+
+struct _virCHMonitorEmuThreadInfo {
+    char    thrName[VIRCH_THREAD_NAME_LEN]; /* thread name copied from its comm file */
+    pid_t   tid; /* thread id */
+};
+
+typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
+
+struct _virCHMonitorIOThreadInfo {
+    char    thrName[VIRCH_THREAD_NAME_LEN]; /* thread name copied from its comm file */
+    pid_t   tid; /* thread id */
+};
+
+typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
+
+struct _virCHMonitorThreadInfo {
+    virCHThreadType type; /* selects which union member below is valid */
+
+    union {
+        virCHMonitorCPUInfo vcpuInfo; /* valid for virCHThreadTypeVcpu */
+        virCHMonitorEmuThreadInfo emuInfo; /* valid for virCHThreadTypeEmulator */
+        virCHMonitorIOThreadInfo ioInfo; /* valid for virCHThreadTypeIO */
+    };
+};
+
 typedef struct _virCHMonitor virCHMonitor;
 
 struct _virCHMonitor {
@@ -49,6 +93,9 @@ struct _virCHMonitor {
     pid_t pid;
 
     virDomainObj *vm;
+
+    size_t nthreads;
+    virCHMonitorThreadInfo *threads;
 };
 
 virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
@@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
 int virCHMonitorResumeVM(virCHMonitor *mon);
 int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
 
-typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
-struct _virCHMonitorCPUInfo {
-    pid_t tid;
-    bool online;
-};
 void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
 int virCHMonitorGetCPUInfo(virCHMonitor *mon,
                        virCHMonitorCPUInfo **vcpus,
                        size_t maxvcpus);
+size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
+                                 virCHMonitorThreadInfo **threads);
diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c
index 3b7f6fcddf..8dce737adb 100644
--- a/src/ch/ch_process.c
+++ b/src/ch/ch_process.c
@@ -26,6 +26,8 @@
 #include "ch_domain.h"
 #include "ch_monitor.h"
 #include "ch_process.h"
+#include "ch_cgroup.h"
+#include "virnuma.h"
 #include "viralloc.h"
 #include "virerror.h"
 #include "virjson.h"
@@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm)
     return 0;
 }
 
+/**
+ * virCHProcessGetAllCpuAffinity:
+ * @cpumapRet: filled with the bitmap of online host CPUs
+ *
+ * Fetches the online host CPU bitmap. On hosts that expose no CPU bitmap
+ * the call succeeds and @cpumapRet is set to NULL.
+ *
+ * Returns 0 on success, -1 if the online bitmap could not be obtained.
+ */
+static int
+virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
+{
+    virBitmap *online = NULL;
+
+    *cpumapRet = NULL;
+
+    if (virHostCPUHasBitmap()) {
+        online = virHostCPUGetOnlineBitmap();
+        if (!online)
+            return -1;
+    }
+
+    *cpumapRet = online;
+    return 0;
+}
+
+#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
+/**
+ * virCHProcessInitCpuAffinity:
+ * @vm: domain object
+ *
+ * Sets the initial CPU affinity of the VM process. The CPU set is chosen
+ * from, in order: the CPUs of the strict NUMA memory nodeset (only for
+ * guests with at most one NUMA node), the emulator pinning from the domain
+ * definition, or all online host CPUs. If no CPU set can be determined the
+ * affinity is left untouched.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessInitCpuAffinity(virDomainObj *vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    g_autoptr(virBitmap) affinity = NULL;
+    virDomainNumatuneMemMode numaMode;
+    bool strictPlacement;
+
+    if (!vm->pid) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("Cannot setup CPU affinity until process is started"));
+        return -1;
+    }
+
+    strictPlacement =
+        virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
+        virDomainNumatuneGetMode(vm->def->numa, -1, &numaMode) == 0 &&
+        numaMode == VIR_DOMAIN_NUMATUNE_MEM_STRICT;
+
+    if (strictPlacement) {
+        virBitmap *memNodes = NULL;
+
+        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
+                                             priv->autoNodeset,
+                                             &memNodes,
+                                             -1) < 0)
+            return -1;
+
+        if (virNumaNodesetToCPUset(memNodes, &affinity) < 0)
+            return -1;
+    } else if (vm->def->cputune.emulatorpin) {
+        affinity = virBitmapNewCopy(vm->def->cputune.emulatorpin);
+        if (!affinity)
+            return -1;
+    } else if (virCHProcessGetAllCpuAffinity(&affinity) < 0) {
+        return -1;
+    }
+
+    if (!affinity)
+        return 0;
+
+    if (virProcessSetAffinity(vm->pid, affinity, false) < 0)
+        return -1;
+
+    return 0;
+}
+#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
+/* Without an affinity API there is nothing to set up. */
+static int
+virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
+{
+    return 0;
+}
+#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
+
+/**
+ * virCHProcessSetupPid:
+ * @vm: domain object
+ * @pid: thread or process id to configure
+ * @nameval: kind of per-thread cgroup to create for @pid
+ * @id: index of the per-thread cgroup
+ * @cpumask: CPUs to pin @pid to, or NULL to derive them from the domain
+ *           (automatic placement set, then the domain cpumask, then all
+ *           online host CPUs)
+ * @period: CPU bandwidth period, 0 for none
+ * @quota: CPU bandwidth quota, 0 for none
+ * @sched: scheduler policy and priority to apply, or NULL; never applied
+ *         when @nameval is VIR_CGROUP_THREAD_EMULATOR
+ *
+ * This function sets resource properties (affinity, cgroups,
+ * scheduler) for any PID associated with a domain.  It should be used
+ * to set up emulator PIDs as well as vCPU and I/O thread pids to
+ * ensure they are all handled the same way.
+ *
+ * A non-zero @period or @quota requires the cpu cgroup controller. If any
+ * step fails after the per-thread cgroup was created, that cgroup is
+ * removed again.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessSetupPid(virDomainObj *vm,
+                     pid_t pid,
+                     virCgroupThreadName nameval,
+                     int id,
+                     virBitmap *cpumask,
+                     unsigned long long period,
+                     long long quota,
+                     virDomainThreadSchedParam *sched)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    virDomainNumatuneMemMode mem_mode;
+    virCgroup *cgroup = NULL;
+    virBitmap *use_cpumask = NULL; /* borrowed; used for the cpuset cgroup and, by default, for affinity */
+    virBitmap *affinity_cpumask = NULL; /* borrowed; mask handed to virProcessSetAffinity() */
+    g_autoptr(virBitmap) hostcpumap = NULL;
+    g_autofree char *mem_mask = NULL;
+    int ret = -1;
+
+    if ((period || quota) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        goto cleanup;
+    }
+
+    /* Infer which cpumask shall be used. */
+    if (cpumask) {
+        use_cpumask = cpumask;
+    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
+        use_cpumask = priv->autoCpuset;
+    } else if (vm->def->cpumask) {
+        use_cpumask = vm->def->cpumask;
+    } else {
+        /* we can't assume cloud-hypervisor itself is running on all pCPUs,
+         * so we need to explicitly set the spawned instance to all pCPUs. */
+        if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
+            goto cleanup;
+        affinity_cpumask = hostcpumap; /* stays NULL when the host exposes no CPU bitmap */
+    }
+
+    /*
+     * If CPU cgroup controller is not initialized here, then we need
+     * neither period nor quota settings.  And if CPUSET controller is
+     * not initialized either, then there's nothing to do anyway.
+     */
+    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
+        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
+
+        if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
+            mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
+            virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
+                                                priv->autoNodeset,
+                                                &mem_mask, -1) < 0)
+            goto cleanup;
+
+        if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0) /* NOTE(review): 'true' presumably requests creation of the thread cgroup - confirm */
+            goto cleanup;
+
+        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
+            if (use_cpumask &&
+                chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
+                goto cleanup;
+
+            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) /* mem_mask is only set for strict NUMA memory mode */
+                goto cleanup;
+
+        }
+
+        if ((period || quota) &&
+            chSetupCgroupVcpuBW(cgroup, period, quota) < 0)
+            goto cleanup;
+
+        /* Move the thread to the sub dir */
+        VIR_INFO("Adding pid %d to cgroup", pid);
+        if (virCgroupAddThread(cgroup, pid) < 0)
+            goto cleanup;
+
+    }
+
+    if (!affinity_cpumask)
+        affinity_cpumask = use_cpumask;
+
+    /* Setup legacy affinity. */
+    if (affinity_cpumask && virProcessSetAffinity(pid, affinity_cpumask, false) < 0)
+        goto cleanup;
+
+    /* Set scheduler type and priority, but not for the main thread. */
+    if (sched &&
+        nameval != VIR_CGROUP_THREAD_EMULATOR &&
+        virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
+        goto cleanup;
+
+    ret = 0;
+ cleanup:
+    if (cgroup) {
+        if (ret < 0)
+            virCgroupRemove(cgroup); /* undo the thread cgroup on failure */
+        virCgroupFree(cgroup);
+    }
+
+    return ret;
+}
+
+/**
+ * virCHProcessSetupVcpu:
+ * @vm: domain object
+ * @vcpuid: id of VCPU to set defaults
+ *
+ * Configures cgroups, affinity and scheduler for a single vCPU thread using
+ * the vCPU's own cpumask and scheduler settings together with the domain's
+ * cputune period and quota. The vCPU is expected to be online and its
+ * thread id to have been detected already.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+virCHProcessSetupVcpu(virDomainObj *vm,
+                      unsigned int vcpuid)
+{
+    virDomainVcpuDef *vcpuDef = virDomainDefGetVcpu(vm->def, vcpuid);
+    pid_t tid = virCHDomainGetVcpuPid(vm, vcpuid);
+    unsigned long long period = vm->def->cputune.period;
+    long long quota = vm->def->cputune.quota;
+
+    return virCHProcessSetupPid(vm, tid, VIR_CGROUP_THREAD_VCPU,
+                                vcpuid, vcpuDef->cpumask,
+                                period, quota,
+                                &vcpuDef->sched);
+}
+
+/**
+ * virCHProcessSetupVcpus:
+ * @vm: domain object
+ *
+ * Applies per-vCPU tuning to every online vCPU of @vm. When the vCPU thread
+ * ids are unknown no tuning can be applied; in that case the function only
+ * verifies that no online vCPU asks for an affinity different from the
+ * domain default.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+virCHProcessSetupVcpus(virDomainObj *vm)
+{
+    virCHDomainObjPrivate *priv = vm->privateData;
+    unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
+    bool havePids;
+    size_t i;
+
+    if ((vm->def->cputune.period || vm->def->cputune.quota) &&
+        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("cgroup cpu is required for scheduler tuning"));
+        return -1;
+    }
+
+    havePids = virCHDomainHasVcpuPids(vm);
+
+    for (i = 0; i < maxvcpus; i++) {
+        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
+
+        if (!vcpu->online)
+            continue;
+
+        if (havePids) {
+            if (virCHProcessSetupVcpu(vm, i) < 0)
+                return -1;
+            continue;
+        }
+
+        /* If any CPU has custom affinity that differs from the
+         * VM default affinity, we must reject it */
+        if (vcpu->cpumask &&
+            !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
+            virReportError(VIR_ERR_OPERATION_INVALID, "%s",
+                            _("cpu affinity is not supported"));
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 /**
  * virCHProcessStart:
  * @driver: pointer to driver structure
@@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver,
         }
     }
 
+    vm->pid = priv->monitor->pid;
+    vm->def->id = vm->pid;
+    priv->machineName = virCHDomainGetMachineName(vm);
+
+    if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0)
+        goto cleanup;
+
+    if (virCHProcessInitCpuAffinity(vm) < 0)
+        goto cleanup;
+
     if (virCHMonitorBootVM(priv->monitor) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                        _("failed to boot guest VM"));
         goto cleanup;
     }
 
-    priv->machineName = virCHDomainGetMachineName(vm);
-    vm->pid = priv->monitor->pid;
-    vm->def->id = vm->pid;
+    virCHDomainRefreshThreadInfo(vm);
 
-    virCHProcessUpdateInfo(vm);
+    VIR_DEBUG("Setting global CPU cgroup (if required)");
+    if (chSetupGlobalCpuCgroup(vm) < 0)
+        goto cleanup;
+
+    VIR_DEBUG("Setting vCPU tuning/settings");
+    if (virCHProcessSetupVcpus(vm) < 0)
+        goto cleanup;
 
+    virCHProcessUpdateInfo(vm);
     virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
 
     return 0;
@@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
                      virDomainObj *vm,
                      virDomainShutoffReason reason)
 {
+    int ret;
+    int retries = 0;
     virCHDomainObjPrivate *priv = vm->privateData;
 
     VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
@@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
         priv->monitor = NULL;
     }
 
+    retry:
+        if ((ret = chRemoveCgroup(vm)) < 0) {
+            if (ret == -EBUSY && (retries++ < 5)) {
+                g_usleep(200*1000);
+                goto retry;
+            }
+            VIR_WARN("Failed to remove cgroup for %s",
+                    vm->def->name);
+        }
+
     vm->pid = -1;
     vm->def->id = -1;
 
diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h
index abc4915979..800e3f4e23 100644
--- a/src/ch/ch_process.h
+++ b/src/ch/ch_process.h
@@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
 int virCHProcessStop(virCHDriver *driver,
                      virDomainObj *vm,
                      virDomainShutoffReason reason);
+
+int virCHProcessSetupVcpu(virDomainObj *vm,
+                          unsigned int vcpuid);
diff --git a/src/ch/meson.build b/src/ch/meson.build
index 2b2bdda26c..0b20de56fd 100644
--- a/src/ch/meson.build
+++ b/src/ch/meson.build
@@ -1,6 +1,8 @@
 ch_driver_sources = [
   'ch_conf.c',
   'ch_conf.h',
+  'ch_cgroup.c',
+  'ch_cgroup.h',
   'ch_domain.c',
   'ch_domain.h',
   'ch_driver.c',
-- 
2.27.0


Re: [libvirt PATCH 08/13] ch_cgroup: methods for cgroup mgmt in ch driver
Posted by Daniel Henrique Barboza 4 years, 2 months ago

On 10/21/21 16:31, Vineeth Pillai wrote:
> Signed-off-by: Vineeth Pillai <viremana@linux.microsoft.com>
> Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
> ---
>   po/POTFILES.in      |   1 +
>   src/ch/ch_cgroup.c  | 457 ++++++++++++++++++++++++++++++++++++++++++++
>   src/ch/ch_cgroup.h  |  45 +++++
>   src/ch/ch_conf.c    |   2 +
>   src/ch/ch_conf.h    |   4 +-
>   src/ch/ch_domain.c  |  33 ++++
>   src/ch/ch_domain.h  |   3 +-
>   src/ch/ch_monitor.c | 125 ++++++++++--
>   src/ch/ch_monitor.h |  54 +++++-
>   src/ch/ch_process.c | 288 +++++++++++++++++++++++++++-
>   src/ch/ch_process.h |   3 +
>   src/ch/meson.build  |   2 +
>   12 files changed, 991 insertions(+), 26 deletions(-)
>   create mode 100644 src/ch/ch_cgroup.c
>   create mode 100644 src/ch/ch_cgroup.h
> 
> diff --git a/po/POTFILES.in b/po/POTFILES.in
> index b554cf08ca..3a8db501bc 100644
> --- a/po/POTFILES.in
> +++ b/po/POTFILES.in
> @@ -19,6 +19,7 @@
>   @SRCDIR@src/bhyve/bhyve_parse_command.c
>   @SRCDIR@src/bhyve/bhyve_process.c
>   @SRCDIR@src/ch/ch_conf.c
> +@SRCDIR@src/ch/ch_cgroup.c
>   @SRCDIR@src/ch/ch_domain.c
>   @SRCDIR@src/ch/ch_driver.c
>   @SRCDIR@src/ch/ch_monitor.c
> diff --git a/src/ch/ch_cgroup.c b/src/ch/ch_cgroup.c
> new file mode 100644
> index 0000000000..6be2184cf1
> --- /dev/null
> +++ b/src/ch/ch_cgroup.c
> @@ -0,0 +1,457 @@
> +/*
> + * ch_cgroup.c: CH cgroup management
> + *
> + * Copyright Microsoft Corp. 2020-2021
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <config.h>
> +
> +#include "ch_cgroup.h"
> +#include "ch_domain.h"
> +#include "ch_process.h"
> +#include "vircgroup.h"
> +#include "virlog.h"
> +#include "viralloc.h"
> +#include "virerror.h"
> +#include "domain_audit.h"
> +#include "domain_cgroup.h"
> +#include "virscsi.h"
> +#include "virstring.h"
> +#include "virfile.h"
> +#include "virtypedparam.h"
> +#include "virnuma.h"
> +#include "virdevmapper.h"
> +#include "virutil.h"
> +
> +#define VIR_FROM_THIS VIR_FROM_CH
> +
> +VIR_LOG_INIT("ch.ch_cgroup");
> +
> +static int
> +chSetupBlkioCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) {
> +        if (vm->def->blkio.weight || vm->def->blkio.ndevices) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("Block I/O tuning is not available on this host"));
> +            return -1;
> +        } else {
> +            return 0;
> +        }
> +    }
> +
> +    return virDomainCgroupSetupBlkio(priv->cgroup, vm->def->blkio);
> +}
> +
> +
> +static int
> +chSetupMemoryCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_MEMORY)) {
> +        if (virMemoryLimitIsSet(vm->def->mem.hard_limit) ||
> +            virMemoryLimitIsSet(vm->def->mem.soft_limit) ||
> +            virMemoryLimitIsSet(vm->def->mem.swap_hard_limit)) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("Memory cgroup is not available on this host"));
> +            return -1;
> +        } else {
> +            return 0;
> +        }
> +    }
> +
> +    return virDomainCgroupSetupMemtune(priv->cgroup, vm->def->mem);
> +}
> +
> +static int
> +chSetupCpusetCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return 0;
> +
> +    if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0)
> +        return -1;
> +
> +    return 0;
> +}
> +
> +
> +static int
> +chSetupCpuCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> +        if (vm->def->cputune.sharesSpecified) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("CPU tuning is not available on this host"));
> +            return -1;
> +        } else {
> +            return 0;
> +        }
> +    }
> +
> +    if (vm->def->cputune.sharesSpecified) {
> +
> +        if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0)
> +            return -1;
> +
> +    }
> +
> +    return 0;
> +}
> +
> +
> +static int
> +chInitCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
> +
> +    if (!priv->driver->privileged)
> +        return 0;
> +
> +    if (!virCgroupAvailable())
> +        return 0;
> +
> +    virCgroupFree(priv->cgroup);
> +
> +    if (!vm->def->resource) {
> +        virDomainResourceDef *res;
> +
> +        res = g_new0(virDomainResourceDef, 1);
> +
> +        res->partition = g_strdup("/machine");
> +
> +        vm->def->resource = res;
> +    }
> +
> +    if (vm->def->resource->partition[0] != '/') {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
> +                       _("Resource partition '%s' must start with '/'"),
> +                       vm->def->resource->partition);
> +        return -1;
> +    }
> +
> +    if (virCgroupNewMachine(priv->machineName, "ch", vm->def->uuid, NULL, vm->pid, false, nnicindexes, nicindexes, vm->def->resource->partition, cfg->cgroupControllers, 0,     /* maxThreadsPerProc */


Break this big boy so that each line is at most 100 chars, please.


> +                            &priv->cgroup) < 0) {
> +        if (virCgroupNewIgnoreError())
> +            return 0;
> +
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static void
> +chRestoreCgroupState(virDomainObj * vm)
> +{
> +    g_autofree char *mem_mask = NULL;
> +    g_autofree char *nodeset = NULL;
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +    size_t i = 0;
> +
> +    g_autoptr(virBitmap) all_nodes = NULL;
> +    virCgroup *cgroup_temp = NULL;
> +
> +    if (!virNumaIsAvailable() ||
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return;
> +
> +    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
> +        goto error;
> +
> +    if (!(mem_mask = virBitmapFormat(all_nodes)))
> +        goto error;
> +
> +    if ((virCgroupHasEmptyTasks(priv->cgroup,
> +                                VIR_CGROUP_CONTROLLER_CPUSET)) <= 0)
> +        goto error;
> +
> +    if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0)
> +        goto error;
> +
> +    for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) {
> +        virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> +        if (!vcpu->online)
> +            continue;
> +
> +        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i,
> +                               false, &cgroup_temp) < 0 ||
> +            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> +            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> +            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> +            goto cleanup;
> +
> +        g_free(nodeset);
> +        virCgroupFree(cgroup_temp);
> +    }
> +
> +    for (i = 0; i < vm->def->niothreadids; i++) {
> +        if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD,
> +                               vm->def->iothreadids[i]->iothread_id,
> +                               false, &cgroup_temp) < 0 ||
> +            virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> +            virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> +            virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> +            goto cleanup;
> +
> +        g_free(nodeset);
> +        virCgroupFree(cgroup_temp);
> +    }
> +
> +    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
> +                           false, &cgroup_temp) < 0 ||
> +        virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 ||
> +        virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 ||
> +        virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0)
> +        goto cleanup;
> +
> +    cleanup:
> +        virCgroupFree(cgroup_temp);
> +        return;

Up there in the cgroup_temp declaration you can use

g_autoptr(virCgroup) cgroup_temp = NULL;

And then you won't need a 'cleanup' label.

> +
> +    error:

Wrong label indentation.


> +        virResetLastError();
> +        VIR_DEBUG("Couldn't restore cgroups to meaningful state");
> +        goto cleanup;
> +}
> +
> +int
> +chConnectCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    g_autoptr(virCHDriverConfig) cfg = virCHDriverGetConfig(priv->driver);
> +
> +    if (!priv->driver->privileged)
> +        return 0;
> +
> +    if (!virCgroupAvailable())
> +        return 0;
> +
> +    virCgroupFree(priv->cgroup);
> +
> +    if (virCgroupNewDetectMachine(vm->def->name,
> +                                  "ch",
> +                                  vm->pid,
> +                                  cfg->cgroupControllers,
> +                                  priv->machineName, &priv->cgroup) < 0)
> +        return -1;
> +
> +    chRestoreCgroupState(vm);
> +    return 0;
> +}
> +
> +int
> +chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!vm->pid) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("Cannot setup cgroups until process is started"));
> +        return -1;
> +    }
> +
> +    if (chInitCgroup(vm, nnicindexes, nicindexes) < 0)
> +        return -1;
> +
> +    if (!priv->cgroup)
> +        return 0;
> +
> +    if (chSetupBlkioCgroup(vm) < 0)
> +        return -1;
> +
> +    if (chSetupMemoryCgroup(vm) < 0)
> +        return -1;
> +
> +    if (chSetupCpuCgroup(vm) < 0)
> +        return -1;
> +
> +    if (chSetupCpusetCgroup(vm) < 0)
> +        return -1;
> +
> +    return 0;
> +}
> +
> +int
> +chSetupCgroupVcpuBW(virCgroup * cgroup,
> +                    unsigned long long period, long long quota)
> +{
> +    return virCgroupSetupCpuPeriodQuota(cgroup, period, quota);
> +}
> +
> +
> +int
> +chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask)
> +{
> +    return virCgroupSetupCpusetCpus(cgroup, cpumask);
> +}
> +
> +int
> +chSetupGlobalCpuCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +    unsigned long long period = vm->def->cputune.global_period;
> +    long long quota = vm->def->cputune.global_quota;
> +    g_autofree char *mem_mask = NULL;
> +    virDomainNumatuneMemMode mem_mode;
> +
> +    if ((period || quota) &&
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                       _("cgroup cpu is required for scheduler tuning"));
> +        return -1;
> +    }
> +
> +    /*
> +     * If CPU cgroup controller is not initialized here, then we need
> +     * neither period nor quota settings.  And if CPUSET controller is
> +     * not initialized either, then there's nothing to do anyway.
> +     */
> +    if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) &&
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return 0;
> +
> +
> +    if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
> +        virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
> +                                            priv->autoNodeset,
> +                                            &mem_mask, -1) < 0)
> +        return -1;
> +
> +    if (period || quota) {
> +        if (chSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0)
> +            return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +
> +int
> +chRemoveCgroup(virDomainObj * vm)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (priv->cgroup == NULL)
> +        return 0;               /* Not supported, so claim success */
> +
> +    if (virCgroupTerminateMachine(priv->machineName) < 0) {
> +        if (!virCgroupNewIgnoreError())
> +            VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
> +    }
> +
> +    return virCgroupRemove(priv->cgroup);
> +}
> +
> +
> +static void
> +chCgroupEmulatorAllNodesDataFree(chCgroupEmulatorAllNodesData * data)
> +{
> +    if (!data)
> +        return;
> +
> +    virCgroupFree(data->emulatorCgroup);
> +    g_free(data->emulatorMemMask);
> +    g_free(data);
> +}
> +
> +
> +/**
> + * chCgroupEmulatorAllNodesAllow:
> + * @cgroup: domain cgroup pointer
> + * @retData: filled with structure used to roll back the operation
> + *
> + * Allows all NUMA nodes for the cloud hypervisor thread temporarily. This is
> + * necessary when hotplugging cpus since it requires memory allocated in the
> + * DMA region. Afterwards the operation can be reverted by
> + * chCgroupEmulatorAllNodesRestore.
> + *
> + * Returns 0 on success -1 on error
> + */
> +int
> +chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
> +                              chCgroupEmulatorAllNodesData ** retData)
> +{
> +    chCgroupEmulatorAllNodesData *data = NULL;
> +    g_autofree char *all_nodes_str = NULL;
> +

Extra line in the middle of var declarations.

> +    g_autoptr(virBitmap) all_nodes = NULL;
> +    int ret = -1;
> +
> +    if (!virNumaIsAvailable() ||
> +        !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET))
> +        return 0;
> +
> +    if (!(all_nodes = virNumaGetHostMemoryNodeset()))
> +        goto cleanup;
> +
> +    if (!(all_nodes_str = virBitmapFormat(all_nodes)))
> +        goto cleanup;
> +
> +    data = g_new0(chCgroupEmulatorAllNodesData, 1);
> +
> +    if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
> +                           false, &data->emulatorCgroup) < 0)
> +        goto cleanup;
> +
> +    if (virCgroupGetCpusetMems(data->emulatorCgroup, &data->emulatorMemMask) < 0
> +        || virCgroupSetCpusetMems(data->emulatorCgroup, all_nodes_str) < 0)
> +        goto cleanup;
> +
> +    *retData = g_steal_pointer(&data);
> +    ret = 0;
> +
> +    cleanup:

Wrong label indentation.

> +        chCgroupEmulatorAllNodesDataFree(data);
> +
> +    return ret;
> +}
> +
> +
> +/**
> + * chCgroupEmulatorAllNodesRestore:
> + * @data: data structure created by chCgroupEmulatorAllNodesAllow
> + *
> + * Rolls back the setting done by chCgroupEmulatorAllNodesAllow and frees the
> + * associated data.
> + */
> +void
> +chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data)
> +{
> +    virError *err;
> +
> +    if (!data)
> +        return;
> +
> +    virErrorPreserveLast(&err);
> +    virCgroupSetCpusetMems(data->emulatorCgroup, data->emulatorMemMask);
> +    virErrorRestore(&err);
> +
> +    chCgroupEmulatorAllNodesDataFree(data);
> +}
> diff --git a/src/ch/ch_cgroup.h b/src/ch/ch_cgroup.h
> new file mode 100644
> index 0000000000..0152b5477c
> --- /dev/null
> +++ b/src/ch/ch_cgroup.h
> @@ -0,0 +1,45 @@
> +/*
> + * ch_cgroup.h: CH cgroup management
> + *
> + * Copyright Microsoft Corp. 2020-2021
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + */
> +
> +#pragma once
> +
> +#include "virusb.h"
> +#include "vircgroup.h"
> +#include "domain_conf.h"
> +#include "ch_conf.h"
> +
> +int chConnectCgroup(virDomainObj * vm);
> +int chSetupCgroup(virDomainObj * vm, size_t nnicindexes, int *nicindexes);
> +int chSetupCgroupVcpuBW(virCgroup * cgroup,
> +                        unsigned long long period, long long quota);
> +int chSetupCgroupCpusetCpus(virCgroup * cgroup, virBitmap * cpumask);
> +int chSetupGlobalCpuCgroup(virDomainObj * vm);
> +int chRemoveCgroup(virDomainObj * vm);
> +
> +typedef struct _chCgroupEmulatorAllNodesData chCgroupEmulatorAllNodesData;
> +
> +struct _chCgroupEmulatorAllNodesData {
> +    virCgroup *emulatorCgroup;
> +    char *emulatorMemMask;
> +};
> +
> +int chCgroupEmulatorAllNodesAllow(virCgroup * cgroup,
> +                                  chCgroupEmulatorAllNodesData ** data);
> +void chCgroupEmulatorAllNodesRestore(chCgroupEmulatorAllNodesData * data);
> diff --git a/src/ch/ch_conf.c b/src/ch/ch_conf.c
> index ed0fffe5d6..7f70452296 100644
> --- a/src/ch/ch_conf.c
> +++ b/src/ch/ch_conf.c
> @@ -141,6 +141,8 @@ virCHDriverConfigNew(bool privileged)
>       if (!(cfg = virObjectNew(virCHDriverConfigClass)))
>           return NULL;
>   
> +    cfg->cgroupControllers = -1; /* Auto detect */
> +
>       if (privileged) {
>           if (virGetUserID(CH_USER, &cfg->user) < 0)
>               return NULL;
> diff --git a/src/ch/ch_conf.h b/src/ch/ch_conf.h
> index 49f286f97a..19deb8e568 100644
> --- a/src/ch/ch_conf.h
> +++ b/src/ch/ch_conf.h
> @@ -35,11 +35,13 @@ struct _virCHDriverConfig {
>   
>       char *stateDir;
>       char *logDir;
> -
> +    int cgroupControllers;
>       uid_t user;
>       gid_t group;
>   };
>   
> +G_DEFINE_AUTOPTR_CLEANUP_FUNC(virCHDriverConfig, virObjectUnref);
> +
>   struct _virCHDriver
>   {
>       virMutex lock;
> diff --git a/src/ch/ch_domain.c b/src/ch/ch_domain.c
> index e1030800aa..d0aaeed1f4 100644
> --- a/src/ch/ch_domain.c
> +++ b/src/ch/ch_domain.c
> @@ -326,6 +326,39 @@ chValidateDomainDeviceDef(const virDomainDeviceDef *dev,
>                          _("Serial can only be enabled for a PTY"));
>           return -1;
>       }
> +    return 0;
> +}
> +int
> +virCHDomainRefreshThreadInfo(virDomainObj *vm)
> +{
> +    size_t maxvcpus = virDomainDefGetVcpusMax(vm->def);
> +    virCHMonitorThreadInfo *info = NULL;
> +    size_t nthreads, ncpus = 0;
> +    size_t i;
> +
> +    nthreads = virCHMonitorGetThreadInfo(virCHDomainGetMonitor(vm),
> +                                         true, &info);
> +
> +    for (i = 0; i < nthreads; i++) {
> +        virCHDomainVcpuPrivate *vcpupriv;
> +        virDomainVcpuDef *vcpu;
> +        virCHMonitorCPUInfo *vcpuInfo;
> +
> +        if (info[i].type != virCHThreadTypeVcpu)
> +            continue;
> +
> +        // TODO: hotplug support
> +        vcpuInfo = &info[i].vcpuInfo;
> +        vcpu = virDomainDefGetVcpu(vm->def, vcpuInfo->cpuid);
> +        vcpupriv = CH_DOMAIN_VCPU_PRIVATE(vcpu);
> +        vcpupriv->tid = vcpuInfo->tid;
> +        ncpus++;
> +    }
> +
> +    // TODO: Remove the warning when hotplug is implemented.
> +    if (ncpus != maxvcpus)
> +        VIR_WARN("Mismatch in the number of cpus, expected: %ld, actual: %ld",
> +                 maxvcpus, ncpus);
>   
>       return 0;
>   }
> diff --git a/src/ch/ch_domain.h b/src/ch/ch_domain.h
> index 3ac3421015..2ce3e2cef3 100644
> --- a/src/ch/ch_domain.h
> +++ b/src/ch/ch_domain.h
> @@ -89,7 +89,8 @@ virCHDomainObjBeginJob(virDomainObj *obj, enum virCHDomainJob job)
>   void
>   virCHDomainObjEndJob(virDomainObj *obj);
>   
> -int virCHDomainRefreshVcpuInfo(virDomainObj *vm);
> +int virCHDomainRefreshThreadInfo(virDomainObj *vm);
> +
>   pid_t virCHDomainGetVcpuPid(virDomainObj *vm, unsigned int vcpuid);
>   bool virCHDomainHasVcpuPids(virDomainObj *vm);
>   
> diff --git a/src/ch/ch_monitor.c b/src/ch/ch_monitor.c
> index c0ae031200..095779cb3f 100644
> --- a/src/ch/ch_monitor.c
> +++ b/src/ch/ch_monitor.c
> @@ -41,6 +41,7 @@ VIR_LOG_INIT("ch.ch_monitor");
>   
>   static virClass *virCHMonitorClass;
>   static void virCHMonitorDispose(void *obj);
> +static void virCHMonitorThreadInfoFree(virCHMonitor *mon);
>   
>   static int virCHMonitorOnceInit(void)
>   {
> @@ -571,6 +572,7 @@ static void virCHMonitorDispose(void *opaque)
>       virCHMonitor *mon = opaque;
>   
>       VIR_DEBUG("mon=%p", mon);
> +    virCHMonitorThreadInfoFree(mon);
>       virObjectUnref(mon->vm);
>   }
>   
> @@ -736,6 +738,114 @@ virCHMonitorGet(virCHMonitor *mon, const char *endpoint, virJSONValue **response
>       return ret;
>   }
>   
> +/**
> + * virCHMonitorGetInfo:
> + * @mon: Pointer to the monitor
> + * @info: Get VM info
> + *
> + * Retrieve the VM info and store in @info
> + *
> + * Returns 0 on success.
> + */
> +int
> +virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
> +{
> +    return virCHMonitorGet(mon, URL_VM_INFO, info);
> +}
> +
> +static void
> +virCHMonitorThreadInfoFree(virCHMonitor *mon)
> +{
> +    mon->nthreads = 0;
> +    if (mon->threads)
> +        VIR_FREE(mon->threads);
> +}
> +
> +static size_t
> +virCHMonitorRefreshThreadInfo(virCHMonitor *mon)
> +{
> +    virCHMonitorThreadInfo *info = NULL;
> +    g_autofree pid_t *tids = NULL;
> +    virDomainObj *vm = mon->vm;
> +    size_t ntids = 0;
> +    size_t i;
> +
> +
> +    virCHMonitorThreadInfoFree(mon);
> +    if (virProcessGetPids(vm->pid, &ntids, &tids) < 0) {
> +        mon->threads = NULL;
> +        return 0;
> +    }
> +
> +    info = g_new0(virCHMonitorThreadInfo, ntids);
> +    for (i = 0; i < ntids; i++) {
> +        g_autofree char *proc = NULL;
> +        g_autofree char *data = NULL;
> +
> +        proc = g_strdup_printf("/proc/%d/task/%d/comm",
> +                (int)vm->pid, (int)tids[i]);
> +
> +        if (virFileReadAll(proc, (1<<16), &data) < 0) {
> +            continue;
> +        }
> +
> +        VIR_DEBUG("VM PID: %d, TID %d, COMM: %s",
> +                (int)vm->pid, (int)tids[i], data);
> +        if (STRPREFIX(data, "vcpu")) {
> +            int cpuid;
> +            char *tmp;
> +            if (virStrToLong_i(data + 4, &tmp, 0, &cpuid) < 0) {
> +                VIR_WARN("Index is not specified correctly");
> +                continue;
> +            }
> +            info[i].type = virCHThreadTypeVcpu;
> +            info[i].vcpuInfo.tid = tids[i];
> +            info[i].vcpuInfo.online = true;
> +            info[i].vcpuInfo.cpuid = cpuid;
> +            VIR_DEBUG("vcpu%d -> tid: %d", cpuid, tids[i]);
> +        } else if (STRPREFIX(data, "_disk") || STRPREFIX(data, "_net") ||
> +                   STRPREFIX(data, "_rng")) {
> +        /* Prefixes used by cloud-hypervisor for IO Threads are captured at
> +        https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/device_manager.rs */
> +            info[i].type = virCHThreadTypeIO;
> +            info[i].ioInfo.tid = tids[i];
> +            virStrcpy(info[i].ioInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
> +        }else {
> +            info[i].type = virCHThreadTypeEmulator;
> +            info[i].emuInfo.tid = tids[i];
> +            virStrcpy(info[i].emuInfo.thrName, data, VIRCH_THREAD_NAME_LEN - 1);
> +        }
> +        mon->nthreads++;
> +
> +    }
> +    mon->threads = info;
> +
> +    return mon->nthreads;
> +}
> +
> +/**
> + * virCHMonitorGetThreadInfo:
> + * @mon: Pointer to the monitor
> + * @refresh: Refresh thread info or not
> + *
> + * Retrieve thread info and store to @threads
> + *
> + * Returns count of threads on success.
> + */
> +size_t
> +virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
> +                          virCHMonitorThreadInfo **threads)
> +{
> +    int nthreads = 0;
> +
> +    if (refresh)
> +        nthreads = virCHMonitorRefreshThreadInfo(mon);
> +
> +    *threads = mon->threads;
> +
> +    return nthreads;
> +}
> +
>   int
>   virCHMonitorShutdownVMM(virCHMonitor *mon)
>   {
> @@ -810,18 +920,3 @@ virCHMonitorResumeVM(virCHMonitor *mon)
>   {
>       return virCHMonitorPutNoContent(mon, URL_VM_RESUME);
>   }
> -
> -/**
> - * virCHMonitorGetInfo:
> - * @mon: Pointer to the monitor
> - * @info: Get VM info
> - *
> - * Retrieve the VM info and store in @info
> - *
> - * Returns 0 on success.
> - */
> -int
> -virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info)
> -{
> -    return virCHMonitorGet(mon, URL_VM_INFO, info);
> -}
> diff --git a/src/ch/ch_monitor.h b/src/ch/ch_monitor.h
> index 8ca9e17a9a..f8c3fa75e8 100644
> --- a/src/ch/ch_monitor.h
> +++ b/src/ch/ch_monitor.h
> @@ -37,6 +37,50 @@
>   #define URL_VM_RESUME "vm.resume"
>   #define URL_VM_INFO "vm.info"
>   
> +#define VIRCH_THREAD_NAME_LEN   16
> +
> +typedef enum {
> +    virCHThreadTypeEmulator,
> +    virCHThreadTypeVcpu,
> +    virCHThreadTypeIO,
> +    virCHThreadTypeMax
> +} virCHThreadType;
> +
> +typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
> +
> +struct _virCHMonitorCPUInfo {
> +    int cpuid;
> +    pid_t tid;
> +
> +    bool online;
> +};
> +
> +typedef struct _virCHMonitorEmuThreadInfo virCHMonitorEmuThreadInfo;
> +
> +struct _virCHMonitorEmuThreadInfo {
> +    char    thrName[VIRCH_THREAD_NAME_LEN];
> +    pid_t   tid;
> +};
> +
> +typedef struct _virCHMonitorIOThreadInfo virCHMonitorIOThreadInfo;
> +
> +struct _virCHMonitorIOThreadInfo {
> +    char    thrName[VIRCH_THREAD_NAME_LEN];
> +    pid_t   tid;
> +};
> +
> +typedef struct _virCHMonitorThreadInfo virCHMonitorThreadInfo;
> +
> +struct _virCHMonitorThreadInfo {
> +    virCHThreadType type;
> +
> +    union {
> +        virCHMonitorCPUInfo vcpuInfo;
> +        virCHMonitorEmuThreadInfo emuInfo;
> +        virCHMonitorIOThreadInfo ioInfo;
> +    };
> +};
> +
>   typedef struct _virCHMonitor virCHMonitor;
>   
>   struct _virCHMonitor {
> @@ -49,6 +93,9 @@ struct _virCHMonitor {
>       pid_t pid;
>   
>       virDomainObj *vm;
> +
> +    size_t nthreads;
> +    virCHMonitorThreadInfo *threads;
>   };
>   
>   virCHMonitor *virCHMonitorNew(virDomainObj *vm, const char *socketdir);
> @@ -65,12 +112,9 @@ int virCHMonitorSuspendVM(virCHMonitor *mon);
>   int virCHMonitorResumeVM(virCHMonitor *mon);
>   int virCHMonitorGetInfo(virCHMonitor *mon, virJSONValue **info);
>   
> -typedef struct _virCHMonitorCPUInfo virCHMonitorCPUInfo;
> -struct _virCHMonitorCPUInfo {
> -    pid_t tid;
> -    bool online;
> -};
>   void virCHMonitorCPUInfoFree(virCHMonitorCPUInfo *cpus);
>   int virCHMonitorGetCPUInfo(virCHMonitor *mon,
>                          virCHMonitorCPUInfo **vcpus,
>                          size_t maxvcpus);
> +size_t virCHMonitorGetThreadInfo(virCHMonitor *mon, bool refresh,
> +                                 virCHMonitorThreadInfo **threads);
> diff --git a/src/ch/ch_process.c b/src/ch/ch_process.c
> index 3b7f6fcddf..8dce737adb 100644
> --- a/src/ch/ch_process.c
> +++ b/src/ch/ch_process.c
> @@ -26,6 +26,8 @@
>   #include "ch_domain.h"
>   #include "ch_monitor.h"
>   #include "ch_process.h"
> +#include "ch_cgroup.h"
> +#include "virnuma.h"
>   #include "viralloc.h"
>   #include "virerror.h"
>   #include "virjson.h"
> @@ -133,6 +135,257 @@ virCHProcessUpdateInfo(virDomainObj *vm)
>       return 0;
>   }
>   
> +static int
> +virCHProcessGetAllCpuAffinity(virBitmap **cpumapRet)
> +{
> +    *cpumapRet = NULL;
> +
> +    if (!virHostCPUHasBitmap())
> +        return 0;
> +
> +    if (!(*cpumapRet = virHostCPUGetOnlineBitmap()))
> +        return -1;
> +
> +    return 0;
> +}
> +
> +#if defined(WITH_SCHED_GETAFFINITY) || defined(WITH_BSD_CPU_AFFINITY)
> +static int  /* 0 on success, -1 on error */
> +virCHProcessInitCpuAffinity(virDomainObj *vm)
> +{
> +    g_autoptr(virBitmap) cpumapToSet = NULL;  /* affinity to apply to vm->pid; NULL means "leave untouched" */
> +    virDomainNumatuneMemMode mem_mode;
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +
> +    if (!vm->pid) {  /* the affinity is applied to vm->pid, so it must already be set */
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("Cannot setup CPU affinity until process is started"));
> +        return -1;
> +    }
> +
> +    if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 &&
> +        virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +        mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {  /* case 1: strict numatune and at most one guest NUMA node */
> +        virBitmap *nodeset = NULL;
> +
> +        if (virDomainNumatuneMaybeGetNodeset(vm->def->numa,
> +                                             priv->autoNodeset,
> +                                             &nodeset,
> +                                             -1) < 0)
> +            return -1;
> +
> +        if (virNumaNodesetToCPUset(nodeset, &cpumapToSet) < 0)  /* CPU set is derived from the nodeset */
> +            return -1;
> +    } else if (vm->def->cputune.emulatorpin) {  /* case 2: use a copy of the configured emulator pinning */
> +        if (!(cpumapToSet = virBitmapNewCopy(vm->def->cputune.emulatorpin)))
> +            return -1;
> +    } else {  /* case 3: fall back to virCHProcessGetAllCpuAffinity() */
> +        if (virCHProcessGetAllCpuAffinity(&cpumapToSet) < 0)
> +            return -1;
> +    }
> +
> +    if (cpumapToSet &&  /* skipped when none of the cases above produced a bitmap */
> +        virProcessSetAffinity(vm->pid, cpumapToSet, false) < 0) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +#else /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
> +static int
> +virCHProcessInitCpuAffinity(virDomainObj *vm G_GNUC_UNUSED)
> +{
> +    return 0;  /* neither affinity backend is compiled in: report success without doing anything */
> +}
> +#endif /* !defined(WITH_SCHED_GETAFFINITY) && !defined(WITH_BSD_CPU_AFFINITY) */
> +
> +/**
> + * virCHProcessSetupPid:
> + * @vm: domain object; its private data supplies the domain cgroup,
> + *      autoCpuset and autoNodeset
> + * @pid: thread/process id that is added to the per-thread cgroup and
> + *       receives the affinity and scheduler settings
> + * @nameval: thread kind, passed to virCgroupNewThread(); scheduler
> + *           settings are skipped for VIR_CGROUP_THREAD_EMULATOR
> + * @id: thread index, passed to virCgroupNewThread() together with @nameval
> + * @cpumask: explicit CPU mask for this thread, or NULL to fall back to
> + *           priv->autoCpuset (automatic placement), then vm->def->cpumask,
> + *           then the host bitmap from virCHProcessGetAllCpuAffinity()
> + *           (the last one is used for affinity only, not for the cgroup)
> + * @period: CPU bandwidth period handed to chSetupCgroupVcpuBW()
> + * @quota: CPU bandwidth quota handed to chSetupCgroupVcpuBW()
> + * @sched: scheduler policy/priority to apply, or NULL for none
> + *
> + * This function sets resource properties (affinity, cgroups,
> + * scheduler) for any PID associated with a domain.  It should be used
> + * to set up emulator PIDs as well as vCPU and I/O thread pids to
> + * ensure they are all handled the same way.
> + *
> + * A non-zero @period or @quota requires the CPU cgroup controller;
> + * without it the call fails with VIR_ERR_CONFIG_UNSUPPORTED.  The
> + * cgroup steps run only when the CPU or CPUSET controller is present.
> + * If any step fails after the per-thread cgroup was obtained, that
> + * cgroup is removed again before returning.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +static int
> +virCHProcessSetupPid(virDomainObj *vm,
> +                     pid_t pid,
> +                     virCgroupThreadName nameval,
> +                     int id,
> +                     virBitmap *cpumask,
> +                     unsigned long long period,
> +                     long long quota,
> +                     virDomainThreadSchedParam *sched)
> +{
> +    virCHDomainObjPrivate *priv = vm->privateData;
> +    virDomainNumatuneMemMode mem_mode;
> +    virCgroup *cgroup = NULL;
> +    virBitmap *use_cpumask = NULL;  /* borrowed pointer: mask for the cpuset cgroup (and affinity) */
> +    virBitmap *affinity_cpumask = NULL;  /* borrowed pointer: mask for virProcessSetAffinity() */
> +    g_autoptr(virBitmap) hostcpumap = NULL;  /* owns the host bitmap when the fallback path is taken */
> +    g_autofree char *mem_mask = NULL;
> +    int ret = -1;
> +
> +    if ((period || quota) &&
> +        !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) {
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                       _("cgroup cpu is required for scheduler tuning"));
> +        goto cleanup;
> +    }
> +
> +    /* Infer which cpumask shall be used. */
> +    if (cpumask) {
> +        use_cpumask = cpumask;
> +    } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) {
> +        use_cpumask = priv->autoCpuset;
> +    } else if (vm->def->cpumask) {
> +        use_cpumask = vm->def->cpumask;
> +    } else {
> +        /* we can't assume cloud-hypervisor itself is running on all pCPUs,
> +         * so we need to explicitly set the spawned instance to all pCPUs. */
> +        if (virCHProcessGetAllCpuAffinity(&hostcpumap) < 0)
> +            goto cleanup;
> +        affinity_cpumask = hostcpumap;  /* use_cpumask stays NULL, so chSetupCgroupCpusetCpus() is skipped below */
> +    }
> +
> +    /*
> +     * If CPU cgroup controller is not initialized here, then we need
> +     * neither period nor quota settings.  And if CPUSET controller is
> +     * not initialized either, then there's nothing to do anyway.
> +     */
> +    if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) ||
> +        virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> +
> +        if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 &&
> +            mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
> +            virDomainNumatuneMaybeFormatNodeset(vm->def->numa,
> +                                                priv->autoNodeset,
> +                                                &mem_mask, -1) < 0)  /* mem_mask is only formatted for strict numatune */
> +            goto cleanup;
> +
> +        if (virCgroupNewThread(priv->cgroup, nameval, id, true, &cgroup) < 0)
> +            goto cleanup;
> +
> +        if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> +            if (use_cpumask &&
> +                chSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0)
> +                goto cleanup;
> +
> +            if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0)
> +                goto cleanup;
> +
> +        }
> +
> +        if ((period || quota) &&
> +            chSetupCgroupVcpuBW(cgroup, period, quota) < 0)
> +            goto cleanup;
> +
> +        /* Move the thread to the sub dir */
> +        VIR_INFO("Adding pid %d to cgroup", pid);
> +        if (virCgroupAddThread(cgroup, pid) < 0)
> +            goto cleanup;
> +
> +    }
> +
> +    if (!affinity_cpumask)  /* host-wide fallback not taken: reuse the cgroup mask (may still be NULL) */
> +        affinity_cpumask = use_cpumask;
> +
> +    /* Setup legacy affinity. */
> +    if (affinity_cpumask && virProcessSetAffinity(pid, affinity_cpumask, false) < 0)
> +        goto cleanup;
> +
> +    /* Set scheduler type and priority, but not for the main thread. */
> +    if (sched &&
> +        nameval != VIR_CGROUP_THREAD_EMULATOR &&
> +        virProcessSetScheduler(pid, sched->policy, sched->priority) < 0)
> +        goto cleanup;
> +
> +    ret = 0;
> + cleanup:
> +    if (cgroup) {
> +        if (ret < 0)  /* failure path: drop the per-thread cgroup obtained above */
> +            virCgroupRemove(cgroup);
> +        virCgroupFree(cgroup);
> +    }
> +
> +    return ret;
> +}
> +
> +/**
> + * virCHProcessSetupVcpu:
> + * @vm: domain object
> + * @vcpuid: id of VCPU to set defaults
> + *
> + * This function sets resource properties (cgroups, affinity, scheduler) for a
> + * vCPU. This function expects that the vCPU is online and the vCPU pids were
> + * correctly detected at the point when it's called.
> + *
> + * The work is delegated to virCHProcessSetupPid() with the thread id
> + * from virCHDomainGetVcpuPid(), the vCPU's own cpumask and scheduler
> + * parameters, and the domain-wide period/quota from vm->def->cputune.
> + *
> + * NOTE(review): the result of virDomainDefGetVcpu() is dereferenced
> + * without a NULL check - confirm callers always pass a valid @vcpuid.
> + *
> + * Returns 0 on success, -1 on error.
> + */
> +int
> +virCHProcessSetupVcpu(virDomainObj *vm,
> +                      unsigned int vcpuid)
> +{
> +    pid_t vcpupid = virCHDomainGetVcpuPid(vm, vcpuid);
> +    virDomainVcpuDef *vcpu = virDomainDefGetVcpu(vm->def, vcpuid);
> +
> +    return virCHProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
> +                                vcpuid, vcpu->cpumask,
> +                                vm->def->cputune.period,
> +                                vm->def->cputune.quota,
> +                                &vcpu->sched);
> +}
> +
> +static int  /* 0 on success, -1 on error */
> +virCHProcessSetupVcpus(virDomainObj *vm)
> +{
> +    virDomainVcpuDef *vcpu;
> +    unsigned int maxvcpus = virDomainDefGetVcpusMax(vm->def);
> +    size_t i;
> +
> +    if ((vm->def->cputune.period || vm->def->cputune.quota) &&  /* bandwidth tuning requested ... */
> +        !virCgroupHasController(((virCHDomainObjPrivate *) vm->privateData)->cgroup,
> +                                VIR_CGROUP_CONTROLLER_CPU)) {  /* ... but the CPU controller is missing */
> +        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                       _("cgroup cpu is required for scheduler tuning"));
> +        return -1;
> +    }
> +
> +    if (!virCHDomainHasVcpuPids(vm)) {
> +        /* No vCPU pids are available (per virCHDomainHasVcpuPids()), so
> +         * per-vCPU setup is not performed in this branch.
> +         * If any CPU has custom affinity that differs from the
> +         * VM default affinity, we must reject it */
> +        for (i = 0; i < maxvcpus; i++) {
> +            vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> +            if (!vcpu->online)  /* only online vCPUs are checked */
> +                continue;
> +
> +            if (vcpu->cpumask &&
> +                !virBitmapEqual(vm->def->cpumask, vcpu->cpumask)) {
> +                virReportError(VIR_ERR_OPERATION_INVALID, "%s",
> +                                _("cpu affinity is not supported"));
> +                return -1;
> +            }
> +        }
> +
> +        return 0;  /* nothing further to configure in this branch */
> +    }
> +
> +    for (i = 0; i < maxvcpus; i++) {  /* vCPU pids are known: configure every online vCPU */
> +        vcpu = virDomainDefGetVcpu(vm->def, i);
> +
> +        if (!vcpu->online)
> +            continue;
> +
> +        if (virCHProcessSetupVcpu(vm, i) < 0)  /* stop at the first vCPU that fails */
> +            return -1;
> +    }
> +
> +    return 0;
> +}
> +
>   /**
>    * virCHProcessStart:
>    * @driver: pointer to driver structure
> @@ -168,18 +421,33 @@ int virCHProcessStart(virCHDriver *driver,
>           }
>       }
>   
> +    vm->pid = priv->monitor->pid;
> +    vm->def->id = vm->pid;
> +    priv->machineName = virCHDomainGetMachineName(vm);
> +
> +    if (chSetupCgroup(vm, nnicindexes, nicindexes) < 0)
> +        goto cleanup;
> +
> +    if (virCHProcessInitCpuAffinity(vm) < 0)
> +        goto cleanup;
> +
>       if (virCHMonitorBootVM(priv->monitor) < 0) {
>           virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>                          _("failed to boot guest VM"));
>           goto cleanup;
>       }
>   
> -    priv->machineName = virCHDomainGetMachineName(vm);
> -    vm->pid = priv->monitor->pid;
> -    vm->def->id = vm->pid;
> +    virCHDomainRefreshThreadInfo(vm);
>   
> -    virCHProcessUpdateInfo(vm);
> +    VIR_DEBUG("Setting global CPU cgroup (if required)");
> +    if (chSetupGlobalCpuCgroup(vm) < 0)
> +        goto cleanup;
> +
> +    VIR_DEBUG("Setting vCPU tuning/settings");
> +    if (virCHProcessSetupVcpus(vm) < 0)
> +        goto cleanup;
>   
> +    virCHProcessUpdateInfo(vm);
>       virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
>   
>       return 0;
> @@ -195,6 +463,8 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
>                        virDomainObj *vm,
>                        virDomainShutoffReason reason)
>   {
> +    int ret;
> +    int retries = 0;
>       virCHDomainObjPrivate *priv = vm->privateData;
>   
>       VIR_DEBUG("Stopping VM name=%s pid=%d reason=%d",
> @@ -205,6 +475,16 @@ int virCHProcessStop(virCHDriver *driver G_GNUC_UNUSED,
>           priv->monitor = NULL;
>       }
>   
> +    retry:

Wrong label indentation.


Daniel

> +        if ((ret = chRemoveCgroup(vm)) < 0) {
> +            if (ret == -EBUSY && (retries++ < 5)) {
> +                g_usleep(200*1000);
> +                goto retry;
> +            }
> +            VIR_WARN("Failed to remove cgroup for %s",
> +                    vm->def->name);
> +        }
> +
>       vm->pid = -1;
>       vm->def->id = -1;
>   
> diff --git a/src/ch/ch_process.h b/src/ch/ch_process.h
> index abc4915979..800e3f4e23 100644
> --- a/src/ch/ch_process.h
> +++ b/src/ch/ch_process.h
> @@ -29,3 +29,6 @@ int virCHProcessStart(virCHDriver *driver,
>   int virCHProcessStop(virCHDriver *driver,
>                        virDomainObj *vm,
>                        virDomainShutoffReason reason);
> +
> +int virCHProcessSetupVcpu(virDomainObj *vm,
> +                          unsigned int vcpuid);
> diff --git a/src/ch/meson.build b/src/ch/meson.build
> index 2b2bdda26c..0b20de56fd 100644
> --- a/src/ch/meson.build
> +++ b/src/ch/meson.build
> @@ -1,6 +1,8 @@
>   ch_driver_sources = [
>     'ch_conf.c',
>     'ch_conf.h',
> +  'ch_cgroup.c',
> +  'ch_cgroup.h',
>     'ch_domain.c',
>     'ch_domain.h',
>     'ch_driver.c',
>