[PATCH v4 4/5] lxc: Implement virtual /proc/cpuinfo via LXC fuse

Julio Faracco posted 5 patches 5 years, 11 months ago
[PATCH v4 4/5] lxc: Implement virtual /proc/cpuinfo via LXC fuse
Posted by Julio Faracco 5 years, 11 months ago
This commit tries to fix lots of issues related to LXC VCPUs. One of
them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC
containers will show all CPUs available for host. The second one is
related to CPU share, if an user set only 1 VCPU, the container/process
will use all available CPUs. (This is not the case when `cpuset`
attribute is declared). So, this commit adds a virtual cpuinfo based on
VCPU mapping and it automatically limits the CPU usage according VCPU
count.

Example (now):
LXC container - 8 CPUS with 2 VCPU:
    lxc-root# stress --cpu 8

On host machine, only CPU 0 and 1 have 100% usage.

Signed-off-by: Julio Faracco <jcfaracco@gmail.com>
---
 src/lxc/lxc_cgroup.c    | 31 ++++++++++++++
 src/lxc/lxc_container.c | 39 ++++++++++-------
 src/lxc/lxc_fuse.c      | 95 ++++++++++++++++++++++++++++++++++++++---
 3 files changed, 145 insertions(+), 20 deletions(-)

diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c
index d29b65092a..912a252473 100644
--- a/src/lxc/lxc_cgroup.c
+++ b/src/lxc/lxc_cgroup.c
@@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def,
 }
 
 
+static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def,
+                                     virCgroupPtr cgroup)
+{
+    size_t i;
+    int vcpumax;
+    virBuffer buffer = VIR_BUFFER_INITIALIZER;
+    virBufferPtr cpuset = &buffer;
+
+    vcpumax = virDomainDefGetVcpusMax(def);
+    for (i = 0; i < vcpumax; i++) {
+        virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i);
+        /* Cgroup is smart enough to convert numbers separated
+         * by comma into ranges. Example: "0,1,2,5," -> "0-2,5".
+         * Libvirt does not need to process it here. */
+        if (vcpu)
+            virBufferAsprintf(cpuset, "%zu,", i);
+    }
+    if (virCgroupSetCpusetCpus(cgroup,
+                               virBufferCurrentContent(cpuset)) < 0) {
+        virBufferFreeAndReset(cpuset);
+        return -1;
+    }
+
+    virBufferFreeAndReset(cpuset);
+    return 0;
+}
+
+
 static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def,
                                        virCgroupPtr cgroup,
                                        virBitmapPtr nodemask)
@@ -61,6 +89,9 @@ static int virLXCCgroupSetupCpusetTune(virDomainDefPtr def,
         def->cpumask &&
         virCgroupSetupCpusetCpus(cgroup, def->cpumask) < 0) {
         return -1;
+    } else {
+        /* auto mode for VCPU limits */
+        virLXCCgroupSetupVcpuAuto(def, cgroup);
     }
 
     if (virDomainNumatuneGetMode(def->numa, -1, &mode) < 0 ||
diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c
index 41efe43a14..88e27f3060 100644
--- a/src/lxc/lxc_container.c
+++ b/src/lxc/lxc_container.c
@@ -997,8 +997,8 @@ static int lxcContainerMountBasicFS(bool userns_enabled,
 static int lxcContainerMountProcFuse(virDomainDefPtr def,
                                      const char *stateDir)
 {
-    int ret;
-    char *meminfo_path = NULL;
+    g_autofree char *meminfo_path = NULL;
+    g_autofree char *cpuinfo_path = NULL;
 
     VIR_DEBUG("Mount /proc/meminfo stateDir=%s", stateDir);
 
@@ -1006,15 +1006,29 @@ static int lxcContainerMountProcFuse(virDomainDefPtr def,
                                    stateDir,
                                    def->name);
 
-    if ((ret = mount(meminfo_path, "/proc/meminfo",
-                     NULL, MS_BIND, NULL)) < 0) {
+    if (mount(meminfo_path, "/proc/meminfo",
+                     NULL, MS_BIND, NULL) < 0) {
         virReportSystemError(errno,
                              _("Failed to mount %s on /proc/meminfo"),
                              meminfo_path);
+        return -1;
     }
 
-    VIR_FREE(meminfo_path);
-    return ret;
+    VIR_DEBUG("Mount /proc/cpuinfo stateDir=%s", stateDir);
+
+    cpuinfo_path = g_strdup_printf("/.oldroot/%s/%s.fuse/cpuinfo",
+                                   stateDir,
+                                   def->name);
+
+    if (mount(cpuinfo_path, "/proc/cpuinfo",
+                     NULL, MS_BIND, NULL) < 0) {
+        virReportSystemError(errno,
+                             _("Failed to mount %s on /proc/cpuinfo"),
+                             cpuinfo_path);
+        return -1;
+    }
+
+    return 0;
 }
 #else
 static int lxcContainerMountProcFuse(virDomainDefPtr def G_GNUC_UNUSED,
@@ -1027,8 +1041,7 @@ static int lxcContainerMountProcFuse(virDomainDefPtr def G_GNUC_UNUSED,
 static int lxcContainerMountFSDev(virDomainDefPtr def,
                                   const char *stateDir)
 {
-    int ret = -1;
-    char *path = NULL;
+    g_autofree char *path = NULL;
     int flags = def->idmap.nuidmap ? MS_BIND : MS_MOVE;
 
     VIR_DEBUG("Mount /dev/ stateDir=%s", stateDir);
@@ -1038,7 +1051,7 @@ static int lxcContainerMountFSDev(virDomainDefPtr def,
     if (virFileMakePath("/dev") < 0) {
         virReportSystemError(errno, "%s",
                              _("Cannot create /dev"));
-        goto cleanup;
+        return -1;
     }
 
     VIR_DEBUG("Trying to %s %s to /dev", def->idmap.nuidmap ?
@@ -1048,14 +1061,10 @@ static int lxcContainerMountFSDev(virDomainDefPtr def,
         virReportSystemError(errno,
                              _("Failed to mount %s on /dev"),
                              path);
-        goto cleanup;
+        return -1;
     }
 
-    ret = 0;
-
- cleanup:
-    VIR_FREE(path);
-    return ret;
+    return 0;
 }
 
 static int lxcContainerMountFSDevPTS(virDomainDefPtr def,
diff --git a/src/lxc/lxc_fuse.c b/src/lxc/lxc_fuse.c
index 8cfccdd7e0..b2117bfa17 100644
--- a/src/lxc/lxc_fuse.c
+++ b/src/lxc/lxc_fuse.c
@@ -36,23 +36,29 @@
 
 #if WITH_FUSE
 
+#ifndef CPUINFO_FILE_LEN
+# define CPUINFO_FILE_LEN (1024*1024)
+#endif
+
 static const char *fuse_meminfo_path = "/meminfo";
+static const char *fuse_cpuinfo_path = "/cpuinfo";
 
 static int lxcProcGetattr(const char *path, struct stat *stbuf)
 {
-    g_autofree char *mempath = NULL;
+    g_autofree char *procpath = NULL;
     struct stat sb;
     struct fuse_context *context = fuse_get_context();
     virDomainDefPtr def = (virDomainDefPtr)context->private_data;
 
     memset(stbuf, 0, sizeof(struct stat));
-    mempath = g_strdup_printf("/proc/%s", path);
+    procpath = g_strdup_printf("/proc/%s", path);
 
     if (STREQ(path, "/")) {
         stbuf->st_mode = S_IFDIR | 0755;
         stbuf->st_nlink = 2;
-    } else if (STREQ(path, fuse_meminfo_path)) {
-        if (stat(mempath, &sb) < 0)
+    } else if (STREQ(path, fuse_meminfo_path) ||
+               STREQ(path, fuse_cpuinfo_path)) {
+        if (stat(procpath, &sb) < 0)
             return -errno;
 
         stbuf->st_uid = def->idmap.uidmap ? def->idmap.uidmap[0].target : 0;
@@ -83,6 +89,7 @@ static int lxcProcReaddir(const char *path, void *buf,
     filler(buf, ".", NULL, 0);
     filler(buf, "..", NULL, 0);
     filler(buf, fuse_meminfo_path + 1, NULL, 0);
+    filler(buf, fuse_cpuinfo_path + 1, NULL, 0);
 
     return 0;
 }
@@ -90,7 +97,8 @@ static int lxcProcReaddir(const char *path, void *buf,
 static int lxcProcOpen(const char *path G_GNUC_UNUSED,
                        struct fuse_file_info *fi G_GNUC_UNUSED)
 {
-    if (STRNEQ(path, fuse_meminfo_path))
+    if (STRNEQ(path, fuse_meminfo_path) &&
+        STRNEQ(path, fuse_cpuinfo_path))
         return -ENOENT;
 
     if ((fi->flags & 3) != O_RDONLY)
@@ -227,6 +235,80 @@ static int lxcProcReadMeminfo(char *hostpath, virDomainDefPtr def,
     return res;
 }
 
+
+static int
+lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base,
+                        virBufferPtr new_cpuinfo)
+{
+    char *procline = NULL;
+    char *saveptr = base;
+    size_t cpu;
+    size_t nvcpu;
+    size_t curcpu = 0;
+    bool get_proc = false;
+
+    nvcpu = virDomainDefGetVcpus(def);
+    while ((procline = strtok_r(NULL, "\n", &saveptr))) {
+        if (sscanf(procline, "processor\t: %zu", &cpu) == 1) {
+            virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu);
+            /* VCPU is mapped */
+            if (vcpu) {
+                if (curcpu == nvcpu)
+                    break;
+
+                if (curcpu > 0)
+                    virBufferAddLit(new_cpuinfo, "\n");
+
+                virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n",
+                                  curcpu);
+                curcpu++;
+                get_proc = true;
+            } else {
+                get_proc = false;
+            }
+        } else {
+            /* It is not a processor index */
+            if (get_proc)
+                virBufferAsprintf(new_cpuinfo, "%s\n", procline);
+        }
+    }
+
+    virBufferAddLit(new_cpuinfo, "\n");
+
+    return strlen(virBufferCurrentContent(new_cpuinfo));
+}
+
+
+static int lxcProcReadCpuinfo(char *hostpath, virDomainDefPtr def,
+                              char *buf, size_t size, off_t offset)
+{
+    virBuffer buffer = VIR_BUFFER_INITIALIZER;
+    virBufferPtr new_cpuinfo = &buffer;
+    g_autofree char *outbuf = NULL;
+    int res = -1;
+
+    /* Gather info from /proc/cpuinfo */
+    if (virFileReadAll(hostpath, CPUINFO_FILE_LEN, &outbuf) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("Failed to open %s"), hostpath);
+        return -1;
+    }
+
+    /* /proc/cpuinfo does not support fseek */
+    if (offset > 0)
+        return 0;
+
+    res = lxcProcReadCpuinfoParse(def, outbuf, new_cpuinfo);
+
+    if (res > size)
+        res = size;
+    memcpy(buf, virBufferCurrentContent(new_cpuinfo), res);
+
+    virBufferFreeAndReset(new_cpuinfo);
+    return res;
+}
+
+
 static int lxcProcRead(const char *path G_GNUC_UNUSED,
                        char *buf G_GNUC_UNUSED,
                        size_t size G_GNUC_UNUSED,
@@ -246,6 +328,9 @@ static int lxcProcRead(const char *path G_GNUC_UNUSED,
     if (STREQ(path, fuse_meminfo_path)) {
         if ((res = lxcProcReadMeminfo(hostpath, def, buf, size, offset)) < 0)
             res = lxcProcHostRead(hostpath, buf, size, offset);
+    } else if (STREQ(path, fuse_cpuinfo_path)) {
+        if ((res = lxcProcReadCpuinfo(hostpath, def, buf, size, offset)) < 0)
+            res = lxcProcHostRead(hostpath, buf, size, offset);
     }
 
     return res;
-- 
2.20.1


Re: [PATCH v4 4/5] lxc: Implement virtual /proc/cpuinfo via LXC fuse
Posted by Daniel P. Berrangé 5 years, 11 months ago
On Mon, Feb 24, 2020 at 11:24:28AM -0300, Julio Faracco wrote:
> This commit tries to fix lots of issues related to LXC VCPUs. One of
> them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC
> containers will show all CPUs available for host. The second one is
> related to CPU share, if an user set only 1 VCPU, the container/process
> will use all available CPUs. (This is not the case when `cpuset`
> attribute is declared). So, this commit adds a virtual cpuinfo based on
> VCPU mapping and it automatically limits the CPU usage according VCPU
> count.
> 
> Example (now):
> LXC container - 8 CPUS with 2 VCPU:
>     lxc-root# stress --cpu 8
> 
> On host machine, only CPU 0 and 1 have 100% usage.
> 
> Signed-off-by: Julio Faracco <jcfaracco@gmail.com>
> ---
>  src/lxc/lxc_cgroup.c    | 31 ++++++++++++++
>  src/lxc/lxc_container.c | 39 ++++++++++-------
>  src/lxc/lxc_fuse.c      | 95 ++++++++++++++++++++++++++++++++++++++---
>  3 files changed, 145 insertions(+), 20 deletions(-)
> 
> diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c
> index d29b65092a..912a252473 100644
> --- a/src/lxc/lxc_cgroup.c
> +++ b/src/lxc/lxc_cgroup.c
> @@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def,
>  }
>  
>  
> +static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def,
> +                                     virCgroupPtr cgroup)
> +{
> +    size_t i;
> +    int vcpumax;
> +    virBuffer buffer = VIR_BUFFER_INITIALIZER;
> +    virBufferPtr cpuset = &buffer;
> +
> +    vcpumax = virDomainDefGetVcpusMax(def);
> +    for (i = 0; i < vcpumax; i++) {
> +        virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i);
> +        /* Cgroup is smart enough to convert numbers separated
> +         * by comma into ranges. Example: "0,1,2,5," -> "0-2,5".
> +         * Libvirt does not need to process it here. */
> +        if (vcpu)
> +            virBufferAsprintf(cpuset, "%zu,", i);
> +    }

If I have 4 containers, each with vcpu==2, then all 4 containers
are going to be taskset onto host CPUs 0 & 1, and host CPUs 2,3,4,5,6,7
are going to be unused. This is not viable as a strategy.

> +    if (virCgroupSetCpusetCpus(cgroup,
> +                               virBufferCurrentContent(cpuset)) < 0) {
> +        virBufferFreeAndReset(cpuset);
> +        return -1;
> +    }
> +
> +    virBufferFreeAndReset(cpuset);
> +    return 0;
> +}

> +static int
> +lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base,
> +                        virBufferPtr new_cpuinfo)
> +{
> +    char *procline = NULL;
> +    char *saveptr = base;
> +    size_t cpu;
> +    size_t nvcpu;
> +    size_t curcpu = 0;
> +    bool get_proc = false;
> +
> +    nvcpu = virDomainDefGetVcpus(def);
> +    while ((procline = strtok_r(NULL, "\n", &saveptr))) {
> +        if (sscanf(procline, "processor\t: %zu", &cpu) == 1) {

This doesn't work because it is assuming the /proc/cpuinfo data format
for x86 architecture. Every architecture puts different info in this
file, and only some of them using "processor: NN" as a delimiter for
new entries. There's many examples in tests/sysinfodata/ and
in tests/virhostcpudata/ showing this problem

We had considered taking /proc/cpuinfo in the past, but decided against
it. Since this file is non-standard data format varying across arches,
well written apps won't actually look at /proc/cpuinfo at all. Instead
they'll use /sys/devices/system/cpu instead.  I don't think we want to
make /proc/cpuinfo be inconsistent with data in sysfs.


> +            virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu);
> +            /* VCPU is mapped */
> +            if (vcpu) {
> +                if (curcpu == nvcpu)
> +                    break;
> +
> +                if (curcpu > 0)
> +                    virBufferAddLit(new_cpuinfo, "\n");
> +
> +                virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n",
> +                                  curcpu);
> +                curcpu++;
> +                get_proc = true;
> +            } else {
> +                get_proc = false;
> +            }
> +        } else {
> +            /* It is not a processor index */
> +            if (get_proc)
> +                virBufferAsprintf(new_cpuinfo, "%s\n", procline);
> +        }
> +    }
> +
> +    virBufferAddLit(new_cpuinfo, "\n");
> +
> +    return strlen(virBufferCurrentContent(new_cpuinfo));
> +}

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH v4 4/5] lxc: Implement virtual /proc/cpuinfo via LXC fuse
Posted by Julio Faracco 5 years, 11 months ago
Thanks for your comments, Daniel.

Well, since you have a consensus about /proc/cpuinfo, I will only
resubmit the patches related to timer.

Julio Cesar Faracco

Em ter., 25 de fev. de 2020 às 08:34, Daniel P. Berrangé
<berrange@redhat.com> escreveu:
>
> On Mon, Feb 24, 2020 at 11:24:28AM -0300, Julio Faracco wrote:
> > This commit tries to fix lots of issues related to LXC VCPUs. One of
> > them is related to /proc/cpuinfo content. If only 1 VCPU is set, LXC
> > containers will show all CPUs available for host. The second one is
> > related to CPU share, if an user set only 1 VCPU, the container/process
> > will use all available CPUs. (This is not the case when `cpuset`
> > attribute is declared). So, this commit adds a virtual cpuinfo based on
> > VCPU mapping and it automatically limits the CPU usage according VCPU
> > count.
> >
> > Example (now):
> > LXC container - 8 CPUS with 2 VCPU:
> >     lxc-root# stress --cpu 8
> >
> > On host machine, only CPU 0 and 1 have 100% usage.
> >
> > Signed-off-by: Julio Faracco <jcfaracco@gmail.com>
> > ---
> >  src/lxc/lxc_cgroup.c    | 31 ++++++++++++++
> >  src/lxc/lxc_container.c | 39 ++++++++++-------
> >  src/lxc/lxc_fuse.c      | 95 ++++++++++++++++++++++++++++++++++++++---
> >  3 files changed, 145 insertions(+), 20 deletions(-)
> >
> > diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c
> > index d29b65092a..912a252473 100644
> > --- a/src/lxc/lxc_cgroup.c
> > +++ b/src/lxc/lxc_cgroup.c
> > @@ -50,6 +50,34 @@ static int virLXCCgroupSetupCpuTune(virDomainDefPtr def,
> >  }
> >
> >
> > +static int virLXCCgroupSetupVcpuAuto(virDomainDefPtr def,
> > +                                     virCgroupPtr cgroup)
> > +{
> > +    size_t i;
> > +    int vcpumax;
> > +    virBuffer buffer = VIR_BUFFER_INITIALIZER;
> > +    virBufferPtr cpuset = &buffer;
> > +
> > +    vcpumax = virDomainDefGetVcpusMax(def);
> > +    for (i = 0; i < vcpumax; i++) {
> > +        virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, i);
> > +        /* Cgroup is smart enough to convert numbers separated
> > +         * by comma into ranges. Example: "0,1,2,5," -> "0-2,5".
> > +         * Libvirt does not need to process it here. */
> > +        if (vcpu)
> > +            virBufferAsprintf(cpuset, "%zu,", i);
> > +    }
>
> If I have 4 containers, each with vcpu==2, then all 4 containers
> are going to be taskset onto host CPUs 0 & 1, and host CPUs 2,3,4,5,6,7
> are going to be unused. This is not viable as a strategy.
>
> > +    if (virCgroupSetCpusetCpus(cgroup,
> > +                               virBufferCurrentContent(cpuset)) < 0) {
> > +        virBufferFreeAndReset(cpuset);
> > +        return -1;
> > +    }
> > +
> > +    virBufferFreeAndReset(cpuset);
> > +    return 0;
> > +}
>
> > +static int
> > +lxcProcReadCpuinfoParse(virDomainDefPtr def, char *base,
> > +                        virBufferPtr new_cpuinfo)
> > +{
> > +    char *procline = NULL;
> > +    char *saveptr = base;
> > +    size_t cpu;
> > +    size_t nvcpu;
> > +    size_t curcpu = 0;
> > +    bool get_proc = false;
> > +
> > +    nvcpu = virDomainDefGetVcpus(def);
> > +    while ((procline = strtok_r(NULL, "\n", &saveptr))) {
> > +        if (sscanf(procline, "processor\t: %zu", &cpu) == 1) {
>
> This doesn't work because it is assuming the /proc/cpuinfo data format
> for x86 architecture. Every architecture puts different info in this
> file, and only some of them using "processor: NN" as a delimiter for
> new entries. There's many examples in tests/sysinfodata/ and
> in tests/virhostcpudata/ showing this problem
>
> We had considered taking /proc/cpuinfo in the past, but decided against
> it. Since this file is non-standard data format varying across arches,
> well written apps won't actually look at /proc/cpuinfo at all. Instead
> they'll use /sys/devices/system/cpu instead.  I don't think we want to
> make /proc/cpuinfo be inconsistent with data in sysfs.
>
>
> > +            virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(def, cpu);
> > +            /* VCPU is mapped */
> > +            if (vcpu) {
> > +                if (curcpu == nvcpu)
> > +                    break;
> > +
> > +                if (curcpu > 0)
> > +                    virBufferAddLit(new_cpuinfo, "\n");
> > +
> > +                virBufferAsprintf(new_cpuinfo, "processor\t: %zu\n",
> > +                                  curcpu);
> > +                curcpu++;
> > +                get_proc = true;
> > +            } else {
> > +                get_proc = false;
> > +            }
> > +        } else {
> > +            /* It is not a processor index */
> > +            if (get_proc)
> > +                virBufferAsprintf(new_cpuinfo, "%s\n", procline);
> > +        }
> > +    }
> > +
> > +    virBufferAddLit(new_cpuinfo, "\n");
> > +
> > +    return strlen(virBufferCurrentContent(new_cpuinfo));
> > +}
>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>