If some program is attached to cgroup and that cgroup is removed before
detaching the program it might not be freed and will remain in the
system until it's rebooted.
This would not be that big deal to workaround if we wouldn't use
machined to track our guests. Systemd tries to be the nice guy and if
there is no process attached to TransientUnit it will destroy the unit
and remove the cgroup from system which will hit the kernel bug.
In order to prevent this behavior of systemd we can move another process
into the guest cgroup and the cgroup will remain active even if we kill
qemu process while destroying guest. We can then detach our BPF program
from that cgroup and call machined terminate API to cleanup the cgroup.
Signed-off-by: Pavel Hrdina <phrdina@redhat.com>
---
src/libvirt_private.syms | 1 +
src/lxc/lxc_cgroup.c | 1 +
src/qemu/qemu_cgroup.c | 6 ++-
src/util/vircgroup.c | 16 ++++++++
src/util/vircgroup.h | 1 +
src/util/vircgrouppriv.h | 2 +
src/util/vircgroupv2.c | 2 +
src/util/vircgroupv2devices.c | 70 ++++++++++++++++++++++++++++++++++-
src/util/virsystemd.c | 2 +-
src/util/virsystemd.h | 2 +
10 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index f1da5ede71..c22c81cebe 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -3031,6 +3031,7 @@ virSystemdCanHybridSleep;
virSystemdCanSuspend;
virSystemdCreateMachine;
virSystemdGetMachineNameByPID;
+virSystemdHasMachined;
virSystemdHasMachinedResetCachedValue;
virSystemdMakeScopeName;
virSystemdMakeSliceName;
diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c
index d93a19d684..a8cef74bb9 100644
--- a/src/lxc/lxc_cgroup.c
+++ b/src/lxc/lxc_cgroup.c
@@ -455,6 +455,7 @@ virCgroupPtr virLXCCgroupCreate(virDomainDefPtr def,
nnicindexes, nicindexes,
def->resource->partition,
-1,
+ LXC_STATE_DIR,
&cgroup) < 0)
goto cleanup;
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 4931fb6575..9374a799a1 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -946,6 +946,7 @@ qemuInitCgroup(virDomainObjPtr vm,
nnicindexes, nicindexes,
vm->def->resource->partition,
cfg->cgroupControllers,
+ cfg->stateDir,
&priv->cgroup) < 0) {
if (virCgroupNewIgnoreError())
goto done;
@@ -1256,16 +1257,19 @@ int
qemuRemoveCgroup(virDomainObjPtr vm)
{
qemuDomainObjPrivatePtr priv = vm->privateData;
+ int rc = 0;
if (priv->cgroup == NULL)
return 0; /* Not supported, so claim success */
+ rc = virCgroupRemove(priv->cgroup);
+
if (virCgroupTerminateMachine(priv->machineName) < 0) {
if (!virCgroupNewIgnoreError())
VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name);
}
- return virCgroupRemove(priv->cgroup);
+ return rc;
}
diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c
index 9d284e9bf4..506cc49b9a 100644
--- a/src/util/vircgroup.c
+++ b/src/util/vircgroup.c
@@ -1105,6 +1105,7 @@ virCgroupNewMachineSystemd(const char *name,
int *nicindexes,
const char *partition,
int controllers,
+ const char *stateDir,
virCgroupPtr *group)
{
int rv;
@@ -1151,6 +1152,16 @@ virCgroupNewMachineSystemd(const char *name,
return -1;
}
+ if (VIR_STRDUP((*group)->stateDir, stateDir) < 0) {
+ virCgroupFree(group);
+ return -1;
+ }
+
+ if (VIR_STRDUP((*group)->vmName, name) < 0) {
+ virCgroupFree(group);
+ return -1;
+ }
+
if (virCgroupAddProcess(*group, pidleader) < 0) {
virErrorPtr saved = virSaveLastError();
virCgroupRemove(*group);
@@ -1233,6 +1244,7 @@ virCgroupNewMachine(const char *name,
int *nicindexes,
const char *partition,
int controllers,
+ const char *stateDir,
virCgroupPtr *group)
{
int rv;
@@ -1249,6 +1261,7 @@ virCgroupNewMachine(const char *name,
nicindexes,
partition,
controllers,
+ stateDir,
group)) == 0)
return 0;
@@ -1301,6 +1314,8 @@ virCgroupFree(virCgroupPtr *group)
VIR_FREE((*group)->unified.placement);
VIR_FREE((*group)->path);
+ VIR_FREE((*group)->stateDir);
+ VIR_FREE((*group)->vmName);
VIR_FREE(*group);
}
@@ -2897,6 +2912,7 @@ virCgroupNewMachine(const char *name ATTRIBUTE_UNUSED,
int *nicindexes ATTRIBUTE_UNUSED,
const char *partition ATTRIBUTE_UNUSED,
int controllers ATTRIBUTE_UNUSED,
+ const char *stateDir ATTRIBUTE_UNUSED,
virCgroupPtr *group ATTRIBUTE_UNUSED)
{
virReportSystemError(ENXIO, "%s",
diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h
index 372009de4a..0cd90d3651 100644
--- a/src/util/vircgroup.h
+++ b/src/util/vircgroup.h
@@ -98,6 +98,7 @@ int virCgroupNewMachine(const char *name,
int *nicindexes,
const char *partition,
int controllers,
+ const char *stateDir,
virCgroupPtr *group)
ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2)
ATTRIBUTE_NONNULL(3);
diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h
index 085fea375c..e0ccce88a0 100644
--- a/src/util/vircgrouppriv.h
+++ b/src/util/vircgrouppriv.h
@@ -62,6 +62,8 @@ typedef virCgroupV2Controller *virCgroupV2ControllerPtr;
struct _virCgroup {
char *path;
+ char *stateDir;
+ char *vmName;
virCgroupBackendPtr backends[VIR_CGROUP_BACKEND_TYPE_LAST];
diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c
index 44c4c7fc90..03efabe198 100644
--- a/src/util/vircgroupv2.c
+++ b/src/util/vircgroupv2.c
@@ -37,6 +37,8 @@
#include "virerror.h"
#include "virfile.h"
#include "virlog.h"
+#include "virpidfile.h"
+#include "virprocess.h"
#include "virstring.h"
#include "virsystemd.h"
diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c
index a481ba89b7..9d8fc5557e 100644
--- a/src/util/vircgroupv2devices.c
+++ b/src/util/vircgroupv2devices.c
@@ -31,10 +31,15 @@
#include "vircgrouppriv.h"
#include "virbpf.h"
+#include "vircommand.h"
#include "vircgroup.h"
#include "vircgroupv2devices.h"
#include "virfile.h"
#include "virlog.h"
+#include "virpidfile.h"
+#include "virprocess.h"
+#include "virstring.h"
+#include "virsystemd.h"
VIR_LOG_INIT("util.cgroup");
@@ -365,17 +370,74 @@ virCgroupV2DevicesReallocMap(int mapfd,
}
+static pid_t
+virCgroupV2DevicesStartDummyProc(virCgroupPtr group)
+{
+ pid_t ret = -1;
+ pid_t pid = -1;
+ virCommandPtr cmd = NULL;
+ VIR_AUTOFREE(char *) pidfile = NULL;
+ VIR_AUTOFREE(char *) fileName = NULL;
+
+ if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0)
+ return -1;
+
+ pidfile = virPidFileBuildPath(group->stateDir, fileName);
+ if (!pidfile)
+ return -1;
+
+ cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null",
+ "-s", "3600", NULL);
+ if (!cmd)
+ return -1;
+
+ virCommandDaemonize(cmd);
+ virCommandSetPidFile(cmd, pidfile);
+
+ if (virCommandRun(cmd, NULL) < 0)
+ goto cleanup;
+
+ if (virPidFileReadPath(pidfile, &pid) < 0) {
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("failed to start dummy cgroup helper"));
+ goto cleanup;
+ }
+
+ if (virCgroupAddMachineProcess(group, pid) < 0)
+ goto cleanup;
+
+ ret = pid;
+ cleanup:
+ if (ret < 0) {
+ virCommandAbort(cmd);
+ if (pid > 0)
+ virProcessKillPainfully(pid, true);
+ }
+ if (pidfile)
+ unlink(pidfile);
+ virCommandFree(cmd);
+ return ret;
+}
+
+
int
virCgroupV2DevicesCreateProg(virCgroupPtr group)
{
- int mapfd;
+ int mapfd = -1;
+ pid_t pid = -1;
if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0)
return 0;
+ if (virSystemdHasMachined() == 0) {
+ pid = virCgroupV2DevicesStartDummyProc(group);
+ if (pid < 0)
+ return -1;
+ }
+
mapfd = virCgroupV2DevicesCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE);
if (mapfd < 0)
- return -1;
+ goto error;
if (virCgroupV2DevicesAttachProg(group, mapfd,
VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE) < 0) {
@@ -385,6 +447,7 @@ virCgroupV2DevicesCreateProg(virCgroupPtr group)
return 0;
error:
+ virProcessKillPainfully(pid, true);
VIR_FORCE_CLOSE(mapfd);
return -1;
}
@@ -453,6 +516,9 @@ virCgroupV2DevicesRemoveProg(virCgroupPtr group)
if (group->unified.devices.progfd >= 0)
VIR_FORCE_CLOSE(group->unified.devices.progfd);
+ if (virSystemdHasMachined() == 0)
+ virCgroupKillPainfully(group);
+
ret = 0;
cleanup:
diff --git a/src/util/virsystemd.c b/src/util/virsystemd.c
index f492ac1859..ca35b12a5c 100644
--- a/src/util/virsystemd.c
+++ b/src/util/virsystemd.c
@@ -138,7 +138,7 @@ void virSystemdHasMachinedResetCachedValue(void)
* -1 = error
* 0 = machine1 is available
*/
-static int
+int
virSystemdHasMachined(void)
{
int ret;
diff --git a/src/util/virsystemd.h b/src/util/virsystemd.h
index 7d9c0ebd62..553d4196f1 100644
--- a/src/util/virsystemd.h
+++ b/src/util/virsystemd.h
@@ -51,4 +51,6 @@ int virSystemdCanHybridSleep(bool *result);
char *virSystemdGetMachineNameByPID(pid_t pid);
+int virSystemdHasMachined(void);
+
#endif /* LIBVIRT_VIRSYSTEMD_H */
--
2.20.1
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On 1/14/19 4:47 PM, Pavel Hrdina wrote:
> If some program is attached to cgroup and that cgroup is removed before
> detaching the program it might not be freed and will remain in the
> system until it's rebooted.
>
> This would not be that big deal to workaround if we wouldn't use
> machined to track our guests. Systemd tries to be the nice guy and if
> there is no process attached to TransientUnit it will destroy the unit
> and remove the cgroup from system which will hit the kernel bug.
>
> In order to prevent this behavior of systemd we can move another process
> into the guest cgroup and the cgroup will remain active even if we kill
> qemu process while destroying guest. We can then detach our BPF program
> from that cgroup and call machined terminate API to cleanup the cgroup.
>
> Signed-off-by: Pavel Hrdina <phrdina@redhat.com>
> ---
> src/libvirt_private.syms | 1 +
> src/lxc/lxc_cgroup.c | 1 +
> src/qemu/qemu_cgroup.c | 6 ++-
> src/util/vircgroup.c | 16 ++++++++
> src/util/vircgroup.h | 1 +
> src/util/vircgrouppriv.h | 2 +
> src/util/vircgroupv2.c | 2 +
> src/util/vircgroupv2devices.c | 70 ++++++++++++++++++++++++++++++++++-
> src/util/virsystemd.c | 2 +-
> src/util/virsystemd.h | 2 +
> 10 files changed, 99 insertions(+), 4 deletions(-)
> +static pid_t
> +virCgroupV2DevicesStartDummyProc(virCgroupPtr group)
> +{
> + pid_t ret = -1;
> + pid_t pid = -1;
> + virCommandPtr cmd = NULL;
> + VIR_AUTOFREE(char *) pidfile = NULL;
> + VIR_AUTOFREE(char *) fileName = NULL;
> +
> + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0)
> + return -1;
> +
> + pidfile = virPidFileBuildPath(group->stateDir, fileName);
> + if (!pidfile)
> + return -1;
> +
> + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null",
> + "-s", "3600", NULL);
> + if (!cmd)
> + return -1;
> +
> + virCommandDaemonize(cmd);
> + virCommandSetPidFile(cmd, pidfile);
> +
> + if (virCommandRun(cmd, NULL) < 0)
> + goto cleanup;
> +
> + if (virPidFileReadPath(pidfile, &pid) < 0) {
> + virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("failed to start dummy cgroup helper"));
> + goto cleanup;
> + }
> +
> + if (virCgroupAddMachineProcess(group, pid) < 0)
> + goto cleanup;
> +
> + ret = pid;
> + cleanup:
> + if (ret < 0) {
> + virCommandAbort(cmd);
> + if (pid > 0)
> + virProcessKillPainfully(pid, true);
> + }
> + if (pidfile)
> + unlink(pidfile);
> + virCommandFree(cmd);
> + return ret;
> +}
I don't think we should bother with this. This is clearly a kernel bug
and once fixed (which we can't detect) we would still be placing this
'tail' into the cgroup for nothing. I know we have lots of workarounds
for bugs in other libraries (and even probably kernels too) in our code,
but we should start getting rid of those instead of introducing new ones.
Michal
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On Mon, Jan 14, 2019 at 04:47:50PM +0100, Pavel Hrdina wrote:
> If some program is attached to cgroup and that cgroup is removed before
> detaching the program it might not be freed and will remain in the
> system until it's rebooted.
>
> This would not be that big deal to workaround if we wouldn't use
> machined to track our guests. Systemd tries to be the nice guy and if
> there is no process attached to TransientUnit it will destroy the unit
> and remove the cgroup from system which will hit the kernel bug.
>
> In order to prevent this behavior of systemd we can move another process
> into the guest cgroup and the cgroup will remain active even if we kill
> qemu process while destroying guest. We can then detach our BPF program
> from that cgroup and call machined terminate API to cleanup the cgroup.
The kernel bugzilla says that "bpftool" still reports existance of
the BPF program when this situation hits. So can't we just use
bpftool to explicitly delete this orphaned program
> +static pid_t
> +virCgroupV2DevicesStartDummyProc(virCgroupPtr group)
> +{
> + pid_t ret = -1;
> + pid_t pid = -1;
> + virCommandPtr cmd = NULL;
> + VIR_AUTOFREE(char *) pidfile = NULL;
> + VIR_AUTOFREE(char *) fileName = NULL;
> +
> + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0)
> + return -1;
> +
> + pidfile = virPidFileBuildPath(group->stateDir, fileName);
> + if (!pidfile)
> + return -1;
> +
> + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null",
> + "-s", "3600", NULL);
> + if (!cmd)
> + return -1;
> +
> + virCommandDaemonize(cmd);
> + virCommandSetPidFile(cmd, pidfile);
> +
> + if (virCommandRun(cmd, NULL) < 0)
> + goto cleanup;
> +
> + if (virPidFileReadPath(pidfile, &pid) < 0) {
> + virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("failed to start dummy cgroup helper"));
> + goto cleanup;
> + }
> +
> + if (virCgroupAddMachineProcess(group, pid) < 0)
> + goto cleanup;
> +
> + ret = pid;
> + cleanup:
> + if (ret < 0) {
> + virCommandAbort(cmd);
> + if (pid > 0)
> + virProcessKillPainfully(pid, true);
> + }
> + if (pidfile)
> + unlink(pidfile);
> + virCommandFree(cmd);
> + return ret;
> +}
I think this is really horrible to be honest.
If there's really no other option, then I'd prefer that we do not use
BPF at all on known broken kernel versions until it is fixed.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
On Thu, Jan 24, 2019 at 11:06:51AM +0000, Daniel P. Berrangé wrote:
> On Mon, Jan 14, 2019 at 04:47:50PM +0100, Pavel Hrdina wrote:
> > If some program is attached to cgroup and that cgroup is removed before
> > detaching the program it might not be freed and will remain in the
> > system until it's rebooted.
> >
> > This would not be that big deal to workaround if we wouldn't use
> > machined to track our guests. Systemd tries to be the nice guy and if
> > there is no process attached to TransientUnit it will destroy the unit
> > and remove the cgroup from system which will hit the kernel bug.
> >
> > In order to prevent this behavior of systemd we can move another process
> > into the guest cgroup and the cgroup will remain active even if we kill
> > qemu process while destroying guest. We can then detach our BPF program
> > from that cgroup and call machined terminate API to cleanup the cgroup.
>
> The kernel bugzilla says that "bpftool" still reports existance of
> the BPF program when this situation hits. So can't we just use
> bpftool to explicitly delete this orphaned program
Unfortunately no, bpftool doesn't have any way how to delete any
programs.
> > +static pid_t
> > +virCgroupV2DevicesStartDummyProc(virCgroupPtr group)
> > +{
> > + pid_t ret = -1;
> > + pid_t pid = -1;
> > + virCommandPtr cmd = NULL;
> > + VIR_AUTOFREE(char *) pidfile = NULL;
> > + VIR_AUTOFREE(char *) fileName = NULL;
> > +
> > + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0)
> > + return -1;
> > +
> > + pidfile = virPidFileBuildPath(group->stateDir, fileName);
> > + if (!pidfile)
> > + return -1;
> > +
> > + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null",
> > + "-s", "3600", NULL);
> > + if (!cmd)
> > + return -1;
> > +
> > + virCommandDaemonize(cmd);
> > + virCommandSetPidFile(cmd, pidfile);
> > +
> > + if (virCommandRun(cmd, NULL) < 0)
> > + goto cleanup;
> > +
> > + if (virPidFileReadPath(pidfile, &pid) < 0) {
> > + virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> > + _("failed to start dummy cgroup helper"));
> > + goto cleanup;
> > + }
> > +
> > + if (virCgroupAddMachineProcess(group, pid) < 0)
> > + goto cleanup;
> > +
> > + ret = pid;
> > + cleanup:
> > + if (ret < 0) {
> > + virCommandAbort(cmd);
> > + if (pid > 0)
> > + virProcessKillPainfully(pid, true);
> > + }
> > + if (pidfile)
> > + unlink(pidfile);
> > + virCommandFree(cmd);
> > + return ret;
> > +}
>
> I think this is really horrible to be honest.
>
> If there's really no other option, then I'd prefer that we do not use
> BPF at all on known broken kernel versions until it is fixed.
I agree with you and Michal, it's ugly and most likely the only
solution with broken kernel and systemd. I wrote the patch to show
how we could workaround the kernel bug and I was hoping that we will
agree on not using it.
Another possible workaround would be to not use machined and create the
systemd unit by other APIs, that way systemd would not remove the cgroup
and we would be able to detach programs before removing cgroups.
Or just simply wait until kernel fixes the issue.
Pavel
--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
© 2016 - 2026 Red Hat, Inc.