>From e28cb2a03a670e4c0e7641f68f9d9f3accb00ae0 Mon Sep 17 00:00:00 2001
From: Yilu Lin <linyilu@huawei.com>
Date: Tue, 4 Aug 2020 02:42:00 -0400
Subject: [PATCH] fix vm schizencephaly when heartbeat stoped
Signed-off-by: Yilu Lin <linyilu@huawei.com>
If keepalive messages lost in finish step, vm maybe schizencephaly.
Shutdown src vm for protection.
---
src/qemu/qemu_migration.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 2c7bf34..b8296ba 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -4136,6 +4136,8 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriverPtr driver,
int cookieoutlen = 0;
int ret = -1;
virErrorPtr orig_err = NULL;
+ virErrorPtr finish_err = NULL;
+ bool living = true;
bool cancelled = true;
virStreamPtr st = NULL;
unsigned long destflags;
@@ -4394,7 +4396,16 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriverPtr driver,
* The lock manager plugins should take care of
* safety in this scenario.
*/
- cancelled = ddomain == NULL;
+ if (!cancelled && !ddomain)
+ finish_err = virSaveLastError();
+
+ if (finish_err && finish_err->message &&
+ strstr(finish_err->message, "received hangup / error event on socket")) {
+ living = false;
+ VIR_ERROR(_("keepalive messages lost in finish step, shutdown src vm for protection"));
+ } else {
+ cancelled = ddomain == NULL;
+ }
/* If finish3 set an error, and we don't have an earlier
* one we need to preserve it in case confirm3 overwrites
@@ -4427,10 +4438,17 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriverPtr driver,
virObjectUnref(ddomain);
ret = 0;
} else {
- ret = -1;
+ if (!living)
+ ret = 0;
+ else
+ ret = -1;
}
virObjectUnref(st);
+ if (finish_err) {
+ virSetError(finish_err);
+ virFreeError(finish_err);
+ }
virErrorRestore(&orig_err);
VIR_FREE(uri_out);
--
2.19.1
On 8/4/20 4:21 AM, Yilu Lin wrote: >>From e28cb2a03a670e4c0e7641f68f9d9f3accb00ae0 Mon Sep 17 00:00:00 2001 > From: Yilu Lin <linyilu@huawei.com> > Date: Tue, 4 Aug 2020 02:42:00 -0400 > Subject: [PATCH] fix vm schizencephaly when heartbeat stoped > > Signed-off-by: Yilu Lin <linyilu@huawei.com> > > If keepalive messages lost in finish step, vm maybe schizencephaly. > Shutdown src vm for protection. Typo in commit msg: stoped -> stopped. Also, I had to translate 'schizencephaly' when I realized this wasn't a weird typo. It got translated to portuguese, and I still needed to look it up to understand the meaning. Look, I'm all up to use out of domain expressions/words in commit messages, but this word is too niche to be used out of medical context, and some people (like myself) will need to look it up the meaning. This goes against the idea of the commit message giving a clear explanation of the change. I'll not discuss whether the word is too "spicy" to be used or not because the previous point is good enough to justify changing it. My suggestion is to use 'malformation' or 'corruption' instead. One more thing below: > > --- > src/qemu/qemu_migration.c | 22 ++++++++++++++++++++-- > 1 file changed, 20 insertions(+), 2 deletions(-) > > diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c > index 2c7bf34..b8296ba 100644 > --- a/src/qemu/qemu_migration.c > +++ b/src/qemu/qemu_migration.c > @@ -4136,6 +4136,8 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriverPtr driver, > int cookieoutlen = 0; > int ret = -1; > virErrorPtr orig_err = NULL; > + virErrorPtr finish_err = NULL; > + bool living = true; > bool cancelled = true; > virStreamPtr st = NULL; > unsigned long destflags; > @@ -4394,7 +4396,16 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriverPtr driver, > * The lock manager plugins should take care of > * safety in this scenario. > */ > - cancelled = ddomain == NULL; > + if (!cancelled && !ddomain) > + finish_err = virSaveLastError(); > + > + if (finish_err && finish_err->message && > + strstr(finish_err->message, "received hangup / error event on socket")) { > + living = false; > + VIR_ERROR(_("keepalive messages lost in finish step, shutdown src vm for protection")); > + } else { > + cancelled = ddomain == NULL; > + } > > /* If finish3 set an error, and we don't have an earlier > * one we need to preserve it in case confirm3 overwrites > @@ -4427,10 +4438,17 @@ qemuMigrationSrcPerformPeer2Peer3(virQEMUDriverPtr driver, > virObjectUnref(ddomain); > ret = 0; > } else { > - ret = -1; > + if (!living) > + ret = 0; The logic is OK, but the flag being called 'living' here is a bit counter-intuitive. 'failsafeShutdownTriggered' or anything more in line with what the flag represents would be better. Thanks, DHB > + else > + ret = -1; > } > > virObjectUnref(st); > + if (finish_err) { > + virSetError(finish_err); > + virFreeError(finish_err); > + } > > virErrorRestore(&orig_err); > VIR_FREE(uri_out); >
© 2016 - 2024 Red Hat, Inc.