[libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused

Posted by Jiri Denemark 3 years, 8 months ago
QEMU keeps guest CPUs running even in postcopy-paused migration state so
that processes that already have all memory pages they need migrated to
the destination can keep running. However, this behavior might bring
unexpected delays in interprocess communication as some processes will
be stopped until migration is recovered and their memory pages are migrated.
So let's make sure all guest CPUs are paused while postcopy migration is
paused.
---

Notes:
    Version 2:
    - new patch

    - this patch does not currently work as QEMU cannot handle "stop"
      QMP command while in postcopy-paused state... the monitor just
      hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )
    - an ideal solution of the QEMU bug would be if QEMU itself paused
      the CPUs for us and we just got notified about it via QMP events
    - but Peter Xu thinks this behavior is actually worse than keeping
      vCPUs running
    - so let's take this patch as a base for discussing what we should
      be doing with vCPUs in postcopy-paused migration state

 src/qemu/qemu_domain.c    |  1 +
 src/qemu/qemu_domain.h    |  1 +
 src/qemu/qemu_driver.c    | 30 +++++++++++++++++++++++++
 src/qemu/qemu_migration.c | 47 +++++++++++++++++++++++++++++++++++++++
 src/qemu/qemu_migration.h |  6 +++++
 src/qemu/qemu_process.c   | 32 ++++++++++++++++++++++++++
 6 files changed, 117 insertions(+)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index d04ec6cd0c..dcd6d5e1b5 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -11115,6 +11115,7 @@ qemuProcessEventFree(struct qemuProcessEvent *event)
         break;
     case QEMU_PROCESS_EVENT_PR_DISCONNECT:
     case QEMU_PROCESS_EVENT_UNATTENDED_MIGRATION:
+    case QEMU_PROCESS_EVENT_MIGRATION_CPU_STATE:
     case QEMU_PROCESS_EVENT_LAST:
         break;
     }
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 153dfe3a23..f5cdb2235f 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -427,6 +427,7 @@ typedef enum {
     QEMU_PROCESS_EVENT_GUEST_CRASHLOADED,
     QEMU_PROCESS_EVENT_MEMORY_DEVICE_SIZE_CHANGE,
     QEMU_PROCESS_EVENT_UNATTENDED_MIGRATION,
+    QEMU_PROCESS_EVENT_MIGRATION_CPU_STATE,
 
     QEMU_PROCESS_EVENT_LAST
 } qemuProcessEventType;
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 637106f1b3..d0498ef2aa 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -4255,6 +4255,33 @@ processMemoryDeviceSizeChange(virQEMUDriver *driver,
 }
 
 
+static void
+processMigrationCPUState(virDomainObj *vm,
+                         virDomainState state,
+                         int reason)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+    virQEMUDriver *driver = priv->driver;
+
+    if (qemuDomainObjBeginJob(driver, vm, VIR_JOB_MIGRATION_SAFE) < 0)
+        return;
+
+    if (!virDomainObjIsActive(vm)) {
+        VIR_DEBUG("Domain '%s' is not running", vm->def->name);
+        goto endjob;
+    }
+
+    if (priv->job.asyncJob == VIR_ASYNC_JOB_MIGRATION_IN &&
+        virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_IN)) {
+        qemuMigrationUpdatePostcopyCPUState(vm, state, reason,
+                                            VIR_ASYNC_JOB_NONE);
+    }
+
+ endjob:
+    qemuDomainObjEndJob(vm);
+}
+
+
 static void qemuProcessEventHandler(void *data, void *opaque)
 {
     struct qemuProcessEvent *processEvent = data;
@@ -4312,6 +4339,9 @@ static void qemuProcessEventHandler(void *data, void *opaque)
                                        processEvent->action,
                                        processEvent->status);
         break;
+    case QEMU_PROCESS_EVENT_MIGRATION_CPU_STATE:
+        processMigrationCPUState(vm, processEvent->action, processEvent->status);
+        break;
     case QEMU_PROCESS_EVENT_LAST:
         break;
     }
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 0314fb1148..58d7009363 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -6831,6 +6831,53 @@ qemuMigrationProcessUnattended(virQEMUDriver *driver,
 }
 
 
+void
+qemuMigrationUpdatePostcopyCPUState(virDomainObj *vm,
+                                    virDomainState state,
+                                    int reason,
+                                    int asyncJob)
+{
+    virQEMUDriver *driver = QEMU_DOMAIN_PRIVATE(vm)->driver;
+    int current;
+
+    if (state == VIR_DOMAIN_PAUSED) {
+        VIR_DEBUG("Post-copy migration of domain '%s' was paused, stopping guest CPUs",
+                  vm->def->name);
+    } else {
+        VIR_DEBUG("Post-copy migration of domain '%s' was resumed, starting guest CPUs",
+                  vm->def->name);
+    }
+
+    if (virDomainObjGetState(vm, &current) == state) {
+        int eventType = -1;
+        int eventDetail = -1;
+
+        if (current == reason) {
+            VIR_DEBUG("Guest CPUs are already in the right state");
+            return;
+        }
+
+        VIR_DEBUG("Fixing domain state reason");
+        if (state == VIR_DOMAIN_PAUSED) {
+            eventType = VIR_DOMAIN_EVENT_SUSPENDED;
+            eventDetail = qemuDomainPausedReasonToSuspendedEvent(reason);
+        } else {
+            eventType = VIR_DOMAIN_EVENT_RESUMED;
+            eventDetail = qemuDomainRunningReasonToResumeEvent(reason);
+        }
+        virDomainObjSetState(vm, state, reason);
+        qemuDomainSaveStatus(vm);
+        virObjectEventStateQueue(driver->domainEventState,
+                                 virDomainEventLifecycleNewFromObj(vm, eventType,
+                                                                   eventDetail));
+    } else if (state == VIR_DOMAIN_PAUSED) {
+        qemuProcessStopCPUs(driver, vm, reason, asyncJob);
+    } else {
+        qemuProcessStartCPUs(driver, vm, reason, asyncJob);
+    }
+}
+
+
 /* Helper function called while vm is active.  */
 int
 qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm,
diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h
index fbc0549b34..a1e2d8d171 100644
--- a/src/qemu/qemu_migration.h
+++ b/src/qemu/qemu_migration.h
@@ -224,6 +224,12 @@ qemuMigrationProcessUnattended(virQEMUDriver *driver,
                                virDomainAsyncJob job,
                                qemuMonitorMigrationStatus status);
 
+void
+qemuMigrationUpdatePostcopyCPUState(virDomainObj *vm,
+                                    virDomainState state,
+                                    int reason,
+                                    int asyncJob);
+
 bool
 qemuMigrationSrcIsAllowed(virQEMUDriver *driver,
                           virDomainObj *vm,
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index ad529dabb4..7fff68c0db 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -1521,6 +1521,10 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED,
              * Thus we need to handle the event here. */
             qemuMigrationSrcPostcopyFailed(vm);
             qemuDomainSaveStatus(vm);
+        } else if (priv->job.asyncJob == VIR_ASYNC_JOB_MIGRATION_IN) {
+            qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_MIGRATION_CPU_STATE,
+                                   VIR_DOMAIN_PAUSED,
+                                   VIR_DOMAIN_PAUSED_POSTCOPY_FAILED, NULL);
         }
         break;
 
@@ -1547,6 +1551,12 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED,
             event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail);
             qemuDomainSaveStatus(vm);
         }
+
+        if (priv->job.asyncJob == VIR_ASYNC_JOB_MIGRATION_IN) {
+            qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_MIGRATION_CPU_STATE,
+                                   VIR_DOMAIN_RUNNING,
+                                   VIR_DOMAIN_RUNNING_POSTCOPY, NULL);
+        }
         break;
 
     case QEMU_MONITOR_MIGRATION_STATUS_COMPLETED:
@@ -3703,10 +3713,32 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
         if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) {
             VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended",
                       vm->def->name);
+
+            if (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN &&
+                state == VIR_DOMAIN_PAUSED) {
+                qemuMigrationUpdatePostcopyCPUState(vm, VIR_DOMAIN_RUNNING,
+                                                    VIR_DOMAIN_RUNNING_POSTCOPY,
+                                                    VIR_ASYNC_JOB_NONE);
+            } else {
+                if (state == VIR_DOMAIN_RUNNING)
+                    reason = VIR_DOMAIN_RUNNING_POSTCOPY;
+                else
+                    reason = VIR_DOMAIN_PAUSED_POSTCOPY;
+
+                virDomainObjSetState(vm, state, reason);
+            }
+
             qemuProcessRestoreMigrationJob(vm, job);
             return 0;
         }
 
+        if (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN &&
+            migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY_PAUSED) {
+            qemuMigrationUpdatePostcopyCPUState(vm, VIR_DOMAIN_PAUSED,
+                                                VIR_DOMAIN_PAUSED_POSTCOPY,
+                                                VIR_ASYNC_JOB_NONE);
+        }
+
         if (migStatus != VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED) {
             if (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT)
                 qemuMigrationSrcPostcopyFailed(vm);
-- 
2.35.1
Re: [libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused
Posted by Peter Krempa 3 years, 8 months ago
On Wed, Jun 01, 2022 at 14:50:21 +0200, Jiri Denemark wrote:
> QEMU keeps guest CPUs running even in postcopy-paused migration state so
> that processes that already have all memory pages they need migrated to
> the destination can keep running. However, this behavior might bring
> unexpected delays in interprocess communication as some processes will
> be stopped until migration is recovered and their memory pages are migrated.
> So let's make sure all guest CPUs are paused while postcopy migration is
> paused.
> ---
> 
> Notes:
>     Version 2:
>     - new patch
> 
>     - this patch does not currently work as QEMU cannot handle "stop"
>       QMP command while in postcopy-paused state... the monitor just
>       hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )

Does it then somehow self-heal? Because if not ...

>     - an ideal solution of the QEMU bug would be if QEMU itself paused
>       the CPUs for us and we just got notified about it via QMP events
>     - but Peter Xu thinks this behavior is actually worse than keeping
>       vCPUs running
>     - so let's take this patch as a base for discussing what we should
>       be doing with vCPUs in postcopy-paused migration state
> 
>  src/qemu/qemu_domain.c    |  1 +
>  src/qemu/qemu_domain.h    |  1 +
>  src/qemu/qemu_driver.c    | 30 +++++++++++++++++++++++++
>  src/qemu/qemu_migration.c | 47 +++++++++++++++++++++++++++++++++++++++
>  src/qemu/qemu_migration.h |  6 +++++
>  src/qemu/qemu_process.c   | 32 ++++++++++++++++++++++++++
>  6 files changed, 117 insertions(+)

[...]

> diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
> index 0314fb1148..58d7009363 100644
> --- a/src/qemu/qemu_migration.c
> +++ b/src/qemu/qemu_migration.c
> @@ -6831,6 +6831,53 @@ qemuMigrationProcessUnattended(virQEMUDriver *driver,
>  }
>  
>  
> +void
> +qemuMigrationUpdatePostcopyCPUState(virDomainObj *vm,
> +                                    virDomainState state,
> +                                    int reason,
> +                                    int asyncJob)
> +{
> +    virQEMUDriver *driver = QEMU_DOMAIN_PRIVATE(vm)->driver;
> +    int current;
> +
> +    if (state == VIR_DOMAIN_PAUSED) {
> +        VIR_DEBUG("Post-copy migration of domain '%s' was paused, stopping guest CPUs",
> +                  vm->def->name);
> +    } else {
> +        VIR_DEBUG("Post-copy migration of domain '%s' was resumed, starting guest CPUs",
> +                  vm->def->name);
> +    }
> +
> +    if (virDomainObjGetState(vm, &current) == state) {
> +        int eventType = -1;
> +        int eventDetail = -1;
> +
> +        if (current == reason) {
> +            VIR_DEBUG("Guest CPUs are already in the right state");
> +            return;
> +        }
> +
> +        VIR_DEBUG("Fixing domain state reason");
> +        if (state == VIR_DOMAIN_PAUSED) {
> +            eventType = VIR_DOMAIN_EVENT_SUSPENDED;
> +            eventDetail = qemuDomainPausedReasonToSuspendedEvent(reason);
> +        } else {
> +            eventType = VIR_DOMAIN_EVENT_RESUMED;
> +            eventDetail = qemuDomainRunningReasonToResumeEvent(reason);
> +        }
> +        virDomainObjSetState(vm, state, reason);
> +        qemuDomainSaveStatus(vm);
> +        virObjectEventStateQueue(driver->domainEventState,
> +                                 virDomainEventLifecycleNewFromObj(vm, eventType,
> +                                                                   eventDetail));
> +    } else if (state == VIR_DOMAIN_PAUSED) {
> +        qemuProcessStopCPUs(driver, vm, reason, asyncJob);

Then this will obviously break our ability to control qemu. If that is
forever, then we certainly should not be doing this.

In which case, if we want to go ahead with pausing it ourselves, then
once qemu fixes the issue you've mentioned above they also need to add
a 'feature' flag into QMP which we can probe, so that we avoid
knowingly breaking qemu.
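
For illustration only: such a guarded stop could look roughly like the
sketch below. This is not part of the patch; the
QEMU_CAPS_POSTCOPY_PAUSED_STOP capability name is hypothetical and would
only exist once QEMU gains such a feature flag, while virQEMUCapsGet()
and qemuProcessStopCPUs() are the existing helpers already used
elsewhere in this series.

    static int
    qemuMigrationMaybeStopPostcopyCPUs(virQEMUDriver *driver,
                                       virDomainObj *vm,
                                       int reason,
                                       int asyncJob)
    {
        qemuDomainObjPrivate *priv = vm->privateData;

        /* Hypothetical capability: QEMU would advertise that "stop" is
         * safe while migration is in postcopy-paused.  The flag does not
         * exist today and would come together with the fix for the
         * monitor hang. */
        if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_POSTCOPY_PAUSED_STOP)) {
            VIR_DEBUG("QEMU cannot handle 'stop' in postcopy-paused, "
                      "leaving vCPUs of domain '%s' running", vm->def->name);
            return 0;
        }

        return qemuProcessStopCPUs(driver, vm, reason, asyncJob);
    }

With such a check, existing QEMU binaries would keep the current "leave
vCPUs running" behavior and only fixed binaries would opt in.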

> +    } else {
> +        qemuProcessStartCPUs(driver, vm, reason, asyncJob);
> +    }
> +}
> +
> +
>  /* Helper function called while vm is active.  */
>  int
>  qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm,
Re: [libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused
Posted by Jiri Denemark 3 years, 8 months ago
On Mon, Jun 06, 2022 at 15:37:45 +0200, Peter Krempa wrote:
> On Wed, Jun 01, 2022 at 14:50:21 +0200, Jiri Denemark wrote:
> > QEMU keeps guest CPUs running even in postcopy-paused migration state so
> > that processes that already have all memory pages they need migrated to
> > the destination can keep running. However, this behavior might bring
> > unexpected delays in interprocess communication as some processes will
> > be stopped until migration is recovered and their memory pages are migrated.
> > So let's make sure all guest CPUs are paused while postcopy migration is
> > paused.
> > ---
> > 
> > Notes:
> >     Version 2:
> >     - new patch
> > 
> >     - this patch does not currently work as QEMU cannot handle "stop"
> >       QMP command while in postcopy-paused state... the monitor just
> >       hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )
> 
> Does it then somehow self-heal? Because if not ...

> > +    } else if (state == VIR_DOMAIN_PAUSED) {
> > +        qemuProcessStopCPUs(driver, vm, reason, asyncJob);
> 
> Then this will obviously break our ability to control qemu. If that is
> forever, then we certainly should not be doing this.
> 
> In which case if we want to go ahead with pausing it ourselves, once
> qemu fixes the issue you've mentioned above, they need to also add a
> 'feature' flag into QMP which we can probe and avoid breaking qemu
> willingly.

Exactly. We either need QEMU to stop the CPUs by itself, or to fix the
bug and add a way for us to probe that it was fixed. Currently our code
would just hang waiting for a reply from QEMU.

Because of this, pushing the series even without this RFC patch (before
the QEMU issue is sorted out in some way) is actually better than
keeping the current "always pause, even if QEMU is still migrating"
behavior as with the current code we may get stuck after sending "stop"
while QEMU migration is in postcopy-paused.

Jirka
Re: [libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused
Posted by Daniel P. Berrangé 3 years, 8 months ago
On Wed, Jun 01, 2022 at 02:50:21PM +0200, Jiri Denemark wrote:
> QEMU keeps guest CPUs running even in postcopy-paused migration state so
> that processes that already have all memory pages they need migrated to
> the destination can keep running. However, this behavior might bring
> unexpected delays in interprocess communication as some processes will
> be stopped until migration is recovered and their memory pages are migrated.
> So let's make sure all guest CPUs are paused while postcopy migration is
> paused.
> ---
> 
> Notes:
>     Version 2:
>     - new patch
> 
>     - this patch does not currently work as QEMU cannot handle "stop"
>       QMP command while in postcopy-paused state... the monitor just
>       hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )
>     - an ideal solution of the QEMU bug would be if QEMU itself paused
>       the CPUs for us and we just got notified about it via QMP events
>     - but Peter Xu thinks this behavior is actually worse than keeping
>       vCPUs running

I'd like to know what the rationale is here ?

We've got a long history knowing the behaviour and impact when
pausing a VM as a whole. Of course some apps may have timeouts
that are hit if the paused time was too long, but overall this
scenario is not that different from a bare metal machine doing
suspend-to-ram. Application impact is limited & predictable and
generally well understood.

I don't think we can say the same about the behaviour & impact
on the guest OS if we selectively block execution of random
CPUs.  An OS where a certain physical CPU simply stops executing
is not a normal scenario that any application or OS is designed
to expect. I think the chance of the guest OS or application
breaking in a non-recoverable way is high. IOW, we might perform
post-copy recovery and all might look well from host POV, but
the guest OS/app is none the less broken.

The overriding goal for migration has to be to minimize the
danger to the guest OS and its applications, and I think that's
only viable if either the guest OS is running all CPUs or no
CPUs.

The length of outage for a CPU when post-copy transport is broken
is potentially orders of magnitude larger than the temporary
blockage while fetching a memory page asynchronously. The latter
is obviously not good for real-time sensitive apps, but most apps
and OS will cope with CPUs being stalled for 100's of milliseconds.
That isn't the case if CPUs get stalled for minutes, or even hours,
at a time due to a broken network link needing admin recovery work
in the host infra.

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
Re: [libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused
Posted by Peter Xu 3 years, 8 months ago
[copy Dave]

On Mon, Jun 06, 2022 at 12:29:39PM +0100, Daniel P. Berrangé wrote:
> On Wed, Jun 01, 2022 at 02:50:21PM +0200, Jiri Denemark wrote:
> > QEMU keeps guest CPUs running even in postcopy-paused migration state so
> > that processes that already have all memory pages they need migrated to
> > the destination can keep running. However, this behavior might bring
> > unexpected delays in interprocess communication as some processes will
> > be stopped until migration is recovered and their memory pages are migrated.
> > So let's make sure all guest CPUs are paused while postcopy migration is
> > paused.
> > ---
> > 
> > Notes:
> >     Version 2:
> >     - new patch
> > 
> >     - this patch does not currently work as QEMU cannot handle "stop"
> >       QMP command while in postcopy-paused state... the monitor just
> >       hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )
> >     - an ideal solution of the QEMU bug would be if QEMU itself paused
> >       the CPUs for us and we just got notified about it via QMP events
> >     - but Peter Xu thinks this behavior is actually worse than keeping
> >       vCPUs running
> 
> I'd like to know what the rationale is here ?

I think the wording here is definitely stronger than what I meant. :-)

My understanding was stopping the VM may or may not help the guest,
depending on the guest behavior at the point of migration failure.  And if
we're not 100% sure of that, doing nothing is the best we have, as
explicitly stopping the VM is something extra we do, and it's not part of
the requirements for either postcopy itself or the recovery routine.

Some examples below.

1) If many of the guest threads are doing cpu intensive work, and if the
needed pageset is already migrated, then stopping the vcpu threads means
they could have been running during this "downtime" but we forced them not
to.  Actually if the postcopy didn't pause immediately right after switch,
we could very possibly migrated the workload pages if the working set is
not very large.

2) If we're reaching the end of the postcopy phase and it paused, most of
the pages could have been migrated already.  So maybe only a few or even
no threads will be stopped due to remote page faults.

3) Think about kvm async page fault: that's a feature that the guest can do
to yield the guest thread when there's a page fault.  It means even if some
of the page faulted threads got stuck for a long time due to postcopy
pausing, the guest is "smart" to know it'll take a long time (userfaultfd
is a major fault, and as long as KVM gup won't get the page we put the page
fault into async pf queue) then the guest vcpu can explicitly schedule()
the faulted context and run some other threads that may not need to be
blocked.

What I wanted to say is I don't know whether assuming "stopping the VM will
be better than not doing so" will always be true here.  If it's case by
case I feel like the better way to do is to do nothing special.

> 
> We've got a long history knowing the behaviour and impact when
> pausing a VM as a whole. Of course some apps may have timeouts
> that are hit if the paused time was too long, but overall this
> scenario is not that different from a bare metal machine doing
> suspend-to-ram. Application impact is limited & predictable and
> genrally well understood.

My other question is, even if we stopped the VM then right after we resume
the VM won't many of those timeout()s trigger as well?  I think I asked
similar question to Jiri and the answer at that time was that we could have
not called the timeout() function, however I think it's not persuasive
enough as timeout() is the function that should take the major time so at
least we're not sure whether we'll be on it already.

My understanding is that a VM can work properly after a migration because
the guest timekeeping will gradually sync up with the real world time, so
if there's a major downtime triggered we can hardly avoid it affecting
the guest.  What we can do is if we know a software is in VM context we
should be robust on the timeout (and that's at least what I do on programs
even on bare metal because I'd assume the program be run on an extremely
busy host).

But I could be all wrong on that, because I don't know enough on the whole
rationale of the importance of stopping the VM in the past.

> 
> I don't think we can say the same about the behaviour & impact
> on the guest OS if we selectively block execution of random
> CPUs.  An OS where a certain physical CPU simply stops executing
> is not a normal scenario that any application or OS is designed
> to expect. I think the chance of the guest OS or application
> breaking in a non-recoverable way is high. IOW, we might perform
> post-copy recovery and all might look well from host POV, but
> the guest OS/app is none the less broken.
> 
> The overriding goal for migration has to be to minimize the
> danger to the guest OS and its applications, and I think that's
> only viable if either the guest OS is running all CPUs or no
> CPUs.

I agree.

> 
> The length of outage for a CPU when post-copy transport is broken
> is potentially orders of magnitude larger than the temporary
> blockage while fetching a memory page asynchronously. The latter
> is obviously not good for real-time sensitive apps, but most apps
> and OS will cope with CPUs being stalled for 100's of milliseconds.
> That isn't the case if CPUs get stalled for minutes, or even hours,
> at a time due to a broken network link needing admin recovery work
> in the host infra.

So let me also look at the issue of having 'vm stop' hang: no matter
whether we'd like an explicit vm_stop, that hang had better be avoided
from libvirt's POV.

Ideally it could be avoided but I need to look into it.  I think it can be
that the vm_stop was waiting for other vcpus to exit to userspace but those
didn't really come alive after the SIG_IPI was sent to them (in reality
that's SIGUSR1; and I'm pretty sure all vcpu threads can handle
SIGKILL.. so maybe I need to figure out where it got blocked in the
kernel).

I'll update either here or in the bug that Jiri opened when I get more
clues out of it.

Thanks,

-- 
Peter Xu

Re: [libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused
Posted by Daniel P. Berrangé 3 years, 8 months ago
On Mon, Jun 06, 2022 at 10:32:03AM -0400, Peter Xu wrote:
> [copy Dave]
> 
> On Mon, Jun 06, 2022 at 12:29:39PM +0100, Daniel P. Berrangé wrote:
> > On Wed, Jun 01, 2022 at 02:50:21PM +0200, Jiri Denemark wrote:
> > > QEMU keeps guest CPUs running even in postcopy-paused migration state so
> > > that processes that already have all memory pages they need migrated to
> > > the destination can keep running. However, this behavior might bring
> > > unexpected delays in interprocess communication as some processes will
> > > be stopped until migration is recovered and their memory pages are migrated.
> > > So let's make sure all guest CPUs are paused while postcopy migration is
> > > paused.
> > > ---
> > > 
> > > Notes:
> > >     Version 2:
> > >     - new patch
> > > 
> > >     - this patch does not currently work as QEMU cannot handle "stop"
> > >       QMP command while in postcopy-paused state... the monitor just
> > >       hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )
> > >     - an ideal solution of the QEMU bug would be if QEMU itself paused
> > >       the CPUs for us and we just got notified about it via QMP events
> > >     - but Peter Xu thinks this behavior is actually worse than keeping
> > >       vCPUs running
> > 
> > I'd like to know what the rationale is here ?
> 
> I think the wording here is definitely stronger than what I meant. :-)
> 
> My understanding was stopping the VM may or may not help the guest,
> depending on the guest behavior at the point of migration failure.  And if
> we're not 100% sure of that, doing nothing is the best we have, as
> explicitly stopping the VM is something extra we do, and it's not part of
> the requirements for either postcopy itself or the recovery routine.
> 
> Some examples below.
> 
> 1) If many of the guest threads are doing cpu intensive work, and if the
> needed pageset is already migrated, then stopping the vcpu threads means
> they could have been running during this "downtime" but we forced them not
> to.  Actually if the postcopy didn't pause immediately right after switch,
> we could very possibly have migrated the workload pages if the working set is
> not very large.
> 
> 2) If we're reaching the end of the postcopy phase and it paused, most of
> the pages could have been migrated already.  So maybe only a few or even
> no threads will be stopped due to remote page faults.
> 
> 3) Think about kvm async page fault: that's a feature that the guest can do
> to yield the guest thread when there's a page fault.  It means even if some
> of the page faulted threads got stuck for a long time due to postcopy
> pausing, the guest is "smart" to know it'll take a long time (userfaultfd
> is a major fault, and as long as KVM gup won't get the page we put the page
> fault into async pf queue) then the guest vcpu can explicitly schedule()
> the faulted context and run some other threads that may not need to be
> blocked.
> 
> What I wanted to say is I don't know whether assuming "stopping the VM will
> be better than not doing so" will always be true here.  If it's case by
> case I feel like the better way to do is to do nothing special.
> 
> > 
> > We've got a long history knowing the behaviour and impact when
> > pausing a VM as a whole. Of course some apps may have timeouts
> > that are hit if the paused time was too long, but overall this
> > scenario is not that different from a bare metal machine doing
> > suspend-to-ram. Application impact is limited & predictable and
> > generally well understood.
> 
> My other question is, even if we stopped the VM then right after we resume
> the VM won't many of those timeout()s trigger as well?  I think I asked
> similar question to Jiri and the answer at that time was that we could have
> not called the timeout() function, however I think it's not persuasive
> enough as timeout() is the function that should take the major time so at
> least we're not sure whether we'll be on it already.

It depends how you're measuring time.  If you're using real time
then upon resume you'll see a huge jump in time. If you're using
monotonic time then there is no jump in time at all.

If you don't want to be affected by changes in system clock, even
in bare metal you'd pick monotonic time. Real time would be for
timeouts where you absolutely need to work by a fixed point in
time.

So yes, in theory you can be affected by timeouts even in a basic
suspend/resume scenario across the whole VM, but at the same time
you can make yourself safe from that by using monotonic time.

With these post-copy stalls though, even monotonic time won't
help because monotonic time continues ticking even while the
individual CPU is blocked. This is a bad thing.
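
As a minimal self-contained illustration of that distinction (plain
POSIX clocks, not libvirt code): a timeout measured against
CLOCK_MONOTONIC survives a wall-clock jump after a whole-VM pause and
resume, but it gives no protection when the thread itself is stalled,
because the monotonic clock keeps advancing while the vCPU is blocked.

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    /* Toy timeout based on the monotonic clock: a jump in CLOCK_REALTIME
     * (e.g. the RTC resync after the whole VM was paused and resumed)
     * does not trip it, but a stalled thread still burns the budget. */
    int
    main(void)
    {
        struct timespec start, now;

        clock_gettime(CLOCK_MONOTONIC, &start);

        for (;;) {
            sleep(1);               /* stands in for one unit of work */

            clock_gettime(CLOCK_MONOTONIC, &now);
            if (now.tv_sec - start.tv_sec > 30) {
                fprintf(stderr, "operation timed out\n");
                return 1;
            }
        }
    }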

> My understanding is that a VM can work properly after a migration because
> the guest timekeeping will gradually sync up with the real world time, so
> if there's a major downtime triggered we can hardly avoid it affecting
> the guest.  What we can do is if we know a software is in VM context we
> should be robust on the timeout (and that's at least what I do on programs
> even on bare metal because I'd assume the program be run on an extremely
> busy host).
> 
> But I could be all wrong on that, because I don't know enough on the whole
> rationale of the importance of stopping the VM in the past.

Timeouts are not the only problem with selectively stopping CPUs,
just the most obvious.

Certain CPUs may be doing work that is critical to the operation
of processes running on other CPUs. One example would be RCU
threads which clean up resources - if a vCPU running RCU cleanup
got stalled, this can effectively become a resource leak. More
generally, if you have one thread doing some kind of garbage collection
work, that can also be a problem if it gets blocked while the other
threads producing garbage continue.
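
A toy user-space analogy of that hazard (purely illustrative, not taken
from the thread): one collector thread frees what a producer thread
allocates. If the vCPU running the collector is the one stalled on a
remote page fault, the list grows without bound even though the
producer still looks perfectly healthy.

    #include <pthread.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct node { struct node *next; };

    static struct node *garbage;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* The producer keeps queueing "garbage" for the collector to free. */
    static void *
    producer(void *opaque)
    {
        for (;;) {
            struct node *n = calloc(1, sizeof(*n));
            if (!n)
                break;
            pthread_mutex_lock(&lock);
            n->next = garbage;
            garbage = n;
            pthread_mutex_unlock(&lock);
            usleep(100);
        }
        return opaque;
    }

    /* The single collector: imagine this thread stalled for minutes. */
    static void *
    collector(void *opaque)
    {
        for (;;) {
            pthread_mutex_lock(&lock);
            struct node *list = garbage;
            garbage = NULL;
            pthread_mutex_unlock(&lock);
            while (list) {
                struct node *next = list->next;
                free(list);
                list = next;
            }
            usleep(1000);
        }
        return opaque;
    }

    int
    main(void)
    {
        pthread_t p, c;

        pthread_create(&p, NULL, producer, NULL);
        pthread_create(&c, NULL, collector, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);
        return 0;
    }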

Also consider that migration is invisible to the guest OS and its
administrator. They will have no idea that migration is taking place,
so when some process suddenly stops producing output, what are they
going to think & how are they going to know this is an artifact of a
broken migration shortly to be recovered? The selectively dead
applications are likely to cause the sysadmin to take bad action, as
again this kind of scenario is not something any real hardware ever
experiences.

Allowing CPUs to selectively keep running when post-copy breaks
and expecting the OS & apps to be OK is just wishful thinking
and will only ever work by luck. Immediately pausing the VM
when post-copy breaks will improve the chances that we will
get back a working VM. There's never going to be a 100%
guarantee, but at least we'd be in a situation which OS and
apps know can happen.


> > The length of outage for a CPU when post-copy transport is broken
> > is potentially orders of magnitude larger than the temporary
> > blockage while fetching a memory page asynchronously. The latter
> > is obviously not good for real-time sensitive apps, but most apps
> > and OS will cope with CPUs being stalled for 100's of milliseconds.
> > That isn't the case if CPUs get stalled for minutes, or even hours,
> > at a time due to a broken network link needing admin recovery work
> > in the host infra.
> 
> So let me also look at the issue on having vm stop hanged, no matter
> whether we'd like an explicit vm_stop that hang should better be avoided
> from libvirt pov.
> 
> Ideally it could be avoided but I need to look into it.  I think it can be
> that the vm_stop was waiting for other vcpus to exit to userspace but those
> didn't really come alive after the SIG_IPI sent to them (in reality that's
> SIGUSR1; and I'm pretty sure all vcpu threads can handle SIGKILL.. so maybe
> I need to figure out where it got blocked in the kernel).
> 
> I'll update either here or in the bug that Jiri opened when I get more
> clues out of it.
> 
> Thanks,
> 
> -- 
> Peter Xu
> 

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [libvirt PATCH v2 81/81] RFC: qemu: Keep vCPUs paused while migration is in postcopy-paused
Posted by Peter Xu 3 years, 8 months ago
[copy Dave, for real]

On Mon, Jun 06, 2022 at 10:32:03AM -0400, Peter Xu wrote:
> [copy Dave]
> 
> On Mon, Jun 06, 2022 at 12:29:39PM +0100, Daniel P. Berrangé wrote:
> > On Wed, Jun 01, 2022 at 02:50:21PM +0200, Jiri Denemark wrote:
> > > QEMU keeps guest CPUs running even in postcopy-paused migration state so
> > > that processes that already have all memory pages they need migrated to
> > > the destination can keep running. However, this behavior might bring
> > > unexpected delays in interprocess communication as some processes will
> > > be stopped until migration is recovered and their memory pages are migrated.
> > > So let's make sure all guest CPUs are paused while postcopy migration is
> > > paused.
> > > ---
> > > 
> > > Notes:
> > >     Version 2:
> > >     - new patch
> > > 
> > >     - this patch does not currently work as QEMU cannot handle "stop"
> > >       QMP command while in postcopy-paused state... the monitor just
> > >       hangs (see https://gitlab.com/qemu-project/qemu/-/issues/1052 )
> > >     - an ideal solution of the QEMU bug would be if QEMU itself paused
> > >       the CPUs for us and we just got notified about it via QMP events
> > >     - but Peter Xu thinks this behavior is actually worse than keeping
> > >       vCPUs running
> > 
> > I'd like to know what the rationale is here ?
> 
> I think the wording here is definitely stronger than what I meant. :-)
> 
> My understanding was stopping the VM may or may not help the guest,
> depending on the guest behavior at the point of migration failure.  And if
> we're not 100% sure of that, doing nothing is the best we have, as
> explicitly stopping the VM is something extra we do, and it's not part of
> the requirements for either postcopy itself or the recovery routine.
> 
> Some examples below.
> 
> 1) If many of the guest threads are doing cpu intensive work, and if the
> needed pageset is already migrated, then stopping the vcpu threads means
> they could have been running during this "downtime" but we forced them not
> to.  Actually if the postcopy didn't pause immediately right after switch,
> we could very possibly have migrated the workload pages if the working set is
> not very large.
> 
> 2) If we're reaching the end of the postcopy phase and it paused, most of
> the pages could have been migrated already.  So maybe only a few or even
> no threads will be stopped due to remote page faults.
> 
> 3) Think about kvm async page fault: that's a feature that the guest can do
> to yield the guest thread when there's a page fault.  It means even if some
> of the page faulted threads got stuck for a long time due to postcopy
> pausing, the guest is "smart" to know it'll take a long time (userfaultfd
> is a major fault, and as long as KVM gup won't get the page we put the page
> fault into async pf queue) then the guest vcpu can explicitly schedule()
> the faulted context and run some other threads that may not need to be
> blocked.
> 
> What I wanted to say is I don't know whether assuming "stopping the VM will
> be better than not doing so" will always be true here.  If it's case by
> case I feel like the better way to do is to do nothing special.
> 
> > 
> > We've got a long history knowing the behaviour and impact when
> > pausing a VM as a whole. Of course some apps may have timeouts
> > that are hit if the paused time was too long, but overall this
> > scenario is not that different from a bare metal machine doing
> > suspend-to-ram. Application impact is limited & predictable and
> > generally well understood.
> 
> My other question is, even if we stopped the VM then right after we resume
> the VM won't many of those timeout()s trigger as well?  I think I asked
> similar question to Jiri and the answer at that time was that we could have
> not called the timeout() function, however I think it's not persuasive
> enough as timeout() is the function that should take the major time so at
> least we're not sure whether we'll be on it already.
> 
> My understanding is that a VM can work properly after a migration because
> the guest timekeeping will gradually sync up with the real world time, so
> if there's a major downtime triggered we can hardly avoid it affecting
> the guest.  What we can do is if we know a software is in VM context we
> should be robust on the timeout (and that's at least what I do on programs
> even on bare metal because I'd assume the program be run on an extremely
> busy host).
> 
> But I could be all wrong on that, because I don't know enough on the whole
> rationale of the importance of stopping the VM in the past.
> 
> > 
> > I don't think we can say the same about the behaviour & impact
> > on the guest OS if we selectively block execution of random
> > CPUs.  An OS where a certain physical CPU simply stops executing
> > is not a normal scenario that any application or OS is designed
> > to expect. I think the chance of the guest OS or application
> > breaking in a non-recoverable way is high. IOW, we might perform
> > post-copy recovery and all might look well from host POV, but
> > the guest OS/app is none the less broken.
> > 
> > The overriding goal for migration has to be to minimize the
> > danger to the guest OS and its applications, and I think that's
> > only viable if either the guest OS is running all CPUs or no
> > CPUs.
> 
> I agree.
> 
> > 
> > The length of outage for a CPU when post-copy transport is broken
> > is potentially orders of magnitude larger than the temporary
> > blockage while fetching a memory page asynchronously. The latter
> > is obviously not good for real-time sensitive apps, but most apps
> > and OS will cope with CPUs being stalled for 100's of milliseconds.
> > That isn't the case if CPUs get stalled for minutes, or even hours,
> > at a time due to a broken network link needing admin recovery work
> > in the host infra.
> 
> So let me also look at the issue on having vm stop hanged, no matter
> whether we'd like an explicit vm_stop that hang should better be avoided
> from libvirt pov.
> 
> Ideally it could be avoided but I need to look into it.  I think it can be
> that the vm_stop was waiting for other vcpus to exit to userspace but those
> didn't really come alive after the SIG_IPI sent to them (in reality that's
> SIGUSR1; and I'm pretty sure all vcpu threads can handle SIGKILL.. so maybe
> I need to figure out where it got blocked in the kernel).
> 
> I'll update either here or in the bug that Jiri opened when I get more
> clues out of it.
> 
> Thanks,
> 
> -- 
> Peter Xu

-- 
Peter Xu