[PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()

Arun Menon posted 27 patches 4 weeks, 1 day ago
Maintainers: Stefan Berger <stefanb@linux.vnet.ibm.com>, Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>, "Alex Bennée" <alex.bennee@linaro.org>, Akihiko Odaki <odaki@rsg.ci.i.u-tokyo.ac.jp>, Dmitry Osipenko <dmitry.osipenko@collabora.com>, "Michael S. Tsirkin" <mst@redhat.com>, Marcel Apfelbaum <marcel.apfelbaum@gmail.com>, Nicholas Piggin <npiggin@gmail.com>, Harsh Prateek Bora <harshpb@linux.ibm.com>, Cornelia Huck <cohuck@redhat.com>, Halil Pasic <pasic@linux.ibm.com>, Eric Farman <farman@linux.ibm.com>, Richard Henderson <richard.henderson@linaro.org>, David Hildenbrand <david@redhat.com>, Ilya Leoshkevich <iii@linux.ibm.com>, Christian Borntraeger <borntraeger@linux.ibm.com>, Matthew Rosato <mjrosato@linux.ibm.com>, Thomas Huth <thuth@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>, Fam Zheng <fam@euphon.net>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, Hailiang Zhang <zhanghailiang@xfusion.com>, Steve Sistare <steven.sistare@oracle.com>, Peter Maydell <peter.maydell@linaro.org>, "Marc-André Lureau" <marcandre.lureau@redhat.com>
There is a newer version of this series
[PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 4 weeks, 1 day ago
This is an incremental step in converting vmstate loading
code to report error via Error objects instead of directly
printing it to console/monitor.
It is ensured that qemu_loadvm_state() must report an error
in errp, in case of failure.

When postcopy live migration runs, the device states are loaded by
both the qemu coroutine process_incoming_migration_co() and the
postcopy_ram_listen_thread(). Therefore, it is important that the
coroutine also reports the error in case of failure, with
error_report_err(). Otherwise, the source qemu will not display
any errors before going into the postcopy pause state.

Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Signed-off-by: Arun Menon <armenon@redhat.com>
---
 migration/migration.c |  9 +++++----
 migration/savevm.c    | 30 ++++++++++++++++++------------
 migration/savevm.h    |  2 +-
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
                       MIGRATION_STATUS_ACTIVE);
 
     mis->loadvm_co = qemu_coroutine_self();
-    ret = qemu_loadvm_state(mis->from_src_file);
+    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
     mis->loadvm_co = NULL;
 
     trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
@@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
     }
 
     if (ret < 0) {
-        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
+        error_prepend(&local_err, "load of migration failed: %s: ",
+                      strerror(-ret));
         goto fail;
     }
 
@@ -924,13 +925,13 @@ fail:
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_FAILED);
     migrate_set_error(s, local_err);
-    error_free(local_err);
+    error_report_err(local_err);
 
     migration_incoming_state_destroy();
 
     if (mis->exit_on_error) {
         WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
-            error_report_err(s->error);
+            error_free(s->error);
             s->error = NULL;
         }
 
diff --git a/migration/savevm.c b/migration/savevm.c
index de5671ffd1cd06e728227a3056c3f895d3a6e6f3..0087fca15ce108685667d3808350d80d37b807b1 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -3159,28 +3159,24 @@ out:
     return ret;
 }
 
-int qemu_loadvm_state(QEMUFile *f)
+int qemu_loadvm_state(QEMUFile *f, Error **errp)
 {
     MigrationState *s = migrate_get_current();
     MigrationIncomingState *mis = migration_incoming_get_current();
-    Error *local_err = NULL;
     int ret;
 
-    if (qemu_savevm_state_blocked(&local_err)) {
-        error_report_err(local_err);
+    if (qemu_savevm_state_blocked(errp)) {
         return -EINVAL;
     }
 
     qemu_loadvm_thread_pool_create(mis);
 
-    ret = qemu_loadvm_state_header(f, &local_err);
+    ret = qemu_loadvm_state_header(f, errp);
     if (ret) {
-        error_report_err(local_err);
         return ret;
     }
 
-    if (qemu_loadvm_state_setup(f, &local_err) != 0) {
-        error_report_err(local_err);
+    if (qemu_loadvm_state_setup(f, errp) != 0) {
         return -EINVAL;
     }
 
@@ -3191,6 +3187,9 @@ int qemu_loadvm_state(QEMUFile *f)
     cpu_synchronize_all_pre_loadvm();
 
     ret = qemu_loadvm_state_main(f, mis);
+    if (ret < 0) {
+        error_setg(errp, "Load VM state failed: %d", ret);
+    }
     qemu_event_set(&mis->main_thread_load_event);
 
     trace_qemu_loadvm_state_post_main(ret);
@@ -3208,8 +3207,15 @@ int qemu_loadvm_state(QEMUFile *f)
         if (migrate_has_error(migrate_get_current()) ||
             !qemu_loadvm_thread_pool_wait(s, mis)) {
             ret = -EINVAL;
+            error_setg(errp,
+                       "Error while loading vmstate");
         } else {
             ret = qemu_file_get_error(f);
+            if (ret < 0) {
+                error_setg(errp,
+                           "Error while loading vmstate: stream error: %d",
+                           ret);
+            }
         }
     }
     /*
@@ -3474,6 +3480,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
 
 void qmp_xen_load_devices_state(const char *filename, Error **errp)
 {
+    ERRP_GUARD();
     QEMUFile *f;
     QIOChannelFile *ioc;
     int ret;
@@ -3495,10 +3502,10 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp)
     f = qemu_file_new_input(QIO_CHANNEL(ioc));
     object_unref(OBJECT(ioc));
 
-    ret = qemu_loadvm_state(f);
+    ret = qemu_loadvm_state(f, errp);
     qemu_fclose(f);
     if (ret < 0) {
-        error_setg(errp, "loading Xen device state failed");
+        error_prepend(errp, "loading Xen device state failed: ");
     }
     migration_incoming_state_destroy();
 }
@@ -3569,13 +3576,12 @@ bool load_snapshot(const char *name, const char *vmstate,
         ret = -EINVAL;
         goto err_drain;
     }
-    ret = qemu_loadvm_state(f);
+    ret = qemu_loadvm_state(f, errp);
     migration_incoming_state_destroy();
 
     bdrv_drain_all_end();
 
     if (ret < 0) {
-        error_setg(errp, "Error %d while loading VM state", ret);
         return false;
     }
 
diff --git a/migration/savevm.h b/migration/savevm.h
index 2d5e9c716686f06720325e82fe90c75335ced1de..b80770b7461a60e2ad6ba5e24a7baeae73d90955 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -64,7 +64,7 @@ void qemu_savevm_send_colo_enable(QEMUFile *f);
 void qemu_savevm_live_state(QEMUFile *f);
 int qemu_save_device_state(QEMUFile *f);
 
-int qemu_loadvm_state(QEMUFile *f);
+int qemu_loadvm_state(QEMUFile *f, Error **errp);
 void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);

-- 
2.51.0


Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 4 weeks, 1 day ago
On 2025/08/30 5:01, Arun Menon wrote:
> This is an incremental step in converting vmstate loading
> code to report error via Error objects instead of directly
> printing it to console/monitor.
> It is ensured that qemu_loadvm_state() must report an error
> in errp, in case of failure.
> 
> When postcopy live migration runs, the device states are loaded by
> both the qemu coroutine process_incoming_migration_co() and the
> postcopy_ram_listen_thread(). Therefore, it is important that the
> coroutine also reports the error in case of failure, with
> error_report_err(). Otherwise, the source qemu will not display
> any errors before going into the postcopy pause state.
> 
> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> Reviewed-by: Fabiano Rosas <farosas@suse.de>
> Signed-off-by: Arun Menon <armenon@redhat.com>
> ---
>   migration/migration.c |  9 +++++----
>   migration/savevm.c    | 30 ++++++++++++++++++------------
>   migration/savevm.h    |  2 +-
>   3 files changed, 24 insertions(+), 17 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>                         MIGRATION_STATUS_ACTIVE);
>   
>       mis->loadvm_co = qemu_coroutine_self();
> -    ret = qemu_loadvm_state(mis->from_src_file);
> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>       mis->loadvm_co = NULL;
>   
>       trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>       }
>   
>       if (ret < 0) {
> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> +        error_prepend(&local_err, "load of migration failed: %s: ",
> +                      strerror(-ret));
>           goto fail;
>       }
>   
> @@ -924,13 +925,13 @@ fail:
>       migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                         MIGRATION_STATUS_FAILED);
>       migrate_set_error(s, local_err);
> -    error_free(local_err);
> +    error_report_err(local_err);

This is problematic because it results in duplicate error reports when
!mis->exit_on_error; in that case the query-migrate QMP command reports 
the error and this error reporting is redundant.

>   
>       migration_incoming_state_destroy();
>   
>       if (mis->exit_on_error) {
>           WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> -            error_report_err(s->error);
> +            error_free(s->error);

This change is problematic because s->error set somewhere else here will 
be ignored.

I think the two changes I commented can be simply removed without 
causing other problems.

>               s->error = NULL;
>           }
>   
> diff --git a/migration/savevm.c b/migration/savevm.c
> index de5671ffd1cd06e728227a3056c3f895d3a6e6f3..0087fca15ce108685667d3808350d80d37b807b1 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -3159,28 +3159,24 @@ out:
>       return ret;
>   }
>   
> -int qemu_loadvm_state(QEMUFile *f)
> +int qemu_loadvm_state(QEMUFile *f, Error **errp)
>   {
>       MigrationState *s = migrate_get_current();
>       MigrationIncomingState *mis = migration_incoming_get_current();
> -    Error *local_err = NULL;
>       int ret;
>   
> -    if (qemu_savevm_state_blocked(&local_err)) {
> -        error_report_err(local_err);
> +    if (qemu_savevm_state_blocked(errp)) {
>           return -EINVAL;
>       }
>   
>       qemu_loadvm_thread_pool_create(mis);
>   
> -    ret = qemu_loadvm_state_header(f, &local_err);
> +    ret = qemu_loadvm_state_header(f, errp);
>       if (ret) {
> -        error_report_err(local_err);
>           return ret;
>       }
>   
> -    if (qemu_loadvm_state_setup(f, &local_err) != 0) {
> -        error_report_err(local_err);
> +    if (qemu_loadvm_state_setup(f, errp) != 0) {
>           return -EINVAL;
>       }
>   
> @@ -3191,6 +3187,9 @@ int qemu_loadvm_state(QEMUFile *f)
>       cpu_synchronize_all_pre_loadvm();
>   
>       ret = qemu_loadvm_state_main(f, mis);
> +    if (ret < 0) {
> +        error_setg(errp, "Load VM state failed: %d", ret);
> +    }
>       qemu_event_set(&mis->main_thread_load_event);
>   
>       trace_qemu_loadvm_state_post_main(ret);
> @@ -3208,8 +3207,15 @@ int qemu_loadvm_state(QEMUFile *f)
>           if (migrate_has_error(migrate_get_current()) ||
>               !qemu_loadvm_thread_pool_wait(s, mis)) {
>               ret = -EINVAL;
> +            error_setg(errp,
> +                       "Error while loading vmstate");
>           } else {
>               ret = qemu_file_get_error(f);
> +            if (ret < 0) {
> +                error_setg(errp,
> +                           "Error while loading vmstate: stream error: %d",
> +                           ret);
> +            }
>           }
>       }
>       /*
> @@ -3474,6 +3480,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
>   
>   void qmp_xen_load_devices_state(const char *filename, Error **errp)
>   {
> +    ERRP_GUARD();
>       QEMUFile *f;
>       QIOChannelFile *ioc;
>       int ret;
> @@ -3495,10 +3502,10 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp)
>       f = qemu_file_new_input(QIO_CHANNEL(ioc));
>       object_unref(OBJECT(ioc));
>   
> -    ret = qemu_loadvm_state(f);
> +    ret = qemu_loadvm_state(f, errp);
>       qemu_fclose(f);
>       if (ret < 0) {
> -        error_setg(errp, "loading Xen device state failed");
> +        error_prepend(errp, "loading Xen device state failed: ");
>       }
>       migration_incoming_state_destroy();
>   }
> @@ -3569,13 +3576,12 @@ bool load_snapshot(const char *name, const char *vmstate,
>           ret = -EINVAL;
>           goto err_drain;
>       }
> -    ret = qemu_loadvm_state(f);
> +    ret = qemu_loadvm_state(f, errp);
>       migration_incoming_state_destroy();
>   
>       bdrv_drain_all_end();
>   
>       if (ret < 0) {
> -        error_setg(errp, "Error %d while loading VM state", ret);
>           return false;
>       }
>   
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 2d5e9c716686f06720325e82fe90c75335ced1de..b80770b7461a60e2ad6ba5e24a7baeae73d90955 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -64,7 +64,7 @@ void qemu_savevm_send_colo_enable(QEMUFile *f);
>   void qemu_savevm_live_state(QEMUFile *f);
>   int qemu_save_device_state(QEMUFile *f);
>   
> -int qemu_loadvm_state(QEMUFile *f);
> +int qemu_loadvm_state(QEMUFile *f, Error **errp);
>   void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
>   int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
>   int qemu_load_device_state(QEMUFile *f);
> 


Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 3 weeks, 6 days ago
Hi Akihiko,
Thanks for the review.

On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> On 2025/08/30 5:01, Arun Menon wrote:
> > This is an incremental step in converting vmstate loading
> > code to report error via Error objects instead of directly
> > printing it to console/monitor.
> > It is ensured that qemu_loadvm_state() must report an error
> > in errp, in case of failure.
> > 
> > When postcopy live migration runs, the device states are loaded by
> > both the qemu coroutine process_incoming_migration_co() and the
> > postcopy_ram_listen_thread(). Therefore, it is important that the
> > coroutine also reports the error in case of failure, with
> > error_report_err(). Otherwise, the source qemu will not display
> > any errors before going into the postcopy pause state.
> > 
> > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > Signed-off-by: Arun Menon <armenon@redhat.com>
> > ---
> >   migration/migration.c |  9 +++++----
> >   migration/savevm.c    | 30 ++++++++++++++++++------------
> >   migration/savevm.h    |  2 +-
> >   3 files changed, 24 insertions(+), 17 deletions(-)
> > 
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> >                         MIGRATION_STATUS_ACTIVE);
> >       mis->loadvm_co = qemu_coroutine_self();
> > -    ret = qemu_loadvm_state(mis->from_src_file);
> > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> >       mis->loadvm_co = NULL;
> >       trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> >       }
> >       if (ret < 0) {
> > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > +                      strerror(-ret));
> >           goto fail;
> >       }
> > @@ -924,13 +925,13 @@ fail:
> >       migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> >                         MIGRATION_STATUS_FAILED);
> >       migrate_set_error(s, local_err);
> > -    error_free(local_err);
> > +    error_report_err(local_err);
> 
> This is problematic because it results in duplicate error reports when
> !mis->exit_on_error; in that case the query-migrate QMP command reports the
> error and this error reporting is redundant.

If I comment this change, then all of the errors propagated up to now, using
error_setg() will not be reported. This is the place where it is finally reported,
when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
from all the files, replacing them with error_setg(), will finally be reported here
using error_report_err().

> 
> >       migration_incoming_state_destroy();
> >       if (mis->exit_on_error) {
> >           WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> > -            error_report_err(s->error);
> > +            error_free(s->error);
> 
> This change is problematic because s->error set somewhere else here will be
> ignored.

This is specific to the case when mis->exit_on_error is set.
since we did a migrate_set_error(s, local_err) before, we free the
error in s->error and set it to NULL, before an exit(EXIT_FAILURE)

> 
> I think the two changes I commented can be simply removed without causing
> other problems.

Please correct me if I am wrong.

> 
> >               s->error = NULL;
> >           }
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index de5671ffd1cd06e728227a3056c3f895d3a6e6f3..0087fca15ce108685667d3808350d80d37b807b1 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -3159,28 +3159,24 @@ out:
> >       return ret;
> >   }
> > -int qemu_loadvm_state(QEMUFile *f)
> > +int qemu_loadvm_state(QEMUFile *f, Error **errp)
> >   {
> >       MigrationState *s = migrate_get_current();
> >       MigrationIncomingState *mis = migration_incoming_get_current();
> > -    Error *local_err = NULL;
> >       int ret;
> > -    if (qemu_savevm_state_blocked(&local_err)) {
> > -        error_report_err(local_err);
> > +    if (qemu_savevm_state_blocked(errp)) {
> >           return -EINVAL;
> >       }
> >       qemu_loadvm_thread_pool_create(mis);
> > -    ret = qemu_loadvm_state_header(f, &local_err);
> > +    ret = qemu_loadvm_state_header(f, errp);
> >       if (ret) {
> > -        error_report_err(local_err);
> >           return ret;
> >       }
> > -    if (qemu_loadvm_state_setup(f, &local_err) != 0) {
> > -        error_report_err(local_err);
> > +    if (qemu_loadvm_state_setup(f, errp) != 0) {
> >           return -EINVAL;
> >       }
> > @@ -3191,6 +3187,9 @@ int qemu_loadvm_state(QEMUFile *f)
> >       cpu_synchronize_all_pre_loadvm();
> >       ret = qemu_loadvm_state_main(f, mis);
> > +    if (ret < 0) {
> > +        error_setg(errp, "Load VM state failed: %d", ret);
> > +    }
> >       qemu_event_set(&mis->main_thread_load_event);
> >       trace_qemu_loadvm_state_post_main(ret);
> > @@ -3208,8 +3207,15 @@ int qemu_loadvm_state(QEMUFile *f)
> >           if (migrate_has_error(migrate_get_current()) ||
> >               !qemu_loadvm_thread_pool_wait(s, mis)) {
> >               ret = -EINVAL;
> > +            error_setg(errp,
> > +                       "Error while loading vmstate");
> >           } else {
> >               ret = qemu_file_get_error(f);
> > +            if (ret < 0) {
> > +                error_setg(errp,
> > +                           "Error while loading vmstate: stream error: %d",
> > +                           ret);
> > +            }
> >           }
> >       }
> >       /*
> > @@ -3474,6 +3480,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
> >   void qmp_xen_load_devices_state(const char *filename, Error **errp)
> >   {
> > +    ERRP_GUARD();
> >       QEMUFile *f;
> >       QIOChannelFile *ioc;
> >       int ret;
> > @@ -3495,10 +3502,10 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp)
> >       f = qemu_file_new_input(QIO_CHANNEL(ioc));
> >       object_unref(OBJECT(ioc));
> > -    ret = qemu_loadvm_state(f);
> > +    ret = qemu_loadvm_state(f, errp);
> >       qemu_fclose(f);
> >       if (ret < 0) {
> > -        error_setg(errp, "loading Xen device state failed");
> > +        error_prepend(errp, "loading Xen device state failed: ");
> >       }
> >       migration_incoming_state_destroy();
> >   }
> > @@ -3569,13 +3576,12 @@ bool load_snapshot(const char *name, const char *vmstate,
> >           ret = -EINVAL;
> >           goto err_drain;
> >       }
> > -    ret = qemu_loadvm_state(f);
> > +    ret = qemu_loadvm_state(f, errp);
> >       migration_incoming_state_destroy();
> >       bdrv_drain_all_end();
> >       if (ret < 0) {
> > -        error_setg(errp, "Error %d while loading VM state", ret);
> >           return false;
> >       }
> > diff --git a/migration/savevm.h b/migration/savevm.h
> > index 2d5e9c716686f06720325e82fe90c75335ced1de..b80770b7461a60e2ad6ba5e24a7baeae73d90955 100644
> > --- a/migration/savevm.h
> > +++ b/migration/savevm.h
> > @@ -64,7 +64,7 @@ void qemu_savevm_send_colo_enable(QEMUFile *f);
> >   void qemu_savevm_live_state(QEMUFile *f);
> >   int qemu_save_device_state(QEMUFile *f);
> > -int qemu_loadvm_state(QEMUFile *f);
> > +int qemu_loadvm_state(QEMUFile *f, Error **errp);
> >   void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
> >   int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
> >   int qemu_load_device_state(QEMUFile *f);
> > 
> 
Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 3 weeks, 6 days ago
On 2025/09/01 0:45, Arun Menon wrote:
> Hi Akihiko,
> Thanks for the review.
> 
> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>> On 2025/08/30 5:01, Arun Menon wrote:
>>> This is an incremental step in converting vmstate loading
>>> code to report error via Error objects instead of directly
>>> printing it to console/monitor.
>>> It is ensured that qemu_loadvm_state() must report an error
>>> in errp, in case of failure.
>>>
>>> When postcopy live migration runs, the device states are loaded by
>>> both the qemu coroutine process_incoming_migration_co() and the
>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>> coroutine also reports the error in case of failure, with
>>> error_report_err(). Otherwise, the source qemu will not display
>>> any errors before going into the postcopy pause state.
>>>
>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>> ---
>>>    migration/migration.c |  9 +++++----
>>>    migration/savevm.c    | 30 ++++++++++++++++++------------
>>>    migration/savevm.h    |  2 +-
>>>    3 files changed, 24 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>                          MIGRATION_STATUS_ACTIVE);
>>>        mis->loadvm_co = qemu_coroutine_self();
>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>        mis->loadvm_co = NULL;
>>>        trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>        }
>>>        if (ret < 0) {
>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>> +                      strerror(-ret));
>>>            goto fail;
>>>        }
>>> @@ -924,13 +925,13 @@ fail:
>>>        migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>                          MIGRATION_STATUS_FAILED);
>>>        migrate_set_error(s, local_err);
>>> -    error_free(local_err);
>>> +    error_report_err(local_err);
>>
>> This is problematic because it results in duplicate error reports when
>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>> error and this error reporting is redundant.
> 
> If I comment this change, then all of the errors propagated up to now, using
> error_setg() will not be reported. This is the place where it is finally reported,
> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> from all the files, replacing them with error_setg(), will finally be reported here
> using error_report_err().

My understanding of the code without these two changes is:
- If the migrate-incoming QMP command is used with false as
   exit-on-error, this function will not report the error but
   the query-migrate QMP command will report the error.
- Otherwise, this function reports the error.

With these two changes, if the migrate-incoming QMP command is used with 
false as exit-on-error, this function will report the error *and* the 
query-migrate QMP command will report the error, resulting in duplicate 
reports.

> 
>>
>>>        migration_incoming_state_destroy();
>>>        if (mis->exit_on_error) {
>>>            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>> -            error_report_err(s->error);
>>> +            error_free(s->error);
>>
>> This change is problematic because s->error set somewhere else here will be
>> ignored.
> 
> This is specific to the case when mis->exit_on_error is set.
> since we did a migrate_set_error(s, local_err) before, we free the
> error in s->error and set it to NULL, before an exit(EXIT_FAILURE)

It shouldn't just free the error but should print it or the error will 
be missed.

Regards,
Akihiko Odaki

Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 3 weeks, 6 days ago
Hi,

On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
> On 2025/09/01 0:45, Arun Menon wrote:
> > Hi Akihiko,
> > Thanks for the review.
> > 
> > On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> > > On 2025/08/30 5:01, Arun Menon wrote:
> > > > This is an incremental step in converting vmstate loading
> > > > code to report error via Error objects instead of directly
> > > > printing it to console/monitor.
> > > > It is ensured that qemu_loadvm_state() must report an error
> > > > in errp, in case of failure.
> > > > 
> > > > When postcopy live migration runs, the device states are loaded by
> > > > both the qemu coroutine process_incoming_migration_co() and the
> > > > postcopy_ram_listen_thread(). Therefore, it is important that the
> > > > coroutine also reports the error in case of failure, with
> > > > error_report_err(). Otherwise, the source qemu will not display
> > > > any errors before going into the postcopy pause state.
> > > > 
> > > > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > > > Signed-off-by: Arun Menon <armenon@redhat.com>
> > > > ---
> > > >    migration/migration.c |  9 +++++----
> > > >    migration/savevm.c    | 30 ++++++++++++++++++------------
> > > >    migration/savevm.h    |  2 +-
> > > >    3 files changed, 24 insertions(+), 17 deletions(-)
> > > > 
> > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > > > --- a/migration/migration.c
> > > > +++ b/migration/migration.c
> > > > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> > > >                          MIGRATION_STATUS_ACTIVE);
> > > >        mis->loadvm_co = qemu_coroutine_self();
> > > > -    ret = qemu_loadvm_state(mis->from_src_file);
> > > > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> > > >        mis->loadvm_co = NULL;
> > > >        trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > > > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> > > >        }
> > > >        if (ret < 0) {
> > > > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > > > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > > > +                      strerror(-ret));
> > > >            goto fail;
> > > >        }
> > > > @@ -924,13 +925,13 @@ fail:
> > > >        migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > >                          MIGRATION_STATUS_FAILED);
> > > >        migrate_set_error(s, local_err);
> > > > -    error_free(local_err);
> > > > +    error_report_err(local_err);
> > > 
> > > This is problematic because it results in duplicate error reports when
> > > !mis->exit_on_error; in that case the query-migrate QMP command reports the
> > > error and this error reporting is redundant.
> > 
> > If I comment this change, then all of the errors propagated up to now, using
> > error_setg() will not be reported. This is the place where it is finally reported,
> > when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> > from all the files, replacing them with error_setg(), will finally be reported here
> > using error_report_err().
> 
> My understanding of the code without these two changes is:
> - If the migrate-incoming QMP command is used with false as
>   exit-on-error, this function will not report the error but
>   the query-migrate QMP command will report the error.
> - Otherwise, this function reports the error.

With my limited experience in testing, I have a question,
So there are 2 scenarios,
1. running the virsh migrate command on the source host. Something like the following,
  virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
  OR for postcopy-ram,
  virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy

2. Using QMP commands, performing a migration from source to destination.
  Running something like the following on the destination:
  {
    "execute": "migrate-incoming",
    "arguments": {
      "uri": "tcp:127.0.0.1:7777",
      "exit-on-error": false
    }
  }
  {
    "execute": "migrate-incoming",
    "arguments": {
      "uri": "tcp:127.0.0.1:7777",
      "exit-on-error": false
    }
  }
  and the somthing like the following on source:
  {
    "execute": "migrate",
    "arguments": {
      "uri": "tcp:127.0.0.1:7777"
    }
  }
  {"execute" : "query-migrate"}

In 1, previously, the user used to get an error message on migration failure.
This was because there were error_report() calls in all of the files.
Now that they are replaced with error_setg() and the error is stored in errp,
we need to display that using error_report_err(). Hence I introduced an error_report_err()
call in the fail section.

In 2, we have 2 QMP sessions, one for the source and another for the destination.
The QMP command migrate will be issued on the source, and the errp will be set.
I did not understand the part where the message will be displayed because of the
error_report_err() call. I did not see such a message on failure scenario on both
the sessions.
If the user wants to check for errors, then the destination qemu will not exit 
(exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}

Aren't the 2 scenarios different by nature?

> 
> With these two changes, if the migrate-incoming QMP command is used with
> false as exit-on-error, this function will report the error *and* the
> query-migrate QMP command will report the error, resulting in duplicate
> reports.
> 
> > 
> > > 
> > > >        migration_incoming_state_destroy();
> > > >        if (mis->exit_on_error) {
> > > >            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> > > > -            error_report_err(s->error);
> > > > +            error_free(s->error);
> > > 
> > > This change is problematic because s->error set somewhere else here will be
> > > ignored.
> > 
> > This is specific to the case when mis->exit_on_error is set.
> > since we did a migrate_set_error(s, local_err) before, we free the
> > error in s->error and set it to NULL, before an exit(EXIT_FAILURE)
> 
> It shouldn't just free the error but should print it or the error will be
> missed.
> 
> Regards,
> Akihiko Odaki
> 
Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 3 weeks, 6 days ago
On 2025/09/01 1:38, Arun Menon wrote:
> Hi,
> 
> On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
>> On 2025/09/01 0:45, Arun Menon wrote:
>>> Hi Akihiko,
>>> Thanks for the review.
>>>
>>> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>>>> On 2025/08/30 5:01, Arun Menon wrote:
>>>>> This is an incremental step in converting vmstate loading
>>>>> code to report error via Error objects instead of directly
>>>>> printing it to console/monitor.
>>>>> It is ensured that qemu_loadvm_state() must report an error
>>>>> in errp, in case of failure.
>>>>>
>>>>> When postcopy live migration runs, the device states are loaded by
>>>>> both the qemu coroutine process_incoming_migration_co() and the
>>>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>>>> coroutine also reports the error in case of failure, with
>>>>> error_report_err(). Otherwise, the source qemu will not display
>>>>> any errors before going into the postcopy pause state.
>>>>>
>>>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>>>> ---
>>>>>     migration/migration.c |  9 +++++----
>>>>>     migration/savevm.c    | 30 ++++++++++++++++++------------
>>>>>     migration/savevm.h    |  2 +-
>>>>>     3 files changed, 24 insertions(+), 17 deletions(-)
>>>>>
>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>>>> --- a/migration/migration.c
>>>>> +++ b/migration/migration.c
>>>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>>>                           MIGRATION_STATUS_ACTIVE);
>>>>>         mis->loadvm_co = qemu_coroutine_self();
>>>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>>>         mis->loadvm_co = NULL;
>>>>>         trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>>>         }
>>>>>         if (ret < 0) {
>>>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>>>> +                      strerror(-ret));
>>>>>             goto fail;
>>>>>         }
>>>>> @@ -924,13 +925,13 @@ fail:
>>>>>         migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>>                           MIGRATION_STATUS_FAILED);
>>>>>         migrate_set_error(s, local_err);
>>>>> -    error_free(local_err);
>>>>> +    error_report_err(local_err);
>>>>
>>>> This is problematic because it results in duplicate error reports when
>>>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>>>> error and this error reporting is redundant.
>>>
>>> If I comment this change, then all of the errors propagated up to now, using
>>> error_setg() will not be reported. This is the place where it is finally reported,
>>> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
>>> from all the files, replacing them with error_setg(), will finally be reported here
>>> using error_report_err().
>>
>> My understanding of the code without these two changes is:
>> - If the migrate-incoming QMP command is used with false as
>>    exit-on-error, this function will not report the error but
>>    the query-migrate QMP command will report the error.
>> - Otherwise, this function reports the error.
> 
> With my limited experience in testing, I have a question,
> So there are 2 scenarios,
> 1. running the virsh migrate command on the source host. Something like the following,
>    virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
>    OR for postcopy-ram,
>    virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
> 
> 2. Using QMP commands, performing a migration from source to destination.
>    Running something like the following on the destination:
>    {
>      "execute": "migrate-incoming",
>      "arguments": {
>        "uri": "tcp:127.0.0.1:7777",
>        "exit-on-error": false
>      }
>    }
>    {
>      "execute": "migrate-incoming",
>      "arguments": {
>        "uri": "tcp:127.0.0.1:7777",
>        "exit-on-error": false
>      }
>    }
>    and the somthing like the following on source:
>    {
>      "execute": "migrate",
>      "arguments": {
>        "uri": "tcp:127.0.0.1:7777"
>      }
>    }
>    {"execute" : "query-migrate"}
> 
> In 1, previously, the user used to get an error message on migration failure.
> This was because there were error_report() calls in all of the files.
> Now that they are replaced with error_setg() and the error is stored in errp,
> we need to display that using error_report_err(). Hence I introduced an error_report_err()
> call in the fail section.
> 
> In 2, we have 2 QMP sessions, one for the source and another for the destination.
> The QMP command migrate will be issued on the source, and the errp will be set.
> I did not understand the part where the message will be displayed because of the
> error_report_err() call. I did not see such a message on failure scenario on both
> the sessions.
> If the user wants to check for errors, then the destination qemu will not exit
> (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
> 
> Aren't the 2 scenarios different by nature?

In 1, doesn't libvirt query the error with query-migrate and print it?

In any case, it would be nice if you describe how libvirt interacts with 
QEMU in 1.

Regards,
Akihiko Odaki

Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 3 weeks, 4 days ago
Hi Akihiko,

It took some time to set up the machines; apologies for the delay in response.

On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
> On 2025/09/01 1:38, Arun Menon wrote:
> > Hi,
> > 
> > On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
> > > On 2025/09/01 0:45, Arun Menon wrote:
> > > > Hi Akihiko,
> > > > Thanks for the review.
> > > > 
> > > > On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> > > > > On 2025/08/30 5:01, Arun Menon wrote:
> > > > > > This is an incremental step in converting vmstate loading
> > > > > > code to report error via Error objects instead of directly
> > > > > > printing it to console/monitor.
> > > > > > It is ensured that qemu_loadvm_state() must report an error
> > > > > > in errp, in case of failure.
> > > > > > 
> > > > > > When postcopy live migration runs, the device states are loaded by
> > > > > > both the qemu coroutine process_incoming_migration_co() and the
> > > > > > postcopy_ram_listen_thread(). Therefore, it is important that the
> > > > > > coroutine also reports the error in case of failure, with
> > > > > > error_report_err(). Otherwise, the source qemu will not display
> > > > > > any errors before going into the postcopy pause state.
> > > > > > 
> > > > > > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > > > > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > > > > > Signed-off-by: Arun Menon <armenon@redhat.com>
> > > > > > ---
> > > > > >     migration/migration.c |  9 +++++----
> > > > > >     migration/savevm.c    | 30 ++++++++++++++++++------------
> > > > > >     migration/savevm.h    |  2 +-
> > > > > >     3 files changed, 24 insertions(+), 17 deletions(-)
> > > > > > 
> > > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > > > > > --- a/migration/migration.c
> > > > > > +++ b/migration/migration.c
> > > > > > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> > > > > >                           MIGRATION_STATUS_ACTIVE);
> > > > > >         mis->loadvm_co = qemu_coroutine_self();
> > > > > > -    ret = qemu_loadvm_state(mis->from_src_file);
> > > > > > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> > > > > >         mis->loadvm_co = NULL;
> > > > > >         trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > > > > > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> > > > > >         }
> > > > > >         if (ret < 0) {
> > > > > > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > > > > > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > > > > > +                      strerror(-ret));
> > > > > >             goto fail;
> > > > > >         }
> > > > > > @@ -924,13 +925,13 @@ fail:
> > > > > >         migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > > >                           MIGRATION_STATUS_FAILED);
> > > > > >         migrate_set_error(s, local_err);
> > > > > > -    error_free(local_err);
> > > > > > +    error_report_err(local_err);
> > > > > 
> > > > > This is problematic because it results in duplicate error reports when
> > > > > !mis->exit_on_error; in that case the query-migrate QMP command reports the
> > > > > error and this error reporting is redundant.
> > > > 
> > > > If I comment this change, then all of the errors propagated up to now, using
> > > > error_setg() will not be reported. This is the place where it is finally reported,
> > > > when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> > > > from all the files, replacing them with error_setg(), will finally be reported here
> > > > using error_report_err().
> > > 
> > > My understanding of the code without these two changes is:
> > > - If the migrate-incoming QMP command is used with false as
> > >    exit-on-error, this function will not report the error but
> > >    the query-migrate QMP command will report the error.
> > > - Otherwise, this function reports the error.
> > 
> > With my limited experience in testing, I have a question,
> > So there are 2 scenarios,
> > 1. running the virsh migrate command on the source host. Something like the following,
> >    virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
> >    OR for postcopy-ram,
> >    virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > 
> > 2. Using QMP commands, performing a migration from source to destination.
> >    Running something like the following on the destination:
> >    {
> >      "execute": "migrate-incoming",
> >      "arguments": {
> >        "uri": "tcp:127.0.0.1:7777",
> >        "exit-on-error": false
> >      }
> >    }
> >    {
> >      "execute": "migrate-incoming",
> >      "arguments": {
> >        "uri": "tcp:127.0.0.1:7777",
> >        "exit-on-error": false
> >      }
> >    }
> >    and the somthing like the following on source:
> >    {
> >      "execute": "migrate",
> >      "arguments": {
> >        "uri": "tcp:127.0.0.1:7777"
> >      }
> >    }
> >    {"execute" : "query-migrate"}
> > 
> > In 1, previously, the user used to get an error message on migration failure.
> > This was because there were error_report() calls in all of the files.
> > Now that they are replaced with error_setg() and the error is stored in errp,
> > we need to display that using error_report_err(). Hence I introduced an error_report_err()
> > call in the fail section.
> > 
> > In 2, we have 2 QMP sessions, one for the source and another for the destination.
> > The QMP command migrate will be issued on the source, and the errp will be set.
> > I did not understand the part where the message will be displayed because of the
> > error_report_err() call. I did not see such a message on failure scenario on both
> > the sessions.
> > If the user wants to check for errors, then the destination qemu will not exit
> > (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
> > 
> > Aren't the 2 scenarios different by nature?
> 
> In 1, doesn't libvirt query the error with query-migrate and print it?

Ideally it should find the the error, and print the whole thing. It does work
in the normal scenario. However, the postcopy scenario does not show the same result,
which is mentioned in the commit message.

> 
> In any case, it would be nice if you describe how libvirt interacts with
> QEMU in 1.

Please find below the difference in the command output at source, when we run a live migration
with postcopy enabled.

=========
With the current changes:
[root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1

[root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
root@10.6.120.9's password: 
Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error

[root@dell-per750-42 build]# 

=========

Without the current changes:
[root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1

[root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
root@10.6.120.9's password: 
Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)

[root@dell-per750-42 qemu-priv]# 

=========
The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
The source machine goes in to a paused state after this.

> 
> Regards,
> Akihiko Odaki
> 

Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 3 weeks, 1 day ago
On 2025/09/03 8:47, Arun Menon wrote:
> Hi Akihiko,
> 
> It took some time to set up the machines; apologies for the delay in response.
> 
> On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
>> On 2025/09/01 1:38, Arun Menon wrote:
>>> Hi,
>>>
>>> On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
>>>> On 2025/09/01 0:45, Arun Menon wrote:
>>>>> Hi Akihiko,
>>>>> Thanks for the review.
>>>>>
>>>>> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>>>>>> On 2025/08/30 5:01, Arun Menon wrote:
>>>>>>> This is an incremental step in converting vmstate loading
>>>>>>> code to report error via Error objects instead of directly
>>>>>>> printing it to console/monitor.
>>>>>>> It is ensured that qemu_loadvm_state() must report an error
>>>>>>> in errp, in case of failure.
>>>>>>>
>>>>>>> When postcopy live migration runs, the device states are loaded by
>>>>>>> both the qemu coroutine process_incoming_migration_co() and the
>>>>>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>>>>>> coroutine also reports the error in case of failure, with
>>>>>>> error_report_err(). Otherwise, the source qemu will not display
>>>>>>> any errors before going into the postcopy pause state.
>>>>>>>
>>>>>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>>>>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>>>>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>>>>>> ---
>>>>>>>      migration/migration.c |  9 +++++----
>>>>>>>      migration/savevm.c    | 30 ++++++++++++++++++------------
>>>>>>>      migration/savevm.h    |  2 +-
>>>>>>>      3 files changed, 24 insertions(+), 17 deletions(-)
>>>>>>>
>>>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>>>>>> --- a/migration/migration.c
>>>>>>> +++ b/migration/migration.c
>>>>>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>>>>>                            MIGRATION_STATUS_ACTIVE);
>>>>>>>          mis->loadvm_co = qemu_coroutine_self();
>>>>>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>>>>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>>>>>          mis->loadvm_co = NULL;
>>>>>>>          trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>>>>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>>>>>          }
>>>>>>>          if (ret < 0) {
>>>>>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>>>>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>>>>>> +                      strerror(-ret));
>>>>>>>              goto fail;
>>>>>>>          }
>>>>>>> @@ -924,13 +925,13 @@ fail:
>>>>>>>          migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>>>>                            MIGRATION_STATUS_FAILED);
>>>>>>>          migrate_set_error(s, local_err);
>>>>>>> -    error_free(local_err);
>>>>>>> +    error_report_err(local_err);
>>>>>>
>>>>>> This is problematic because it results in duplicate error reports when
>>>>>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>>>>>> error and this error reporting is redundant.
>>>>>
>>>>> If I comment this change, then all of the errors propagated up to now, using
>>>>> error_setg() will not be reported. This is the place where it is finally reported,
>>>>> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
>>>>> from all the files, replacing them with error_setg(), will finally be reported here
>>>>> using error_report_err().
>>>>
>>>> My understanding of the code without these two changes is:
>>>> - If the migrate-incoming QMP command is used with false as
>>>>     exit-on-error, this function will not report the error but
>>>>     the query-migrate QMP command will report the error.
>>>> - Otherwise, this function reports the error.
>>>
>>> With my limited experience in testing, I have a question,
>>> So there are 2 scenarios,
>>> 1. running the virsh migrate command on the source host. Something like the following,
>>>     virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
>>>     OR for postcopy-ram,
>>>     virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>
>>> 2. Using QMP commands, performing a migration from source to destination.
>>>     Running something like the following on the destination:
>>>     {
>>>       "execute": "migrate-incoming",
>>>       "arguments": {
>>>         "uri": "tcp:127.0.0.1:7777",
>>>         "exit-on-error": false
>>>       }
>>>     }
>>>     {
>>>       "execute": "migrate-incoming",
>>>       "arguments": {
>>>         "uri": "tcp:127.0.0.1:7777",
>>>         "exit-on-error": false
>>>       }
>>>     }
>>>     and the somthing like the following on source:
>>>     {
>>>       "execute": "migrate",
>>>       "arguments": {
>>>         "uri": "tcp:127.0.0.1:7777"
>>>       }
>>>     }
>>>     {"execute" : "query-migrate"}
>>>
>>> In 1, previously, the user used to get an error message on migration failure.
>>> This was because there were error_report() calls in all of the files.
>>> Now that they are replaced with error_setg() and the error is stored in errp,
>>> we need to display that using error_report_err(). Hence I introduced an error_report_err()
>>> call in the fail section.
>>>
>>> In 2, we have 2 QMP sessions, one for the source and another for the destination.
>>> The QMP command migrate will be issued on the source, and the errp will be set.
>>> I did not understand the part where the message will be displayed because of the
>>> error_report_err() call. I did not see such a message on failure scenario on both
>>> the sessions.
>>> If the user wants to check for errors, then the destination qemu will not exit
>>> (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
>>>
>>> Aren't the 2 scenarios different by nature?
>>
>> In 1, doesn't libvirt query the error with query-migrate and print it?
> 
> Ideally it should find the the error, and print the whole thing. It does work
> in the normal scenario. However, the postcopy scenario does not show the same result,
> which is mentioned in the commit message.
> 
>>
>> In any case, it would be nice if you describe how libvirt interacts with
>> QEMU in 1.
> 
> Please find below the difference in the command output at source, when we run a live migration
> with postcopy enabled.
> 
> =========
> With the current changes:
> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> 
> [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> root@10.6.120.9's password:
> Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
> 
> [root@dell-per750-42 build]#
> 
> =========
> 
> Without the current changes:
> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> 
> [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> root@10.6.120.9's password:
> Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> 
> [root@dell-per750-42 qemu-priv]#
> 
> =========
> The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.

This was true for messages in qemu_loadvm_state(), but the message "load 
of migration failed" was printed or queried with query-migrate, not 
both. We should think of which behavior is more appropriate, and I think 
we should avoid duplicate reports.

> The source machine goes in to a paused state after this.

The output is informative. It implies the destination machine exited, 
and it makes sense to print error messages as it is done for
mis->exit_on_error. I wonder if it is possible to detect the condition 
and treat it identically to mis->exit_on_error.

Regards,
Akihiko Odaki

Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 1 week, 4 days ago
Hi Akihiko,

On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
> On 2025/09/03 8:47, Arun Menon wrote:
> > Hi Akihiko,
> > 
> > It took some time to set up the machines; apologies for the delay in response.
> > 
> > On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
> > > On 2025/09/01 1:38, Arun Menon wrote:
> > > > Hi,
> > > > 
> > > > On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
> > > > > On 2025/09/01 0:45, Arun Menon wrote:
> > > > > > Hi Akihiko,
> > > > > > Thanks for the review.
> > > > > > 
> > > > > > On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> > > > > > > On 2025/08/30 5:01, Arun Menon wrote:
> > > > > > > > This is an incremental step in converting vmstate loading
> > > > > > > > code to report error via Error objects instead of directly
> > > > > > > > printing it to console/monitor.
> > > > > > > > It is ensured that qemu_loadvm_state() must report an error
> > > > > > > > in errp, in case of failure.
> > > > > > > > 
> > > > > > > > When postcopy live migration runs, the device states are loaded by
> > > > > > > > both the qemu coroutine process_incoming_migration_co() and the
> > > > > > > > postcopy_ram_listen_thread(). Therefore, it is important that the
> > > > > > > > coroutine also reports the error in case of failure, with
> > > > > > > > error_report_err(). Otherwise, the source qemu will not display
> > > > > > > > any errors before going into the postcopy pause state.
> > > > > > > > 
> > > > > > > > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > > > > > > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > > > > > > > Signed-off-by: Arun Menon <armenon@redhat.com>
> > > > > > > > ---
> > > > > > > >      migration/migration.c |  9 +++++----
> > > > > > > >      migration/savevm.c    | 30 ++++++++++++++++++------------
> > > > > > > >      migration/savevm.h    |  2 +-
> > > > > > > >      3 files changed, 24 insertions(+), 17 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > > > > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > > > > > > > --- a/migration/migration.c
> > > > > > > > +++ b/migration/migration.c
> > > > > > > > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> > > > > > > >                            MIGRATION_STATUS_ACTIVE);
> > > > > > > >          mis->loadvm_co = qemu_coroutine_self();
> > > > > > > > -    ret = qemu_loadvm_state(mis->from_src_file);
> > > > > > > > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> > > > > > > >          mis->loadvm_co = NULL;
> > > > > > > >          trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > > > > > > > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> > > > > > > >          }
> > > > > > > >          if (ret < 0) {
> > > > > > > > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > > > > > > > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > > > > > > > +                      strerror(-ret));
> > > > > > > >              goto fail;
> > > > > > > >          }
> > > > > > > > @@ -924,13 +925,13 @@ fail:
> > > > > > > >          migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > > > > >                            MIGRATION_STATUS_FAILED);
> > > > > > > >          migrate_set_error(s, local_err);
> > > > > > > > -    error_free(local_err);
> > > > > > > > +    error_report_err(local_err);
> > > > > > > 
> > > > > > > This is problematic because it results in duplicate error reports when
> > > > > > > !mis->exit_on_error; in that case the query-migrate QMP command reports the
> > > > > > > error and this error reporting is redundant.
> > > > > > 
> > > > > > If I comment this change, then all of the errors propagated up to now, using
> > > > > > error_setg() will not be reported. This is the place where it is finally reported,
> > > > > > when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> > > > > > from all the files, replacing them with error_setg(), will finally be reported here
> > > > > > using error_report_err().
> > > > > 
> > > > > My understanding of the code without these two changes is:
> > > > > - If the migrate-incoming QMP command is used with false as
> > > > >     exit-on-error, this function will not report the error but
> > > > >     the query-migrate QMP command will report the error.
> > > > > - Otherwise, this function reports the error.
> > > > 
> > > > With my limited experience in testing, I have a question,
> > > > So there are 2 scenarios,
> > > > 1. running the virsh migrate command on the source host. Something like the following,
> > > >     virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
> > > >     OR for postcopy-ram,
> > > >     virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > 
> > > > 2. Using QMP commands, performing a migration from source to destination.
> > > >     Running something like the following on the destination:
> > > >     {
> > > >       "execute": "migrate-incoming",
> > > >       "arguments": {
> > > >         "uri": "tcp:127.0.0.1:7777",
> > > >         "exit-on-error": false
> > > >       }
> > > >     }
> > > >     {
> > > >       "execute": "migrate-incoming",
> > > >       "arguments": {
> > > >         "uri": "tcp:127.0.0.1:7777",
> > > >         "exit-on-error": false
> > > >       }
> > > >     }
> > > >     and the somthing like the following on source:
> > > >     {
> > > >       "execute": "migrate",
> > > >       "arguments": {
> > > >         "uri": "tcp:127.0.0.1:7777"
> > > >       }
> > > >     }
> > > >     {"execute" : "query-migrate"}
> > > > 
> > > > In 1, previously, the user used to get an error message on migration failure.
> > > > This was because there were error_report() calls in all of the files.
> > > > Now that they are replaced with error_setg() and the error is stored in errp,
> > > > we need to display that using error_report_err(). Hence I introduced an error_report_err()
> > > > call in the fail section.
> > > > 
> > > > In 2, we have 2 QMP sessions, one for the source and another for the destination.
> > > > The QMP command migrate will be issued on the source, and the errp will be set.
> > > > I did not understand the part where the message will be displayed because of the
> > > > error_report_err() call. I did not see such a message on failure scenario on both
> > > > the sessions.
> > > > If the user wants to check for errors, then the destination qemu will not exit
> > > > (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
> > > > 
> > > > Aren't the 2 scenarios different by nature?
> > > 
> > > In 1, doesn't libvirt query the error with query-migrate and print it?
> > 
> > Ideally it should find the the error, and print the whole thing. It does work
> > in the normal scenario. However, the postcopy scenario does not show the same result,
> > which is mentioned in the commit message.
> > 
> > > 
> > > In any case, it would be nice if you describe how libvirt interacts with
> > > QEMU in 1.
> > 
> > Please find below the difference in the command output at source, when we run a live migration
> > with postcopy enabled.
> > 
> > =========
> > With the current changes:
> > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > 
> > [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > root@10.6.120.9's password:
> > Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
> > 
> > [root@dell-per750-42 build]#
> > 
> > =========
> > 
> > Without the current changes:
> > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > 
> > [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > root@10.6.120.9's password:
> > Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > 
> > [root@dell-per750-42 qemu-priv]#
> > 
> > =========
> > The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
> 
> This was true for messages in qemu_loadvm_state(), but the message "load of
> migration failed" was printed or queried with query-migrate, not both. We
> should think of which behavior is more appropriate, and I think we should
> avoid duplicate reports.
> 
> > The source machine goes in to a paused state after this.
> 
> The output is informative. It implies the destination machine exited, and it
> makes sense to print error messages as it is done for
> mis->exit_on_error. I wonder if it is possible to detect the condition and
> treat it identically to mis->exit_on_error.

I see that we want to catch a specific scenario in postcopy ram migration
where the destination abruptly exits without a graceful shutdown,
thus failing to inform the source the reason for its failure through a
'query-migrate' even though 'exit-on-error' was set to false on the destination.

However, I am not sure how to reliably detect the specific error condition of
such a connection close that you have described. Given that this is a large
patch series already, could we keep the current change as is for now?
From what I can tell, the additional log message "load of migration failed"
is not a breaking change and will not cause a crash. We can develop a more
elegant solution to handle the issue of duplication in a separate patch.

> 
> Regards,
> Akihiko Odaki
> 

Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 1 week, 4 days ago
On 2025/09/17 9:34, Arun Menon wrote:
> Hi Akihiko,
> 
> On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
>> On 2025/09/03 8:47, Arun Menon wrote:
>>> Hi Akihiko,
>>>
>>> It took some time to set up the machines; apologies for the delay in response.
>>>
>>> On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
>>>> On 2025/09/01 1:38, Arun Menon wrote:
>>>>> Hi,
>>>>>
>>>>> On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
>>>>>> On 2025/09/01 0:45, Arun Menon wrote:
>>>>>>> Hi Akihiko,
>>>>>>> Thanks for the review.
>>>>>>>
>>>>>>> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>>>>>>>> On 2025/08/30 5:01, Arun Menon wrote:
>>>>>>>>> This is an incremental step in converting vmstate loading
>>>>>>>>> code to report error via Error objects instead of directly
>>>>>>>>> printing it to console/monitor.
>>>>>>>>> It is ensured that qemu_loadvm_state() must report an error
>>>>>>>>> in errp, in case of failure.
>>>>>>>>>
>>>>>>>>> When postcopy live migration runs, the device states are loaded by
>>>>>>>>> both the qemu coroutine process_incoming_migration_co() and the
>>>>>>>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>>>>>>>> coroutine also reports the error in case of failure, with
>>>>>>>>> error_report_err(). Otherwise, the source qemu will not display
>>>>>>>>> any errors before going into the postcopy pause state.
>>>>>>>>>
>>>>>>>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>>>>>>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>>>>>>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>>>>>>>> ---
>>>>>>>>>       migration/migration.c |  9 +++++----
>>>>>>>>>       migration/savevm.c    | 30 ++++++++++++++++++------------
>>>>>>>>>       migration/savevm.h    |  2 +-
>>>>>>>>>       3 files changed, 24 insertions(+), 17 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>>>>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>>>>>>>> --- a/migration/migration.c
>>>>>>>>> +++ b/migration/migration.c
>>>>>>>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>                             MIGRATION_STATUS_ACTIVE);
>>>>>>>>>           mis->loadvm_co = qemu_coroutine_self();
>>>>>>>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>>>>>>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>>>>>>>           mis->loadvm_co = NULL;
>>>>>>>>>           trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>>>>>>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>           }
>>>>>>>>>           if (ret < 0) {
>>>>>>>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>>>>>>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>>>>>>>> +                      strerror(-ret));
>>>>>>>>>               goto fail;
>>>>>>>>>           }
>>>>>>>>> @@ -924,13 +925,13 @@ fail:
>>>>>>>>>           migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>>>>>>                             MIGRATION_STATUS_FAILED);
>>>>>>>>>           migrate_set_error(s, local_err);
>>>>>>>>> -    error_free(local_err);
>>>>>>>>> +    error_report_err(local_err);
>>>>>>>>
>>>>>>>> This is problematic because it results in duplicate error reports when
>>>>>>>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>>>>>>>> error and this error reporting is redundant.
>>>>>>>
>>>>>>> If I comment this change, then all of the errors propagated up to now, using
>>>>>>> error_setg() will not be reported. This is the place where it is finally reported,
>>>>>>> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
>>>>>>> from all the files, replacing them with error_setg(), will finally be reported here
>>>>>>> using error_report_err().
>>>>>>
>>>>>> My understanding of the code without these two changes is:
>>>>>> - If the migrate-incoming QMP command is used with false as
>>>>>>      exit-on-error, this function will not report the error but
>>>>>>      the query-migrate QMP command will report the error.
>>>>>> - Otherwise, this function reports the error.
>>>>>
>>>>> With my limited experience in testing, I have a question,
>>>>> So there are 2 scenarios,
>>>>> 1. running the virsh migrate command on the source host. Something like the following,
>>>>>      virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
>>>>>      OR for postcopy-ram,
>>>>>      virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>
>>>>> 2. Using QMP commands, performing a migration from source to destination.
>>>>>      Running something like the following on the destination:
>>>>>      {
>>>>>        "execute": "migrate-incoming",
>>>>>        "arguments": {
>>>>>          "uri": "tcp:127.0.0.1:7777",
>>>>>          "exit-on-error": false
>>>>>        }
>>>>>      }
>>>>>      {
>>>>>        "execute": "migrate-incoming",
>>>>>        "arguments": {
>>>>>          "uri": "tcp:127.0.0.1:7777",
>>>>>          "exit-on-error": false
>>>>>        }
>>>>>      }
>>>>>      and the somthing like the following on source:
>>>>>      {
>>>>>        "execute": "migrate",
>>>>>        "arguments": {
>>>>>          "uri": "tcp:127.0.0.1:7777"
>>>>>        }
>>>>>      }
>>>>>      {"execute" : "query-migrate"}
>>>>>
>>>>> In 1, previously, the user used to get an error message on migration failure.
>>>>> This was because there were error_report() calls in all of the files.
>>>>> Now that they are replaced with error_setg() and the error is stored in errp,
>>>>> we need to display that using error_report_err(). Hence I introduced an error_report_err()
>>>>> call in the fail section.
>>>>>
>>>>> In 2, we have 2 QMP sessions, one for the source and another for the destination.
>>>>> The QMP command migrate will be issued on the source, and the errp will be set.
>>>>> I did not understand the part where the message will be displayed because of the
>>>>> error_report_err() call. I did not see such a message on failure scenario on both
>>>>> the sessions.
>>>>> If the user wants to check for errors, then the destination qemu will not exit
>>>>> (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
>>>>>
>>>>> Aren't the 2 scenarios different by nature?
>>>>
>>>> In 1, doesn't libvirt query the error with query-migrate and print it?
>>>
>>> Ideally it should find the the error, and print the whole thing. It does work
>>> in the normal scenario. However, the postcopy scenario does not show the same result,
>>> which is mentioned in the commit message.
>>>
>>>>
>>>> In any case, it would be nice if you describe how libvirt interacts with
>>>> QEMU in 1.
>>>
>>> Please find below the difference in the command output at source, when we run a live migration
>>> with postcopy enabled.
>>>
>>> =========
>>> With the current changes:
>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>
>>> [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>> root@10.6.120.9's password:
>>> Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>> 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>> 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
>>>
>>> [root@dell-per750-42 build]#
>>>
>>> =========
>>>
>>> Without the current changes:
>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>
>>> [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>> root@10.6.120.9's password:
>>> Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>> 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>
>>> [root@dell-per750-42 qemu-priv]#
>>>
>>> =========
>>> The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
>>
>> This was true for messages in qemu_loadvm_state(), but the message "load of
>> migration failed" was printed or queried with query-migrate, not both. We
>> should think of which behavior is more appropriate, and I think we should
>> avoid duplicate reports.
>>
>>> The source machine goes in to a paused state after this.
>>
>> The output is informative. It implies the destination machine exited, and it
>> makes sense to print error messages as it is done for
>> mis->exit_on_error. I wonder if it is possible to detect the condition and
>> treat it identically to mis->exit_on_error.
> 
> I see that we want to catch a specific scenario in postcopy ram migration
> where the destination abruptly exits without a graceful shutdown,
> thus failing to inform the source the reason for its failure through a
> 'query-migrate' even though 'exit-on-error' was set to false on the destination.
> 
> However, I am not sure how to reliably detect the specific error condition of
> such a connection close that you have described. Given that this is a large
> patch series already, could we keep the current change as is for now?
>  From what I can tell, the additional log message "load of migration failed"
> is not a breaking change and will not cause a crash. We can develop a more
> elegant solution to handle the issue of duplication in a separate patch.
There are two regressions:
1) Duplicate error reports when exit-on-error is false and postcopy is 
disabled.
2) Errors reported code else qemu_loadvm_state() are ignored when 
exit-on-error is true.

1) is trivial yet difficult to fix and I agree that it can be handled 
later. Ideally there should be a comment to note that.

However, there is also 2), which is more serious and easier to fix so I 
suggest fixing it now. More concretely, the code will look like as follows:

     migrate_set_error(s, local_err);
     if (mis->exit_on_error) {
         error_free(local_err);
     } else {
         /*
          * Report the error here in case that QEMU abruptly exits when
          * postcopy is enabled.
          */
         error_report_err(s->error);
     }

     migration_incoming_state_destroy();

     if (mis->exit_on_error) {
         WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
             error_report_err(s->error);

This ensures errors set by anyone will be reported while duplicate error 
reports are avoided when exit-on-error is true.

Regards,
Akihiko Odaki

Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 1 week, 4 days ago
Hi Akihiko,

On Wed, Sep 17, 2025 at 04:54:11PM +0900, Akihiko Odaki wrote:
> On 2025/09/17 9:34, Arun Menon wrote:
> > Hi Akihiko,
> > 
> > On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
> > > On 2025/09/03 8:47, Arun Menon wrote:
> > > > Hi Akihiko,
> > > > 
> > > > It took some time to set up the machines; apologies for the delay in response.
> > > > 
> > > > On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
> > > > > On 2025/09/01 1:38, Arun Menon wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
> > > > > > > On 2025/09/01 0:45, Arun Menon wrote:
> > > > > > > > Hi Akihiko,
> > > > > > > > Thanks for the review.
> > > > > > > > 
> > > > > > > > On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> > > > > > > > > On 2025/08/30 5:01, Arun Menon wrote:
> > > > > > > > > > This is an incremental step in converting vmstate loading
> > > > > > > > > > code to report error via Error objects instead of directly
> > > > > > > > > > printing it to console/monitor.
> > > > > > > > > > It is ensured that qemu_loadvm_state() must report an error
> > > > > > > > > > in errp, in case of failure.
> > > > > > > > > > 
> > > > > > > > > > When postcopy live migration runs, the device states are loaded by
> > > > > > > > > > both the qemu coroutine process_incoming_migration_co() and the
> > > > > > > > > > postcopy_ram_listen_thread(). Therefore, it is important that the
> > > > > > > > > > coroutine also reports the error in case of failure, with
> > > > > > > > > > error_report_err(). Otherwise, the source qemu will not display
> > > > > > > > > > any errors before going into the postcopy pause state.
> > > > > > > > > > 
> > > > > > > > > > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > > > > > > > > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > > > > > > > > > Signed-off-by: Arun Menon <armenon@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > >       migration/migration.c |  9 +++++----
> > > > > > > > > >       migration/savevm.c    | 30 ++++++++++++++++++------------
> > > > > > > > > >       migration/savevm.h    |  2 +-
> > > > > > > > > >       3 files changed, 24 insertions(+), 17 deletions(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > > > > > > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > > > > > > > > > --- a/migration/migration.c
> > > > > > > > > > +++ b/migration/migration.c
> > > > > > > > > > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> > > > > > > > > >                             MIGRATION_STATUS_ACTIVE);
> > > > > > > > > >           mis->loadvm_co = qemu_coroutine_self();
> > > > > > > > > > -    ret = qemu_loadvm_state(mis->from_src_file);
> > > > > > > > > > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> > > > > > > > > >           mis->loadvm_co = NULL;
> > > > > > > > > >           trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > > > > > > > > > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> > > > > > > > > >           }
> > > > > > > > > >           if (ret < 0) {
> > > > > > > > > > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > > > > > > > > > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > > > > > > > > > +                      strerror(-ret));
> > > > > > > > > >               goto fail;
> > > > > > > > > >           }
> > > > > > > > > > @@ -924,13 +925,13 @@ fail:
> > > > > > > > > >           migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > > > > > > >                             MIGRATION_STATUS_FAILED);
> > > > > > > > > >           migrate_set_error(s, local_err);
> > > > > > > > > > -    error_free(local_err);
> > > > > > > > > > +    error_report_err(local_err);
> > > > > > > > > 
> > > > > > > > > This is problematic because it results in duplicate error reports when
> > > > > > > > > !mis->exit_on_error; in that case the query-migrate QMP command reports the
> > > > > > > > > error and this error reporting is redundant.
> > > > > > > > 
> > > > > > > > If I comment this change, then all of the errors propagated up to now, using
> > > > > > > > error_setg() will not be reported. This is the place where it is finally reported,
> > > > > > > > when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> > > > > > > > from all the files, replacing them with error_setg(), will finally be reported here
> > > > > > > > using error_report_err().
> > > > > > > 
> > > > > > > My understanding of the code without these two changes is:
> > > > > > > - If the migrate-incoming QMP command is used with false as
> > > > > > >      exit-on-error, this function will not report the error but
> > > > > > >      the query-migrate QMP command will report the error.
> > > > > > > - Otherwise, this function reports the error.
> > > > > > 
> > > > > > With my limited experience in testing, I have a question,
> > > > > > So there are 2 scenarios,
> > > > > > 1. running the virsh migrate command on the source host. Something like the following,
> > > > > >      virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
> > > > > >      OR for postcopy-ram,
> > > > > >      virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > 
> > > > > > 2. Using QMP commands, performing a migration from source to destination.
> > > > > >      Running something like the following on the destination:
> > > > > >      {
> > > > > >        "execute": "migrate-incoming",
> > > > > >        "arguments": {
> > > > > >          "uri": "tcp:127.0.0.1:7777",
> > > > > >          "exit-on-error": false
> > > > > >        }
> > > > > >      }
> > > > > >      {
> > > > > >        "execute": "migrate-incoming",
> > > > > >        "arguments": {
> > > > > >          "uri": "tcp:127.0.0.1:7777",
> > > > > >          "exit-on-error": false
> > > > > >        }
> > > > > >      }
> > > > > >      and the somthing like the following on source:
> > > > > >      {
> > > > > >        "execute": "migrate",
> > > > > >        "arguments": {
> > > > > >          "uri": "tcp:127.0.0.1:7777"
> > > > > >        }
> > > > > >      }
> > > > > >      {"execute" : "query-migrate"}
> > > > > > 
> > > > > > In 1, previously, the user used to get an error message on migration failure.
> > > > > > This was because there were error_report() calls in all of the files.
> > > > > > Now that they are replaced with error_setg() and the error is stored in errp,
> > > > > > we need to display that using error_report_err(). Hence I introduced an error_report_err()
> > > > > > call in the fail section.
> > > > > > 
> > > > > > In 2, we have 2 QMP sessions, one for the source and another for the destination.
> > > > > > The QMP command migrate will be issued on the source, and the errp will be set.
> > > > > > I did not understand the part where the message will be displayed because of the
> > > > > > error_report_err() call. I did not see such a message on failure scenario on both
> > > > > > the sessions.
> > > > > > If the user wants to check for errors, then the destination qemu will not exit
> > > > > > (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
> > > > > > 
> > > > > > Aren't the 2 scenarios different by nature?
> > > > > 
> > > > > In 1, doesn't libvirt query the error with query-migrate and print it?
> > > > 
> > > > Ideally it should find the the error, and print the whole thing. It does work
> > > > in the normal scenario. However, the postcopy scenario does not show the same result,
> > > > which is mentioned in the commit message.
> > > > 
> > > > > 
> > > > > In any case, it would be nice if you describe how libvirt interacts with
> > > > > QEMU in 1.
> > > > 
> > > > Please find below the difference in the command output at source, when we run a live migration
> > > > with postcopy enabled.
> > > > 
> > > > =========
> > > > With the current changes:
> > > > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > > > 
> > > > [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > root@10.6.120.9's password:
> > > > Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
> > > > 
> > > > [root@dell-per750-42 build]#
> > > > 
> > > > =========
> > > > 
> > > > Without the current changes:
> > > > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > > > 
> > > > [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > root@10.6.120.9's password:
> > > > Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > 
> > > > [root@dell-per750-42 qemu-priv]#
> > > > 
> > > > =========
> > > > The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
> > > 
> > > This was true for messages in qemu_loadvm_state(), but the message "load of
> > > migration failed" was printed or queried with query-migrate, not both. We
> > > should think of which behavior is more appropriate, and I think we should
> > > avoid duplicate reports.
> > > 
> > > > The source machine goes in to a paused state after this.
> > > 
> > > The output is informative. It implies the destination machine exited, and it
> > > makes sense to print error messages as it is done for
> > > mis->exit_on_error. I wonder if it is possible to detect the condition and
> > > treat it identically to mis->exit_on_error.
> > 
> > I see that we want to catch a specific scenario in postcopy ram migration
> > where the destination abruptly exits without a graceful shutdown,
> > thus failing to inform the source the reason for its failure through a
> > 'query-migrate' even though 'exit-on-error' was set to false on the destination.
> > 
> > However, I am not sure how to reliably detect the specific error condition of
> > such a connection close that you have described. Given that this is a large
> > patch series already, could we keep the current change as is for now?
> >  From what I can tell, the additional log message "load of migration failed"
> > is not a breaking change and will not cause a crash. We can develop a more
> > elegant solution to handle the issue of duplication in a separate patch.
> There are two regressions:
> 1) Duplicate error reports when exit-on-error is false and postcopy is
> disabled.

Thank you for your detailed response.
I am trying to fully understand the logic here, so please correct me if I'm wrong.
My understanding of the patch is that all errors are chained together
in local_err by propagating them through the call stack and prepending them with
"load of migration failed."
This creates a single, comprehensive error message, which is then passed to 
  migrate_set_error(s, local_err) in the fail section.
This line already existed. So in effect we are setting the error here.

Regarding the second regression you mentioned:

> 2) Errors reported code else qemu_loadvm_state() are ignored when
> exit-on-error is true.

Doesn't the error_prepend() function ensure that errors set higher up the call stack
are preserved and only "load of migration failed" is prepended to it?
When the fail section is reached, local_err will have the complete error message.

> 
> 1) is trivial yet difficult to fix and I agree that it can be handled later.
> Ideally there should be a comment to note that.
> 
> However, there is also 2), which is more serious and easier to fix so I
> suggest fixing it now. More concretely, the code will look like as follows:
> 
>     migrate_set_error(s, local_err);
>     if (mis->exit_on_error) {
>         error_free(local_err);
>     } else {
>         /*
>          * Report the error here in case that QEMU abruptly exits when
>          * postcopy is enabled.
>          */
>         error_report_err(s->error);
>     }
> 
>     migration_incoming_state_destroy();
> 
>     if (mis->exit_on_error) {
>         WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>             error_report_err(s->error);
> 
> This ensures errors set by anyone will be reported while duplicate error
> reports are avoided when exit-on-error is true.

yes, this part (set by anyone), I fail to follow. Do you mean that there can be
chances of a race condition between migrate_set_error() and error_report_err()?

My analysis of your proposed code shows that whether mis->exit_on_error is true (in the final if block)
or false (in the else block), error_report_err() is called. 
This seems to result in the error being reported regardless, which is similar to my original patch.
Yes in both the cases, s->error is being reported. However in my view, in the fail section, both
local_err and s->error will be the same.

I appreciate your patience as I try to understand this better.


> 
> Regards,
> Akihiko Odaki
> 

Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 1 week, 3 days ago
On 2025/09/17 23:27, Arun Menon wrote:
> Hi Akihiko,
> 
> On Wed, Sep 17, 2025 at 04:54:11PM +0900, Akihiko Odaki wrote:
>> On 2025/09/17 9:34, Arun Menon wrote:
>>> Hi Akihiko,
>>>
>>> On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
>>>> On 2025/09/03 8:47, Arun Menon wrote:
>>>>> Hi Akihiko,
>>>>>
>>>>> It took some time to set up the machines; apologies for the delay in response.
>>>>>
>>>>> On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
>>>>>> On 2025/09/01 1:38, Arun Menon wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
>>>>>>>> On 2025/09/01 0:45, Arun Menon wrote:
>>>>>>>>> Hi Akihiko,
>>>>>>>>> Thanks for the review.
>>>>>>>>>
>>>>>>>>> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>>>>>>>>>> On 2025/08/30 5:01, Arun Menon wrote:
>>>>>>>>>>> This is an incremental step in converting vmstate loading
>>>>>>>>>>> code to report error via Error objects instead of directly
>>>>>>>>>>> printing it to console/monitor.
>>>>>>>>>>> It is ensured that qemu_loadvm_state() must report an error
>>>>>>>>>>> in errp, in case of failure.
>>>>>>>>>>>
>>>>>>>>>>> When postcopy live migration runs, the device states are loaded by
>>>>>>>>>>> both the qemu coroutine process_incoming_migration_co() and the
>>>>>>>>>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>>>>>>>>>> coroutine also reports the error in case of failure, with
>>>>>>>>>>> error_report_err(). Otherwise, the source qemu will not display
>>>>>>>>>>> any errors before going into the postcopy pause state.
>>>>>>>>>>>
>>>>>>>>>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>>>>>>>>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>>>>>>>>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>>>>>>>>>> ---
>>>>>>>>>>>        migration/migration.c |  9 +++++----
>>>>>>>>>>>        migration/savevm.c    | 30 ++++++++++++++++++------------
>>>>>>>>>>>        migration/savevm.h    |  2 +-
>>>>>>>>>>>        3 files changed, 24 insertions(+), 17 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>>>>>>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>>>>>>>>>> --- a/migration/migration.c
>>>>>>>>>>> +++ b/migration/migration.c
>>>>>>>>>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>>>                              MIGRATION_STATUS_ACTIVE);
>>>>>>>>>>>            mis->loadvm_co = qemu_coroutine_self();
>>>>>>>>>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>>>>>>>>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>>>>>>>>>            mis->loadvm_co = NULL;
>>>>>>>>>>>            trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>>>>>>>>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>>>            }
>>>>>>>>>>>            if (ret < 0) {
>>>>>>>>>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>>>>>>>>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>>>>>>>>>> +                      strerror(-ret));
>>>>>>>>>>>                goto fail;
>>>>>>>>>>>            }
>>>>>>>>>>> @@ -924,13 +925,13 @@ fail:
>>>>>>>>>>>            migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>>>>>>>>                              MIGRATION_STATUS_FAILED);
>>>>>>>>>>>            migrate_set_error(s, local_err);
>>>>>>>>>>> -    error_free(local_err);
>>>>>>>>>>> +    error_report_err(local_err);
>>>>>>>>>>
>>>>>>>>>> This is problematic because it results in duplicate error reports when
>>>>>>>>>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>>>>>>>>>> error and this error reporting is redundant.
>>>>>>>>>
>>>>>>>>> If I comment this change, then all of the errors propagated up to now, using
>>>>>>>>> error_setg() will not be reported. This is the place where it is finally reported,
>>>>>>>>> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
>>>>>>>>> from all the files, replacing them with error_setg(), will finally be reported here
>>>>>>>>> using error_report_err().
>>>>>>>>
>>>>>>>> My understanding of the code without these two changes is:
>>>>>>>> - If the migrate-incoming QMP command is used with false as
>>>>>>>>       exit-on-error, this function will not report the error but
>>>>>>>>       the query-migrate QMP command will report the error.
>>>>>>>> - Otherwise, this function reports the error.
>>>>>>>
>>>>>>> With my limited experience in testing, I have a question,
>>>>>>> So there are 2 scenarios,
>>>>>>> 1. running the virsh migrate command on the source host. Something like the following,
>>>>>>>       virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
>>>>>>>       OR for postcopy-ram,
>>>>>>>       virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>>
>>>>>>> 2. Using QMP commands, performing a migration from source to destination.
>>>>>>>       Running something like the following on the destination:
>>>>>>>       {
>>>>>>>         "execute": "migrate-incoming",
>>>>>>>         "arguments": {
>>>>>>>           "uri": "tcp:127.0.0.1:7777",
>>>>>>>           "exit-on-error": false
>>>>>>>         }
>>>>>>>       }
>>>>>>>       {
>>>>>>>         "execute": "migrate-incoming",
>>>>>>>         "arguments": {
>>>>>>>           "uri": "tcp:127.0.0.1:7777",
>>>>>>>           "exit-on-error": false
>>>>>>>         }
>>>>>>>       }
>>>>>>>       and the somthing like the following on source:
>>>>>>>       {
>>>>>>>         "execute": "migrate",
>>>>>>>         "arguments": {
>>>>>>>           "uri": "tcp:127.0.0.1:7777"
>>>>>>>         }
>>>>>>>       }
>>>>>>>       {"execute" : "query-migrate"}
>>>>>>>
>>>>>>> In 1, previously, the user used to get an error message on migration failure.
>>>>>>> This was because there were error_report() calls in all of the files.
>>>>>>> Now that they are replaced with error_setg() and the error is stored in errp,
>>>>>>> we need to display that using error_report_err(). Hence I introduced an error_report_err()
>>>>>>> call in the fail section.
>>>>>>>
>>>>>>> In 2, we have 2 QMP sessions, one for the source and another for the destination.
>>>>>>> The QMP command migrate will be issued on the source, and the errp will be set.
>>>>>>> I did not understand the part where the message will be displayed because of the
>>>>>>> error_report_err() call. I did not see such a message on failure scenario on both
>>>>>>> the sessions.
>>>>>>> If the user wants to check for errors, then the destination qemu will not exit
>>>>>>> (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
>>>>>>>
>>>>>>> Aren't the 2 scenarios different by nature?
>>>>>>
>>>>>> In 1, doesn't libvirt query the error with query-migrate and print it?
>>>>>
>>>>> Ideally it should find the the error, and print the whole thing. It does work
>>>>> in the normal scenario. However, the postcopy scenario does not show the same result,
>>>>> which is mentioned in the commit message.
>>>>>
>>>>>>
>>>>>> In any case, it would be nice if you describe how libvirt interacts with
>>>>>> QEMU in 1.
>>>>>
>>>>> Please find below the difference in the command output at source, when we run a live migration
>>>>> with postcopy enabled.
>>>>>
>>>>> =========
>>>>> With the current changes:
>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>>>
>>>>> [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>> root@10.6.120.9's password:
>>>>> Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>> 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>> 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
>>>>>
>>>>> [root@dell-per750-42 build]#
>>>>>
>>>>> =========
>>>>>
>>>>> Without the current changes:
>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>>>
>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>> root@10.6.120.9's password:
>>>>> Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>> 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>
>>>>> [root@dell-per750-42 qemu-priv]#
>>>>>
>>>>> =========
>>>>> The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
>>>>
>>>> This was true for messages in qemu_loadvm_state(), but the message "load of
>>>> migration failed" was printed or queried with query-migrate, not both. We
>>>> should think of which behavior is more appropriate, and I think we should
>>>> avoid duplicate reports.
>>>>
>>>>> The source machine goes in to a paused state after this.
>>>>
>>>> The output is informative. It implies the destination machine exited, and it
>>>> makes sense to print error messages as it is done for
>>>> mis->exit_on_error. I wonder if it is possible to detect the condition and
>>>> treat it identically to mis->exit_on_error.
>>>
>>> I see that we want to catch a specific scenario in postcopy ram migration
>>> where the destination abruptly exits without a graceful shutdown,
>>> thus failing to inform the source the reason for its failure through a
>>> 'query-migrate' even though 'exit-on-error' was set to false on the destination.
>>>
>>> However, I am not sure how to reliably detect the specific error condition of
>>> such a connection close that you have described. Given that this is a large
>>> patch series already, could we keep the current change as is for now?
>>>   From what I can tell, the additional log message "load of migration failed"
>>> is not a breaking change and will not cause a crash. We can develop a more
>>> elegant solution to handle the issue of duplication in a separate patch.
>> There are two regressions:
>> 1) Duplicate error reports when exit-on-error is false and postcopy is
>> disabled.
> 
> Thank you for your detailed response.
> I am trying to fully understand the logic here, so please correct me if I'm wrong.
> My understanding of the patch is that all errors are chained together
> in local_err by propagating them through the call stack and prepending them with
> "load of migration failed."
> This creates a single, comprehensive error message, which is then passed to
>    migrate_set_error(s, local_err) in the fail section.
> This line already existed. So in effect we are setting the error here.

There are two other changes:
- The message is printed immediately with error_report_err(). This 
causes 1).
- The error set with migrate_set_error() is ignored. This causes 2).

> 
> Regarding the second regression you mentioned:
> 
>> 2) Errors reported code else qemu_loadvm_state() are ignored when
>> exit-on-error is true.
> 
> Doesn't the error_prepend() function ensure that errors set higher up the call stack
> are preserved and only "load of migration failed" is prepended to it?
> When the fail section is reached, local_err will have the complete error message.
> 
>>
>> 1) is trivial yet difficult to fix and I agree that it can be handled later.
>> Ideally there should be a comment to note that.
>>
>> However, there is also 2), which is more serious and easier to fix so I
>> suggest fixing it now. More concretely, the code will look like as follows:
>>
>>      migrate_set_error(s, local_err);
>>      if (mis->exit_on_error) {
>>          error_free(local_err);
>>      } else {
>>          /*
>>           * Report the error here in case that QEMU abruptly exits when
>>           * postcopy is enabled.
>>           */
>>          error_report_err(s->error);
>>      }
>>
>>      migration_incoming_state_destroy();
>>
>>      if (mis->exit_on_error) {
>>          WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>              error_report_err(s->error);
>>
>> This ensures errors set by anyone will be reported while duplicate error
>> reports are avoided when exit-on-error is true.
> 
> yes, this part (set by anyone), I fail to follow. Do you mean that there can be
> chances of a race condition between migrate_set_error() and error_report_err()?
> 
> My analysis of your proposed code shows that whether mis->exit_on_error is true (in the final if block)
> or false (in the else block), error_report_err() is called.
> This seems to result in the error being reported regardless, which is similar to my original patch.
> Yes in both the cases, s->error is being reported. However in my view, in the fail section, both
> local_err and s->error will be the same.
> 
> I appreciate your patience as I try to understand this better.

I indeed suspect the possibility of a race condition between 
migration_set_error() and error_report_err(). The code implies an error 
can be concurrently reported when migration_incoming_state_destroy() is 
called, and synchronized with WITH_QEMU_LOCK_GUARD(&s->error_mutex).

This code fragment is useless if there is no such concurrency at all and 
should look like the following:

     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_FAILED);
     error_report_err(local_err);

     if (mis->exit_on_error) {
         exit(EXIT_FAILURE);
     }

     migrate_set_error(s, local_err);

     migration_incoming_state_destroy();

But it is not how the code is written, so I suspect there is a race 
condition.

Regards,
Akihiko Odaki

Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 1 week, 3 days ago
Hi Akihiko,

On Thu, Sep 18, 2025 at 11:06:58AM +0900, Akihiko Odaki wrote:
> On 2025/09/17 23:27, Arun Menon wrote:
> > Hi Akihiko,
> > 
> > On Wed, Sep 17, 2025 at 04:54:11PM +0900, Akihiko Odaki wrote:
> > > On 2025/09/17 9:34, Arun Menon wrote:
> > > > Hi Akihiko,
> > > > 
> > > > On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
> > > > > On 2025/09/03 8:47, Arun Menon wrote:
> > > > > > Hi Akihiko,
> > > > > > 
> > > > > > It took some time to set up the machines; apologies for the delay in response.
> > > > > > 
> > > > > > On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
> > > > > > > On 2025/09/01 1:38, Arun Menon wrote:
> > > > > > > > Hi,
> > > > > > > > 
> > > > > > > > On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
> > > > > > > > > On 2025/09/01 0:45, Arun Menon wrote:
> > > > > > > > > > Hi Akihiko,
> > > > > > > > > > Thanks for the review.
> > > > > > > > > > 
> > > > > > > > > > On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> > > > > > > > > > > On 2025/08/30 5:01, Arun Menon wrote:
> > > > > > > > > > > > This is an incremental step in converting vmstate loading
> > > > > > > > > > > > code to report error via Error objects instead of directly
> > > > > > > > > > > > printing it to console/monitor.
> > > > > > > > > > > > It is ensured that qemu_loadvm_state() must report an error
> > > > > > > > > > > > in errp, in case of failure.
> > > > > > > > > > > > 
> > > > > > > > > > > > When postcopy live migration runs, the device states are loaded by
> > > > > > > > > > > > both the qemu coroutine process_incoming_migration_co() and the
> > > > > > > > > > > > postcopy_ram_listen_thread(). Therefore, it is important that the
> > > > > > > > > > > > coroutine also reports the error in case of failure, with
> > > > > > > > > > > > error_report_err(). Otherwise, the source qemu will not display
> > > > > > > > > > > > any errors before going into the postcopy pause state.
> > > > > > > > > > > > 
> > > > > > > > > > > > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > > > > > > > > > > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > > > > > > > > > > > Signed-off-by: Arun Menon <armenon@redhat.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > >        migration/migration.c |  9 +++++----
> > > > > > > > > > > >        migration/savevm.c    | 30 ++++++++++++++++++------------
> > > > > > > > > > > >        migration/savevm.h    |  2 +-
> > > > > > > > > > > >        3 files changed, 24 insertions(+), 17 deletions(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > > > > > > > > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > > > > > > > > > > > --- a/migration/migration.c
> > > > > > > > > > > > +++ b/migration/migration.c
> > > > > > > > > > > > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> > > > > > > > > > > >                              MIGRATION_STATUS_ACTIVE);
> > > > > > > > > > > >            mis->loadvm_co = qemu_coroutine_self();
> > > > > > > > > > > > -    ret = qemu_loadvm_state(mis->from_src_file);
> > > > > > > > > > > > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> > > > > > > > > > > >            mis->loadvm_co = NULL;
> > > > > > > > > > > >            trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > > > > > > > > > > > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> > > > > > > > > > > >            }
> > > > > > > > > > > >            if (ret < 0) {
> > > > > > > > > > > > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > > > > > > > > > > > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > > > > > > > > > > > +                      strerror(-ret));
> > > > > > > > > > > >                goto fail;
> > > > > > > > > > > >            }
> > > > > > > > > > > > @@ -924,13 +925,13 @@ fail:
> > > > > > > > > > > >            migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > > > > > > > > >                              MIGRATION_STATUS_FAILED);
> > > > > > > > > > > >            migrate_set_error(s, local_err);
> > > > > > > > > > > > -    error_free(local_err);
> > > > > > > > > > > > +    error_report_err(local_err);
> > > > > > > > > > > 
> > > > > > > > > > > This is problematic because it results in duplicate error reports when
> > > > > > > > > > > !mis->exit_on_error; in that case the query-migrate QMP command reports the
> > > > > > > > > > > error and this error reporting is redundant.
> > > > > > > > > > 
> > > > > > > > > > If I comment this change, then all of the errors propagated up to now, using
> > > > > > > > > > error_setg() will not be reported. This is the place where it is finally reported,
> > > > > > > > > > when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> > > > > > > > > > from all the files, replacing them with error_setg(), will finally be reported here
> > > > > > > > > > using error_report_err().
> > > > > > > > > 
> > > > > > > > > My understanding of the code without these two changes is:
> > > > > > > > > - If the migrate-incoming QMP command is used with false as
> > > > > > > > >       exit-on-error, this function will not report the error but
> > > > > > > > >       the query-migrate QMP command will report the error.
> > > > > > > > > - Otherwise, this function reports the error.
> > > > > > > > 
> > > > > > > > With my limited experience in testing, I have a question,
> > > > > > > > So there are 2 scenarios,
> > > > > > > > 1. running the virsh migrate command on the source host. Something like the following,
> > > > > > > >       virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
> > > > > > > >       OR for postcopy-ram,
> > > > > > > >       virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > > > 
> > > > > > > > 2. Using QMP commands, performing a migration from source to destination.
> > > > > > > >       Running something like the following on the destination:
> > > > > > > >       {
> > > > > > > >         "execute": "migrate-incoming",
> > > > > > > >         "arguments": {
> > > > > > > >           "uri": "tcp:127.0.0.1:7777",
> > > > > > > >           "exit-on-error": false
> > > > > > > >         }
> > > > > > > >       }
> > > > > > > >       {
> > > > > > > >         "execute": "migrate-incoming",
> > > > > > > >         "arguments": {
> > > > > > > >           "uri": "tcp:127.0.0.1:7777",
> > > > > > > >           "exit-on-error": false
> > > > > > > >         }
> > > > > > > >       }
> > > > > > > >       and the somthing like the following on source:
> > > > > > > >       {
> > > > > > > >         "execute": "migrate",
> > > > > > > >         "arguments": {
> > > > > > > >           "uri": "tcp:127.0.0.1:7777"
> > > > > > > >         }
> > > > > > > >       }
> > > > > > > >       {"execute" : "query-migrate"}
> > > > > > > > 
> > > > > > > > In 1, previously, the user used to get an error message on migration failure.
> > > > > > > > This was because there were error_report() calls in all of the files.
> > > > > > > > Now that they are replaced with error_setg() and the error is stored in errp,
> > > > > > > > we need to display that using error_report_err(). Hence I introduced an error_report_err()
> > > > > > > > call in the fail section.
> > > > > > > > 
> > > > > > > > In 2, we have 2 QMP sessions, one for the source and another for the destination.
> > > > > > > > The QMP command migrate will be issued on the source, and the errp will be set.
> > > > > > > > I did not understand the part where the message will be displayed because of the
> > > > > > > > error_report_err() call. I did not see such a message on failure scenario on both
> > > > > > > > the sessions.
> > > > > > > > If the user wants to check for errors, then the destination qemu will not exit
> > > > > > > > (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
> > > > > > > > 
> > > > > > > > Aren't the 2 scenarios different by nature?
> > > > > > > 
> > > > > > > In 1, doesn't libvirt query the error with query-migrate and print it?
> > > > > > 
> > > > > > Ideally it should find the the error, and print the whole thing. It does work
> > > > > > in the normal scenario. However, the postcopy scenario does not show the same result,
> > > > > > which is mentioned in the commit message.
> > > > > > 
> > > > > > > 
> > > > > > > In any case, it would be nice if you describe how libvirt interacts with
> > > > > > > QEMU in 1.
> > > > > > 
> > > > > > Please find below the difference in the command output at source, when we run a live migration
> > > > > > with postcopy enabled.
> > > > > > 
> > > > > > =========
> > > > > > With the current changes:
> > > > > > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > > > > > 
> > > > > > [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > root@10.6.120.9's password:
> > > > > > Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
> > > > > > 
> > > > > > [root@dell-per750-42 build]#
> > > > > > 
> > > > > > =========
> > > > > > 
> > > > > > Without the current changes:
> > > > > > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > > > > > 
> > > > > > [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > root@10.6.120.9's password:
> > > > > > Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > 
> > > > > > [root@dell-per750-42 qemu-priv]#
> > > > > > 
> > > > > > =========
> > > > > > The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
> > > > > 
> > > > > This was true for messages in qemu_loadvm_state(), but the message "load of
> > > > > migration failed" was printed or queried with query-migrate, not both. We
> > > > > should think of which behavior is more appropriate, and I think we should
> > > > > avoid duplicate reports.
> > > > > 
> > > > > > The source machine goes in to a paused state after this.
> > > > > 
> > > > > The output is informative. It implies the destination machine exited, and it
> > > > > makes sense to print error messages as it is done for
> > > > > mis->exit_on_error. I wonder if it is possible to detect the condition and
> > > > > treat it identically to mis->exit_on_error.
> > > > 
> > > > I see that we want to catch a specific scenario in postcopy ram migration
> > > > where the destination abruptly exits without a graceful shutdown,
> > > > thus failing to inform the source the reason for its failure through a
> > > > 'query-migrate' even though 'exit-on-error' was set to false on the destination.
> > > > 
> > > > However, I am not sure how to reliably detect the specific error condition of
> > > > such a connection close that you have described. Given that this is a large
> > > > patch series already, could we keep the current change as is for now?
> > > >   From what I can tell, the additional log message "load of migration failed"
> > > > is not a breaking change and will not cause a crash. We can develop a more
> > > > elegant solution to handle the issue of duplication in a separate patch.
> > > There are two regressions:
> > > 1) Duplicate error reports when exit-on-error is false and postcopy is
> > > disabled.
> > 
> > Thank you for your detailed response.
> > I am trying to fully understand the logic here, so please correct me if I'm wrong.
> > My understanding of the patch is that all errors are chained together
> > in local_err by propagating them through the call stack and prepending them with
> > "load of migration failed."
> > This creates a single, comprehensive error message, which is then passed to
> >    migrate_set_error(s, local_err) in the fail section.
> > This line already existed. So in effect we are setting the error here.
> 
> There are two other changes:
> - The message is printed immediately with error_report_err(). This causes
> 1).
> - The error set with migrate_set_error() is ignored. This causes 2).
> 
> > 
> > Regarding the second regression you mentioned:
> > 
> > > 2) Errors reported code else qemu_loadvm_state() are ignored when
> > > exit-on-error is true.
> > 
> > Doesn't the error_prepend() function ensure that errors set higher up the call stack
> > are preserved and only "load of migration failed" is prepended to it?
> > When the fail section is reached, local_err will have the complete error message.
> > 
> > > 
> > > 1) is trivial yet difficult to fix and I agree that it can be handled later.
> > > Ideally there should be a comment to note that.
> > > 
> > > However, there is also 2), which is more serious and easier to fix so I
> > > suggest fixing it now. More concretely, the code will look like as follows:
> > > 
> > >      migrate_set_error(s, local_err);
> > >      if (mis->exit_on_error) {
> > >          error_free(local_err);
> > >      } else {
> > >          /*
> > >           * Report the error here in case that QEMU abruptly exits when
> > >           * postcopy is enabled.
> > >           */
> > >          error_report_err(s->error);
> > >      }
> > > 
> > >      migration_incoming_state_destroy();
> > > 
> > >      if (mis->exit_on_error) {
> > >          WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> > >              error_report_err(s->error);
> > > 
> > > This ensures errors set by anyone will be reported while duplicate error
> > > reports are avoided when exit-on-error is true.
> > 
> > yes, this part (set by anyone), I fail to follow. Do you mean that there can be
> > chances of a race condition between migrate_set_error() and error_report_err()?
> > 
> > My analysis of your proposed code shows that whether mis->exit_on_error is true (in the final if block)
> > or false (in the else block), error_report_err() is called.
> > This seems to result in the error being reported regardless, which is similar to my original patch.
> > Yes in both the cases, s->error is being reported. However in my view, in the fail section, both
> > local_err and s->error will be the same.
> > 
> > I appreciate your patience as I try to understand this better.
> 
> I indeed suspect the possibility of a race condition between
> migration_set_error() and error_report_err(). The code implies an error can
> be concurrently reported when migration_incoming_state_destroy() is called,
> and synchronized with WITH_QEMU_LOCK_GUARD(&s->error_mutex).
> 
> This code fragment is useless if there is no such concurrency at all and
> should look like the following:
> 
>     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                       MIGRATION_STATUS_FAILED);
>     error_report_err(local_err);
> 
>     if (mis->exit_on_error) {
>         exit(EXIT_FAILURE);
>     }
> 
>     migrate_set_error(s, local_err);
> 
>     migration_incoming_state_destroy();
> 
> But it is not how the code is written, so I suspect there is a race
> condition.

Thank you. That is clear to me now.
We need to report s->error because it can be set somewhere else.
We are creating separate paths but only adding a placeholder comment
for now for the postcopy scenario. I suppose local_err can be freed outside the if clause.
I shall amend and send a new version.

> 
> Regards,
> Akihiko Odaki
> 

Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 1 week, 3 days ago
On 2025/09/18 15:28, Arun Menon wrote:
> Hi Akihiko,
> 
> On Thu, Sep 18, 2025 at 11:06:58AM +0900, Akihiko Odaki wrote:
>> On 2025/09/17 23:27, Arun Menon wrote:
>>> Hi Akihiko,
>>>
>>> On Wed, Sep 17, 2025 at 04:54:11PM +0900, Akihiko Odaki wrote:
>>>> On 2025/09/17 9:34, Arun Menon wrote:
>>>>> Hi Akihiko,
>>>>>
>>>>> On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
>>>>>> On 2025/09/03 8:47, Arun Menon wrote:
>>>>>>> Hi Akihiko,
>>>>>>>
>>>>>>> It took some time to set up the machines; apologies for the delay in response.
>>>>>>>
>>>>>>> On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
>>>>>>>> On 2025/09/01 1:38, Arun Menon wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
>>>>>>>>>> On 2025/09/01 0:45, Arun Menon wrote:
>>>>>>>>>>> Hi Akihiko,
>>>>>>>>>>> Thanks for the review.
>>>>>>>>>>>
>>>>>>>>>>> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>>>>>>>>>>>> On 2025/08/30 5:01, Arun Menon wrote:
>>>>>>>>>>>>> This is an incremental step in converting vmstate loading
>>>>>>>>>>>>> code to report error via Error objects instead of directly
>>>>>>>>>>>>> printing it to console/monitor.
>>>>>>>>>>>>> It is ensured that qemu_loadvm_state() must report an error
>>>>>>>>>>>>> in errp, in case of failure.
>>>>>>>>>>>>>
>>>>>>>>>>>>> When postcopy live migration runs, the device states are loaded by
>>>>>>>>>>>>> both the qemu coroutine process_incoming_migration_co() and the
>>>>>>>>>>>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>>>>>>>>>>>> coroutine also reports the error in case of failure, with
>>>>>>>>>>>>> error_report_err(). Otherwise, the source qemu will not display
>>>>>>>>>>>>> any errors before going into the postcopy pause state.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>>>>>>>>>>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>>>>>>>>>>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>         migration/migration.c |  9 +++++----
>>>>>>>>>>>>>         migration/savevm.c    | 30 ++++++++++++++++++------------
>>>>>>>>>>>>>         migration/savevm.h    |  2 +-
>>>>>>>>>>>>>         3 files changed, 24 insertions(+), 17 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>>>>>>>>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>>>>>>>>>>>> --- a/migration/migration.c
>>>>>>>>>>>>> +++ b/migration/migration.c
>>>>>>>>>>>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>>>>>                               MIGRATION_STATUS_ACTIVE);
>>>>>>>>>>>>>             mis->loadvm_co = qemu_coroutine_self();
>>>>>>>>>>>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>>>>>>>>>>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>>>>>>>>>>>             mis->loadvm_co = NULL;
>>>>>>>>>>>>>             trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>>>>>>>>>>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>>>>>             }
>>>>>>>>>>>>>             if (ret < 0) {
>>>>>>>>>>>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>>>>>>>>>>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>>>>>>>>>>>> +                      strerror(-ret));
>>>>>>>>>>>>>                 goto fail;
>>>>>>>>>>>>>             }
>>>>>>>>>>>>> @@ -924,13 +925,13 @@ fail:
>>>>>>>>>>>>>             migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>>>>>>>>>>                               MIGRATION_STATUS_FAILED);
>>>>>>>>>>>>>             migrate_set_error(s, local_err);
>>>>>>>>>>>>> -    error_free(local_err);
>>>>>>>>>>>>> +    error_report_err(local_err);
>>>>>>>>>>>>
>>>>>>>>>>>> This is problematic because it results in duplicate error reports when
>>>>>>>>>>>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>>>>>>>>>>>> error and this error reporting is redundant.
>>>>>>>>>>>
>>>>>>>>>>> If I comment this change, then all of the errors propagated up to now, using
>>>>>>>>>>> error_setg() will not be reported. This is the place where it is finally reported,
>>>>>>>>>>> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
>>>>>>>>>>> from all the files, replacing them with error_setg(), will finally be reported here
>>>>>>>>>>> using error_report_err().
>>>>>>>>>>
>>>>>>>>>> My understanding of the code without these two changes is:
>>>>>>>>>> - If the migrate-incoming QMP command is used with false as
>>>>>>>>>>        exit-on-error, this function will not report the error but
>>>>>>>>>>        the query-migrate QMP command will report the error.
>>>>>>>>>> - Otherwise, this function reports the error.
>>>>>>>>>
>>>>>>>>> With my limited experience in testing, I have a question,
>>>>>>>>> So there are 2 scenarios,
>>>>>>>>> 1. running the virsh migrate command on the source host. Something like the following,
>>>>>>>>>        virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
>>>>>>>>>        OR for postcopy-ram,
>>>>>>>>>        virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>>>>
>>>>>>>>> 2. Using QMP commands, performing a migration from source to destination.
>>>>>>>>>        Running something like the following on the destination:
>>>>>>>>>        {
>>>>>>>>>          "execute": "migrate-incoming",
>>>>>>>>>          "arguments": {
>>>>>>>>>            "uri": "tcp:127.0.0.1:7777",
>>>>>>>>>            "exit-on-error": false
>>>>>>>>>          }
>>>>>>>>>        }
>>>>>>>>>        {
>>>>>>>>>          "execute": "migrate-incoming",
>>>>>>>>>          "arguments": {
>>>>>>>>>            "uri": "tcp:127.0.0.1:7777",
>>>>>>>>>            "exit-on-error": false
>>>>>>>>>          }
>>>>>>>>>        }
>>>>>>>>>        and the somthing like the following on source:
>>>>>>>>>        {
>>>>>>>>>          "execute": "migrate",
>>>>>>>>>          "arguments": {
>>>>>>>>>            "uri": "tcp:127.0.0.1:7777"
>>>>>>>>>          }
>>>>>>>>>        }
>>>>>>>>>        {"execute" : "query-migrate"}
>>>>>>>>>
>>>>>>>>> In 1, previously, the user used to get an error message on migration failure.
>>>>>>>>> This was because there were error_report() calls in all of the files.
>>>>>>>>> Now that they are replaced with error_setg() and the error is stored in errp,
>>>>>>>>> we need to display that using error_report_err(). Hence I introduced an error_report_err()
>>>>>>>>> call in the fail section.
>>>>>>>>>
>>>>>>>>> In 2, we have 2 QMP sessions, one for the source and another for the destination.
>>>>>>>>> The QMP command migrate will be issued on the source, and the errp will be set.
>>>>>>>>> I did not understand the part where the message will be displayed because of the
>>>>>>>>> error_report_err() call. I did not see such a message on failure scenario on both
>>>>>>>>> the sessions.
>>>>>>>>> If the user wants to check for errors, then the destination qemu will not exit
>>>>>>>>> (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
>>>>>>>>>
>>>>>>>>> Aren't the 2 scenarios different by nature?
>>>>>>>>
>>>>>>>> In 1, doesn't libvirt query the error with query-migrate and print it?
>>>>>>>
>>>>>>> Ideally it should find the the error, and print the whole thing. It does work
>>>>>>> in the normal scenario. However, the postcopy scenario does not show the same result,
>>>>>>> which is mentioned in the commit message.
>>>>>>>
>>>>>>>>
>>>>>>>> In any case, it would be nice if you describe how libvirt interacts with
>>>>>>>> QEMU in 1.
>>>>>>>
>>>>>>> Please find below the difference in the command output at source, when we run a live migration
>>>>>>> with postcopy enabled.
>>>>>>>
>>>>>>> =========
>>>>>>> With the current changes:
>>>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>>>>>
>>>>>>> [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>> root@10.6.120.9's password:
>>>>>>> Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>> 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>> 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
>>>>>>>
>>>>>>> [root@dell-per750-42 build]#
>>>>>>>
>>>>>>> =========
>>>>>>>
>>>>>>> Without the current changes:
>>>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>>>>>
>>>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>> root@10.6.120.9's password:
>>>>>>> Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>> 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>>
>>>>>>> [root@dell-per750-42 qemu-priv]#
>>>>>>>
>>>>>>> =========
>>>>>>> The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
>>>>>>
>>>>>> This was true for messages in qemu_loadvm_state(), but the message "load of
>>>>>> migration failed" was printed or queried with query-migrate, not both. We
>>>>>> should think of which behavior is more appropriate, and I think we should
>>>>>> avoid duplicate reports.
>>>>>>
>>>>>>> The source machine goes in to a paused state after this.
>>>>>>
>>>>>> The output is informative. It implies the destination machine exited, and it
>>>>>> makes sense to print error messages as it is done for
>>>>>> mis->exit_on_error. I wonder if it is possible to detect the condition and
>>>>>> treat it identically to mis->exit_on_error.
>>>>>
>>>>> I see that we want to catch a specific scenario in postcopy ram migration
>>>>> where the destination abruptly exits without a graceful shutdown,
>>>>> thus failing to inform the source the reason for its failure through a
>>>>> 'query-migrate' even though 'exit-on-error' was set to false on the destination.
>>>>>
>>>>> However, I am not sure how to reliably detect the specific error condition of
>>>>> such a connection close that you have described. Given that this is a large
>>>>> patch series already, could we keep the current change as is for now?
>>>>>    From what I can tell, the additional log message "load of migration failed"
>>>>> is not a breaking change and will not cause a crash. We can develop a more
>>>>> elegant solution to handle the issue of duplication in a separate patch.
>>>> There are two regressions:
>>>> 1) Duplicate error reports when exit-on-error is false and postcopy is
>>>> disabled.
>>>
>>> Thank you for your detailed response.
>>> I am trying to fully understand the logic here, so please correct me if I'm wrong.
>>> My understanding of the patch is that all errors are chained together
>>> in local_err by propagating them through the call stack and prepending them with
>>> "load of migration failed."
>>> This creates a single, comprehensive error message, which is then passed to
>>>     migrate_set_error(s, local_err) in the fail section.
>>> This line already existed. So in effect we are setting the error here.
>>
>> There are two other changes:
>> - The message is printed immediately with error_report_err(). This causes
>> 1).
>> - The error set with migrate_set_error() is ignored. This causes 2).
>>
>>>
>>> Regarding the second regression you mentioned:
>>>
>>>> 2) Errors reported code else qemu_loadvm_state() are ignored when
>>>> exit-on-error is true.
>>>
>>> Doesn't the error_prepend() function ensure that errors set higher up the call stack
>>> are preserved and only "load of migration failed" is prepended to it?
>>> When the fail section is reached, local_err will have the complete error message.
>>>
>>>>
>>>> 1) is trivial yet difficult to fix and I agree that it can be handled later.
>>>> Ideally there should be a comment to note that.
>>>>
>>>> However, there is also 2), which is more serious and easier to fix so I
>>>> suggest fixing it now. More concretely, the code will look like as follows:
>>>>
>>>>       migrate_set_error(s, local_err);
>>>>       if (mis->exit_on_error) {
>>>>           error_free(local_err);
>>>>       } else {
>>>>           /*
>>>>            * Report the error here in case that QEMU abruptly exits when
>>>>            * postcopy is enabled.
>>>>            */
>>>>           error_report_err(s->error);
>>>>       }
>>>>
>>>>       migration_incoming_state_destroy();
>>>>
>>>>       if (mis->exit_on_error) {
>>>>           WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>>>               error_report_err(s->error);
>>>>
>>>> This ensures errors set by anyone will be reported while duplicate error
>>>> reports are avoided when exit-on-error is true.
>>>
>>> yes, this part (set by anyone), I fail to follow. Do you mean that there can be
>>> chances of a race condition between migrate_set_error() and error_report_err()?
>>>
>>> My analysis of your proposed code shows that whether mis->exit_on_error is true (in the final if block)
>>> or false (in the else block), error_report_err() is called.
>>> This seems to result in the error being reported regardless, which is similar to my original patch.
>>> Yes in both the cases, s->error is being reported. However in my view, in the fail section, both
>>> local_err and s->error will be the same.
>>>
>>> I appreciate your patience as I try to understand this better.
>>
>> I indeed suspect the possibility of a race condition between
>> migration_set_error() and error_report_err(). The code implies an error can
>> be concurrently reported when migration_incoming_state_destroy() is called,
>> and synchronized with WITH_QEMU_LOCK_GUARD(&s->error_mutex).
>>
>> This code fragment is useless if there is no such concurrency at all and
>> should look like the following:
>>
>>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>                        MIGRATION_STATUS_FAILED);
>>      error_report_err(local_err);
>>
>>      if (mis->exit_on_error) {
>>          exit(EXIT_FAILURE);
>>      }
>>
>>      migrate_set_error(s, local_err);
>>
>>      migration_incoming_state_destroy();
>>
>> But it is not how the code is written, so I suspect there is a race
>> condition.
> 
> Thank you. That is clear to me now.
> We need to report s->error because it can be set somewhere else.
> We are creating separate paths but only adding a placeholder comment
> for now for the postcopy scenario. I suppose local_err can be freed outside the if clause.

The separate paths or the if clause for error_free() and 
error_report_err() is necessary because both functions free the error 
object and calling both results in double-free.

Regards,
Akihiko Odaki

Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Arun Menon 1 week, 3 days ago
Hi Akihiko,

On Thu, Sep 18, 2025 at 09:08:05PM +0900, Akihiko Odaki wrote:
> On 2025/09/18 15:28, Arun Menon wrote:
> > Hi Akihiko,
> > 
> > On Thu, Sep 18, 2025 at 11:06:58AM +0900, Akihiko Odaki wrote:
> > > On 2025/09/17 23:27, Arun Menon wrote:
> > > > Hi Akihiko,
> > > > 
> > > > On Wed, Sep 17, 2025 at 04:54:11PM +0900, Akihiko Odaki wrote:
> > > > > On 2025/09/17 9:34, Arun Menon wrote:
> > > > > > Hi Akihiko,
> > > > > > 
> > > > > > On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
> > > > > > > On 2025/09/03 8:47, Arun Menon wrote:
> > > > > > > > Hi Akihiko,
> > > > > > > > 
> > > > > > > > It took some time to set up the machines; apologies for the delay in response.
> > > > > > > > 
> > > > > > > > On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
> > > > > > > > > On 2025/09/01 1:38, Arun Menon wrote:
> > > > > > > > > > Hi,
> > > > > > > > > > 
> > > > > > > > > > On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
> > > > > > > > > > > On 2025/09/01 0:45, Arun Menon wrote:
> > > > > > > > > > > > Hi Akihiko,
> > > > > > > > > > > > Thanks for the review.
> > > > > > > > > > > > 
> > > > > > > > > > > > On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
> > > > > > > > > > > > > On 2025/08/30 5:01, Arun Menon wrote:
> > > > > > > > > > > > > > This is an incremental step in converting vmstate loading
> > > > > > > > > > > > > > code to report error via Error objects instead of directly
> > > > > > > > > > > > > > printing it to console/monitor.
> > > > > > > > > > > > > > It is ensured that qemu_loadvm_state() must report an error
> > > > > > > > > > > > > > in errp, in case of failure.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > When postcopy live migration runs, the device states are loaded by
> > > > > > > > > > > > > > both the qemu coroutine process_incoming_migration_co() and the
> > > > > > > > > > > > > > postcopy_ram_listen_thread(). Therefore, it is important that the
> > > > > > > > > > > > > > coroutine also reports the error in case of failure, with
> > > > > > > > > > > > > > error_report_err(). Otherwise, the source qemu will not display
> > > > > > > > > > > > > > any errors before going into the postcopy pause state.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > > > > > > > > > > > > Reviewed-by: Fabiano Rosas <farosas@suse.de>
> > > > > > > > > > > > > > Signed-off-by: Arun Menon <armenon@redhat.com>
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >         migration/migration.c |  9 +++++----
> > > > > > > > > > > > > >         migration/savevm.c    | 30 ++++++++++++++++++------------
> > > > > > > > > > > > > >         migration/savevm.h    |  2 +-
> > > > > > > > > > > > > >         3 files changed, 24 insertions(+), 17 deletions(-)
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > > > > > > > > > > index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
> > > > > > > > > > > > > > --- a/migration/migration.c
> > > > > > > > > > > > > > +++ b/migration/migration.c
> > > > > > > > > > > > > > @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
> > > > > > > > > > > > > >                               MIGRATION_STATUS_ACTIVE);
> > > > > > > > > > > > > >             mis->loadvm_co = qemu_coroutine_self();
> > > > > > > > > > > > > > -    ret = qemu_loadvm_state(mis->from_src_file);
> > > > > > > > > > > > > > +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
> > > > > > > > > > > > > >             mis->loadvm_co = NULL;
> > > > > > > > > > > > > >             trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
> > > > > > > > > > > > > > @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
> > > > > > > > > > > > > >             }
> > > > > > > > > > > > > >             if (ret < 0) {
> > > > > > > > > > > > > > -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
> > > > > > > > > > > > > > +        error_prepend(&local_err, "load of migration failed: %s: ",
> > > > > > > > > > > > > > +                      strerror(-ret));
> > > > > > > > > > > > > >                 goto fail;
> > > > > > > > > > > > > >             }
> > > > > > > > > > > > > > @@ -924,13 +925,13 @@ fail:
> > > > > > > > > > > > > >             migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > > > > > > > > > > >                               MIGRATION_STATUS_FAILED);
> > > > > > > > > > > > > >             migrate_set_error(s, local_err);
> > > > > > > > > > > > > > -    error_free(local_err);
> > > > > > > > > > > > > > +    error_report_err(local_err);
> > > > > > > > > > > > > 
> > > > > > > > > > > > > This is problematic because it results in duplicate error reports when
> > > > > > > > > > > > > !mis->exit_on_error; in that case the query-migrate QMP command reports the
> > > > > > > > > > > > > error and this error reporting is redundant.
> > > > > > > > > > > > 
> > > > > > > > > > > > If I comment this change, then all of the errors propagated up to now, using
> > > > > > > > > > > > error_setg() will not be reported. This is the place where it is finally reported,
> > > > > > > > > > > > when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
> > > > > > > > > > > > from all the files, replacing them with error_setg(), will finally be reported here
> > > > > > > > > > > > using error_report_err().
> > > > > > > > > > > 
> > > > > > > > > > > My understanding of the code without these two changes is:
> > > > > > > > > > > - If the migrate-incoming QMP command is used with false as
> > > > > > > > > > >        exit-on-error, this function will not report the error but
> > > > > > > > > > >        the query-migrate QMP command will report the error.
> > > > > > > > > > > - Otherwise, this function reports the error.
> > > > > > > > > > 
> > > > > > > > > > With my limited experience in testing, I have a question,
> > > > > > > > > > So there are 2 scenarios,
> > > > > > > > > > 1. running the virsh migrate command on the source host. Something like the following,
> > > > > > > > > >        virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
> > > > > > > > > >        OR for postcopy-ram,
> > > > > > > > > >        virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > > > > > 
> > > > > > > > > > 2. Using QMP commands, performing a migration from source to destination.
> > > > > > > > > >        Running something like the following on the destination:
> > > > > > > > > >        {
> > > > > > > > > >          "execute": "migrate-incoming",
> > > > > > > > > >          "arguments": {
> > > > > > > > > >            "uri": "tcp:127.0.0.1:7777",
> > > > > > > > > >            "exit-on-error": false
> > > > > > > > > >          }
> > > > > > > > > >        }
> > > > > > > > > >        {
> > > > > > > > > >          "execute": "migrate-incoming",
> > > > > > > > > >          "arguments": {
> > > > > > > > > >            "uri": "tcp:127.0.0.1:7777",
> > > > > > > > > >            "exit-on-error": false
> > > > > > > > > >          }
> > > > > > > > > >        }
> > > > > > > > > >        and the somthing like the following on source:
> > > > > > > > > >        {
> > > > > > > > > >          "execute": "migrate",
> > > > > > > > > >          "arguments": {
> > > > > > > > > >            "uri": "tcp:127.0.0.1:7777"
> > > > > > > > > >          }
> > > > > > > > > >        }
> > > > > > > > > >        {"execute" : "query-migrate"}
> > > > > > > > > > 
> > > > > > > > > > In 1, previously, the user used to get an error message on migration failure.
> > > > > > > > > > This was because there were error_report() calls in all of the files.
> > > > > > > > > > Now that they are replaced with error_setg() and the error is stored in errp,
> > > > > > > > > > we need to display that using error_report_err(). Hence I introduced an error_report_err()
> > > > > > > > > > call in the fail section.
> > > > > > > > > > 
> > > > > > > > > > In 2, we have 2 QMP sessions, one for the source and another for the destination.
> > > > > > > > > > The QMP command migrate will be issued on the source, and the errp will be set.
> > > > > > > > > > I did not understand the part where the message will be displayed because of the
> > > > > > > > > > error_report_err() call. I did not see such a message on failure scenario on both
> > > > > > > > > > the sessions.
> > > > > > > > > > If the user wants to check for errors, then the destination qemu will not exit
> > > > > > > > > > (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
> > > > > > > > > > 
> > > > > > > > > > Aren't the 2 scenarios different by nature?
> > > > > > > > > 
> > > > > > > > > In 1, doesn't libvirt query the error with query-migrate and print it?
> > > > > > > > 
> > > > > > > > Ideally it should find the the error, and print the whole thing. It does work
> > > > > > > > in the normal scenario. However, the postcopy scenario does not show the same result,
> > > > > > > > which is mentioned in the commit message.
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > In any case, it would be nice if you describe how libvirt interacts with
> > > > > > > > > QEMU in 1.
> > > > > > > > 
> > > > > > > > Please find below the difference in the command output at source, when we run a live migration
> > > > > > > > with postcopy enabled.
> > > > > > > > 
> > > > > > > > =========
> > > > > > > > With the current changes:
> > > > > > > > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > > > > > > > 
> > > > > > > > [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > > > root@10.6.120.9's password:
> > > > > > > > Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > > > 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > > > 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
> > > > > > > > 
> > > > > > > > [root@dell-per750-42 build]#
> > > > > > > > 
> > > > > > > > =========
> > > > > > > > 
> > > > > > > > Without the current changes:
> > > > > > > > [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
> > > > > > > > 
> > > > > > > > [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
> > > > > > > > root@10.6.120.9's password:
> > > > > > > > Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > > > 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
> > > > > > > > 
> > > > > > > > [root@dell-per750-42 qemu-priv]#
> > > > > > > > 
> > > > > > > > =========
> > > > > > > > The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
> > > > > > > 
> > > > > > > This was true for messages in qemu_loadvm_state(), but the message "load of
> > > > > > > migration failed" was printed or queried with query-migrate, not both. We
> > > > > > > should think of which behavior is more appropriate, and I think we should
> > > > > > > avoid duplicate reports.
> > > > > > > 
> > > > > > > > The source machine goes in to a paused state after this.
> > > > > > > 
> > > > > > > The output is informative. It implies the destination machine exited, and it
> > > > > > > makes sense to print error messages as it is done for
> > > > > > > mis->exit_on_error. I wonder if it is possible to detect the condition and
> > > > > > > treat it identically to mis->exit_on_error.
> > > > > > 
> > > > > > I see that we want to catch a specific scenario in postcopy ram migration
> > > > > > where the destination abruptly exits without a graceful shutdown,
> > > > > > thus failing to inform the source the reason for its failure through a
> > > > > > 'query-migrate' even though 'exit-on-error' was set to false on the destination.
> > > > > > 
> > > > > > However, I am not sure how to reliably detect the specific error condition of
> > > > > > such a connection close that you have described. Given that this is a large
> > > > > > patch series already, could we keep the current change as is for now?
> > > > > >    From what I can tell, the additional log message "load of migration failed"
> > > > > > is not a breaking change and will not cause a crash. We can develop a more
> > > > > > elegant solution to handle the issue of duplication in a separate patch.
> > > > > There are two regressions:
> > > > > 1) Duplicate error reports when exit-on-error is false and postcopy is
> > > > > disabled.
> > > > 
> > > > Thank you for your detailed response.
> > > > I am trying to fully understand the logic here, so please correct me if I'm wrong.
> > > > My understanding of the patch is that all errors are chained together
> > > > in local_err by propagating them through the call stack and prepending them with
> > > > "load of migration failed."
> > > > This creates a single, comprehensive error message, which is then passed to
> > > >     migrate_set_error(s, local_err) in the fail section.
> > > > This line already existed. So in effect we are setting the error here.
> > > 
> > > There are two other changes:
> > > - The message is printed immediately with error_report_err(). This causes
> > > 1).
> > > - The error set with migrate_set_error() is ignored. This causes 2).
> > > 
> > > > 
> > > > Regarding the second regression you mentioned:
> > > > 
> > > > > 2) Errors reported code else qemu_loadvm_state() are ignored when
> > > > > exit-on-error is true.
> > > > 
> > > > Doesn't the error_prepend() function ensure that errors set higher up the call stack
> > > > are preserved and only "load of migration failed" is prepended to it?
> > > > When the fail section is reached, local_err will have the complete error message.
> > > > 
> > > > > 
> > > > > 1) is trivial yet difficult to fix and I agree that it can be handled later.
> > > > > Ideally there should be a comment to note that.
> > > > > 
> > > > > However, there is also 2), which is more serious and easier to fix so I
> > > > > suggest fixing it now. More concretely, the code will look like as follows:
> > > > > 
> > > > >       migrate_set_error(s, local_err);
> > > > >       if (mis->exit_on_error) {
> > > > >           error_free(local_err);
> > > > >       } else {
> > > > >           /*
> > > > >            * Report the error here in case that QEMU abruptly exits when
> > > > >            * postcopy is enabled.
> > > > >            */
> > > > >           error_report_err(s->error);
> > > > >       }
> > > > > 
> > > > >       migration_incoming_state_destroy();
> > > > > 
> > > > >       if (mis->exit_on_error) {
> > > > >           WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
> > > > >               error_report_err(s->error);
> > > > > 
> > > > > This ensures errors set by anyone will be reported while duplicate error
> > > > > reports are avoided when exit-on-error is true.
> > > > 
> > > > yes, this part (set by anyone), I fail to follow. Do you mean that there can be
> > > > chances of a race condition between migrate_set_error() and error_report_err()?
> > > > 
> > > > My analysis of your proposed code shows that whether mis->exit_on_error is true (in the final if block)
> > > > or false (in the else block), error_report_err() is called.
> > > > This seems to result in the error being reported regardless, which is similar to my original patch.
> > > > Yes in both the cases, s->error is being reported. However in my view, in the fail section, both
> > > > local_err and s->error will be the same.
> > > > 
> > > > I appreciate your patience as I try to understand this better.
> > > 
> > > I indeed suspect the possibility of a race condition between
> > > migration_set_error() and error_report_err(). The code implies an error can
> > > be concurrently reported when migration_incoming_state_destroy() is called,
> > > and synchronized with WITH_QEMU_LOCK_GUARD(&s->error_mutex).
> > > 
> > > This code fragment is useless if there is no such concurrency at all and
> > > should look like the following:
> > > 
> > >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > >                        MIGRATION_STATUS_FAILED);
> > >      error_report_err(local_err);
> > > 
> > >      if (mis->exit_on_error) {
> > >          exit(EXIT_FAILURE);
> > >      }
> > > 
> > >      migrate_set_error(s, local_err);
> > > 
> > >      migration_incoming_state_destroy();
> > > 
> > > But it is not how the code is written, so I suspect there is a race
> > > condition.
> > 
> > Thank you. That is clear to me now.
> > We need to report s->error because it can be set somewhere else.
> > We are creating separate paths but only adding a placeholder comment
> > for now for the postcopy scenario. I suppose local_err can be freed outside the if clause.
> 
> The separate paths or the if clause for error_free() and error_report_err()
> is necessary because both functions free the error object and calling both
> results in double-free.

Doesn't error_copy() called inside migrate_set_error() create a deep copy of the error
object? Such that local_err and s->error point to separate locations in memory.

The following has more boilerplate code as it does not change the existing sequence
of instructions but it can create confusion while reading.
fail:
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    migrate_set_error(s, local_err);
    error_free(local_err);

    if (!mis->exit_on_error) {
        /*
         * Report the error here in case that QEMU abruptly exits when
         * postcopy is enabled.
         */
        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
            error_report_err(s->error);
            s->error = NULL;
        }
    }
    migration_incoming_state_destroy();
    if (mis->exit_on_error) {
        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
            error_report_err(s->error);
            s->error = NULL;
        }
        exit(EXIT_FAILURE);
    }

To avoid the confusion we can do the following,
fail:
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    migrate_set_error(s, local_err);
    error_free(local_err);

    migration_incoming_state_destroy();

    if (mis->exit_on_error) {
        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
            error_report_err(s->error);
            s->error = NULL;
        }
        exit(EXIT_FAILURE);
    } else {
        /*
         * Report the error here in case that QEMU abruptly exits when
         * postcopy is enabled.
         */
        WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
            error_report_err(s->error);
            s->error = NULL;
        }
    }

This way, we are freeing local_err, and reporting s->error, solving regression 2
We also have 2 code paths to deal with the postcopy scenario later (regression 1).
reporting the error and setting it to NULL requires a lock guard as it is a
shared object.

> 
> Regards,
> Akihiko Odaki
> 

Regards,
Arun Menon
Re: [PATCH v13 07/27] migration: push Error **errp into qemu_loadvm_state()
Posted by Akihiko Odaki 1 week, 3 days ago
On 2025/09/18 21:48, Arun Menon wrote:
> Hi Akihiko,
> 
> On Thu, Sep 18, 2025 at 09:08:05PM +0900, Akihiko Odaki wrote:
>> On 2025/09/18 15:28, Arun Menon wrote:
>>> Hi Akihiko,
>>>
>>> On Thu, Sep 18, 2025 at 11:06:58AM +0900, Akihiko Odaki wrote:
>>>> On 2025/09/17 23:27, Arun Menon wrote:
>>>>> Hi Akihiko,
>>>>>
>>>>> On Wed, Sep 17, 2025 at 04:54:11PM +0900, Akihiko Odaki wrote:
>>>>>> On 2025/09/17 9:34, Arun Menon wrote:
>>>>>>> Hi Akihiko,
>>>>>>>
>>>>>>> On Sat, Sep 06, 2025 at 05:22:31AM +0200, Akihiko Odaki wrote:
>>>>>>>> On 2025/09/03 8:47, Arun Menon wrote:
>>>>>>>>> Hi Akihiko,
>>>>>>>>>
>>>>>>>>> It took some time to set up the machines; apologies for the delay in response.
>>>>>>>>>
>>>>>>>>> On Mon, Sep 01, 2025 at 02:12:54AM +0900, Akihiko Odaki wrote:
>>>>>>>>>> On 2025/09/01 1:38, Arun Menon wrote:
>>>>>>>>>>> Hi,
>>>>>>>>>>>
>>>>>>>>>>> On Mon, Sep 01, 2025 at 01:04:40AM +0900, Akihiko Odaki wrote:
>>>>>>>>>>>> On 2025/09/01 0:45, Arun Menon wrote:
>>>>>>>>>>>>> Hi Akihiko,
>>>>>>>>>>>>> Thanks for the review.
>>>>>>>>>>>>>
>>>>>>>>>>>>> On Sat, Aug 30, 2025 at 02:58:05PM +0900, Akihiko Odaki wrote:
>>>>>>>>>>>>>> On 2025/08/30 5:01, Arun Menon wrote:
>>>>>>>>>>>>>>> This is an incremental step in converting vmstate loading
>>>>>>>>>>>>>>> code to report error via Error objects instead of directly
>>>>>>>>>>>>>>> printing it to console/monitor.
>>>>>>>>>>>>>>> It is ensured that qemu_loadvm_state() must report an error
>>>>>>>>>>>>>>> in errp, in case of failure.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> When postcopy live migration runs, the device states are loaded by
>>>>>>>>>>>>>>> both the qemu coroutine process_incoming_migration_co() and the
>>>>>>>>>>>>>>> postcopy_ram_listen_thread(). Therefore, it is important that the
>>>>>>>>>>>>>>> coroutine also reports the error in case of failure, with
>>>>>>>>>>>>>>> error_report_err(). Otherwise, the source qemu will not display
>>>>>>>>>>>>>>> any errors before going into the postcopy pause state.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
>>>>>>>>>>>>>>> Reviewed-by: Fabiano Rosas <farosas@suse.de>
>>>>>>>>>>>>>>> Signed-off-by: Arun Menon <armenon@redhat.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>          migration/migration.c |  9 +++++----
>>>>>>>>>>>>>>>          migration/savevm.c    | 30 ++++++++++++++++++------------
>>>>>>>>>>>>>>>          migration/savevm.h    |  2 +-
>>>>>>>>>>>>>>>          3 files changed, 24 insertions(+), 17 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>>>>>>>>>>>>> index 10c216d25dec01f206eacad2edd24d21f00e614c..c6768d88f45c870c7fad9b9957300766ff69effc 100644
>>>>>>>>>>>>>>> --- a/migration/migration.c
>>>>>>>>>>>>>>> +++ b/migration/migration.c
>>>>>>>>>>>>>>> @@ -881,7 +881,7 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>>>>>>>                                MIGRATION_STATUS_ACTIVE);
>>>>>>>>>>>>>>>              mis->loadvm_co = qemu_coroutine_self();
>>>>>>>>>>>>>>> -    ret = qemu_loadvm_state(mis->from_src_file);
>>>>>>>>>>>>>>> +    ret = qemu_loadvm_state(mis->from_src_file, &local_err);
>>>>>>>>>>>>>>>              mis->loadvm_co = NULL;
>>>>>>>>>>>>>>>              trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
>>>>>>>>>>>>>>> @@ -908,7 +908,8 @@ process_incoming_migration_co(void *opaque)
>>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>>              if (ret < 0) {
>>>>>>>>>>>>>>> -        error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
>>>>>>>>>>>>>>> +        error_prepend(&local_err, "load of migration failed: %s: ",
>>>>>>>>>>>>>>> +                      strerror(-ret));
>>>>>>>>>>>>>>>                  goto fail;
>>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>> @@ -924,13 +925,13 @@ fail:
>>>>>>>>>>>>>>>              migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>>>>>>>>>>>>                                MIGRATION_STATUS_FAILED);
>>>>>>>>>>>>>>>              migrate_set_error(s, local_err);
>>>>>>>>>>>>>>> -    error_free(local_err);
>>>>>>>>>>>>>>> +    error_report_err(local_err);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This is problematic because it results in duplicate error reports when
>>>>>>>>>>>>>> !mis->exit_on_error; in that case the query-migrate QMP command reports the
>>>>>>>>>>>>>> error and this error reporting is redundant.
>>>>>>>>>>>>>
>>>>>>>>>>>>> If I comment this change, then all of the errors propagated up to now, using
>>>>>>>>>>>>> error_setg() will not be reported. This is the place where it is finally reported,
>>>>>>>>>>>>> when qemu_loadvm_state() fails. In other words, all the error_reports() we removed
>>>>>>>>>>>>> from all the files, replacing them with error_setg(), will finally be reported here
>>>>>>>>>>>>> using error_report_err().
>>>>>>>>>>>>
>>>>>>>>>>>> My understanding of the code without these two changes is:
>>>>>>>>>>>> - If the migrate-incoming QMP command is used with false as
>>>>>>>>>>>>         exit-on-error, this function will not report the error but
>>>>>>>>>>>>         the query-migrate QMP command will report the error.
>>>>>>>>>>>> - Otherwise, this function reports the error.
>>>>>>>>>>>
>>>>>>>>>>> With my limited experience in testing, I have a question,
>>>>>>>>>>> So there are 2 scenarios,
>>>>>>>>>>> 1. running the virsh migrate command on the source host. Something like the following,
>>>>>>>>>>>         virsh -c 'qemu:///system' migrate --live --verbose --domain guest-vm --desturi qemu+ssh://10.6.120.20/system
>>>>>>>>>>>         OR for postcopy-ram,
>>>>>>>>>>>         virsh migrate guest-vm --live qemu+ssh://10.6.120.20/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>>>>>>
>>>>>>>>>>> 2. Using QMP commands, performing a migration from source to destination.
>>>>>>>>>>>         Running something like the following on the destination:
>>>>>>>>>>>         {
>>>>>>>>>>>           "execute": "migrate-incoming",
>>>>>>>>>>>           "arguments": {
>>>>>>>>>>>             "uri": "tcp:127.0.0.1:7777",
>>>>>>>>>>>             "exit-on-error": false
>>>>>>>>>>>           }
>>>>>>>>>>>         }
>>>>>>>>>>>         {
>>>>>>>>>>>           "execute": "migrate-incoming",
>>>>>>>>>>>           "arguments": {
>>>>>>>>>>>             "uri": "tcp:127.0.0.1:7777",
>>>>>>>>>>>             "exit-on-error": false
>>>>>>>>>>>           }
>>>>>>>>>>>         }
>>>>>>>>>>>         and the somthing like the following on source:
>>>>>>>>>>>         {
>>>>>>>>>>>           "execute": "migrate",
>>>>>>>>>>>           "arguments": {
>>>>>>>>>>>             "uri": "tcp:127.0.0.1:7777"
>>>>>>>>>>>           }
>>>>>>>>>>>         }
>>>>>>>>>>>         {"execute" : "query-migrate"}
>>>>>>>>>>>
>>>>>>>>>>> In 1, previously, the user used to get an error message on migration failure.
>>>>>>>>>>> This was because there were error_report() calls in all of the files.
>>>>>>>>>>> Now that they are replaced with error_setg() and the error is stored in errp,
>>>>>>>>>>> we need to display that using error_report_err(). Hence I introduced an error_report_err()
>>>>>>>>>>> call in the fail section.
>>>>>>>>>>>
>>>>>>>>>>> In 2, we have 2 QMP sessions, one for the source and another for the destination.
>>>>>>>>>>> The QMP command migrate will be issued on the source, and the errp will be set.
>>>>>>>>>>> I did not understand the part where the message will be displayed because of the
>>>>>>>>>>> error_report_err() call. I did not see such a message on failure scenario on both
>>>>>>>>>>> the sessions.
>>>>>>>>>>> If the user wants to check for errors, then the destination qemu will not exit
>>>>>>>>>>> (exit-on-error = false ) and we can retrieve it using {"execute" : "query-migrate"}
>>>>>>>>>>>
>>>>>>>>>>> Aren't the 2 scenarios different by nature?
>>>>>>>>>>
>>>>>>>>>> In 1, doesn't libvirt query the error with query-migrate and print it?
>>>>>>>>>
>>>>>>>>> Ideally it should find the the error, and print the whole thing. It does work
>>>>>>>>> in the normal scenario. However, the postcopy scenario does not show the same result,
>>>>>>>>> which is mentioned in the commit message.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> In any case, it would be nice if you describe how libvirt interacts with
>>>>>>>>>> QEMU in 1.
>>>>>>>>>
>>>>>>>>> Please find below the difference in the command output at source, when we run a live migration
>>>>>>>>> with postcopy enabled.
>>>>>>>>>
>>>>>>>>> =========
>>>>>>>>> With the current changes:
>>>>>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>>>>>>>
>>>>>>>>> [root@dell-per750-42 build]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>>>> root@10.6.120.9's password:
>>>>>>>>> Migration: [ 1.26 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:19:15.076547Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>>>> 2025-09-03T06:19:15.076586Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>>>> 2025-09-03T06:19:27.776715Z qemu-system-x86_64: load of migration failed: Input/output error: error while loading state for instance 0x0 of device 'tpm-emulator': post load hook failed for: tpm-emulator, version_id: 0, minimum_version: 0, ret: -5: tpm-emulator: Setting the stateblob (type 1) failed with a TPM error 0x21 decryption error
>>>>>>>>>
>>>>>>>>> [root@dell-per750-42 build]#
>>>>>>>>>
>>>>>>>>> =========
>>>>>>>>>
>>>>>>>>> Without the current changes:
>>>>>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate-setspeed guest-vm 1
>>>>>>>>>
>>>>>>>>> [root@dell-per750-42 qemu-priv]# virsh migrate guest-vm --live qemu+ssh://10.6.120.9/system --verbose --postcopy --timeout 10 --timeout-postcopy
>>>>>>>>> root@10.6.120.9's password:
>>>>>>>>> Migration: [ 1.28 %]error: internal error: QEMU unexpectedly closed the monitor (vm='guest-vm'): 2025-09-03T06:26:17.733786Z qemu-system-x86_64: -accel kvm: warning: Number of SMP cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>>>> 2025-09-03T06:26:17.733830Z qemu-system-x86_64: -accel kvm: warning: Number of hotpluggable cpus requested (2) exceeds the recommended cpus supported by KVM (1)
>>>>>>>>>
>>>>>>>>> [root@dell-per750-42 qemu-priv]#
>>>>>>>>>
>>>>>>>>> =========
>>>>>>>>> The original behavior was to print the error to the console regardless of whether the migration is normal or postcopy.
>>>>>>>>
>>>>>>>> This was true for messages in qemu_loadvm_state(), but the message "load of
>>>>>>>> migration failed" was printed or queried with query-migrate, not both. We
>>>>>>>> should think of which behavior is more appropriate, and I think we should
>>>>>>>> avoid duplicate reports.
>>>>>>>>
>>>>>>>>> The source machine goes in to a paused state after this.
>>>>>>>>
>>>>>>>> The output is informative. It implies the destination machine exited, and it
>>>>>>>> makes sense to print error messages as it is done for
>>>>>>>> mis->exit_on_error. I wonder if it is possible to detect the condition and
>>>>>>>> treat it identically to mis->exit_on_error.
>>>>>>>
>>>>>>> I see that we want to catch a specific scenario in postcopy ram migration
>>>>>>> where the destination abruptly exits without a graceful shutdown,
>>>>>>> thus failing to inform the source the reason for its failure through a
>>>>>>> 'query-migrate' even though 'exit-on-error' was set to false on the destination.
>>>>>>>
>>>>>>> However, I am not sure how to reliably detect the specific error condition of
>>>>>>> such a connection close that you have described. Given that this is a large
>>>>>>> patch series already, could we keep the current change as is for now?
>>>>>>>     From what I can tell, the additional log message "load of migration failed"
>>>>>>> is not a breaking change and will not cause a crash. We can develop a more
>>>>>>> elegant solution to handle the issue of duplication in a separate patch.
>>>>>> There are two regressions:
>>>>>> 1) Duplicate error reports when exit-on-error is false and postcopy is
>>>>>> disabled.
>>>>>
>>>>> Thank you for your detailed response.
>>>>> I am trying to fully understand the logic here, so please correct me if I'm wrong.
>>>>> My understanding of the patch is that all errors are chained together
>>>>> in local_err by propagating them through the call stack and prepending them with
>>>>> "load of migration failed."
>>>>> This creates a single, comprehensive error message, which is then passed to
>>>>>      migrate_set_error(s, local_err) in the fail section.
>>>>> This line already existed. So in effect we are setting the error here.
>>>>
>>>> There are two other changes:
>>>> - The message is printed immediately with error_report_err(). This causes
>>>> 1).
>>>> - The error set with migrate_set_error() is ignored. This causes 2).
>>>>
>>>>>
>>>>> Regarding the second regression you mentioned:
>>>>>
>>>>>> 2) Errors reported code else qemu_loadvm_state() are ignored when
>>>>>> exit-on-error is true.
>>>>>
>>>>> Doesn't the error_prepend() function ensure that errors set higher up the call stack
>>>>> are preserved and only "load of migration failed" is prepended to it?
>>>>> When the fail section is reached, local_err will have the complete error message.
>>>>>
>>>>>>
>>>>>> 1) is trivial yet difficult to fix and I agree that it can be handled later.
>>>>>> Ideally there should be a comment to note that.
>>>>>>
>>>>>> However, there is also 2), which is more serious and easier to fix so I
>>>>>> suggest fixing it now. More concretely, the code will look like as follows:
>>>>>>
>>>>>>        migrate_set_error(s, local_err);
>>>>>>        if (mis->exit_on_error) {
>>>>>>            error_free(local_err);
>>>>>>        } else {
>>>>>>            /*
>>>>>>             * Report the error here in case that QEMU abruptly exits when
>>>>>>             * postcopy is enabled.
>>>>>>             */
>>>>>>            error_report_err(s->error);
>>>>>>        }
>>>>>>
>>>>>>        migration_incoming_state_destroy();
>>>>>>
>>>>>>        if (mis->exit_on_error) {
>>>>>>            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>>>>>>                error_report_err(s->error);
>>>>>>
>>>>>> This ensures errors set by anyone will be reported while duplicate error
>>>>>> reports are avoided when exit-on-error is true.
>>>>>
>>>>> yes, this part (set by anyone), I fail to follow. Do you mean that there can be
>>>>> chances of a race condition between migrate_set_error() and error_report_err()?
>>>>>
>>>>> My analysis of your proposed code shows that whether mis->exit_on_error is true (in the final if block)
>>>>> or false (in the else block), error_report_err() is called.
>>>>> This seems to result in the error being reported regardless, which is similar to my original patch.
>>>>> Yes in both the cases, s->error is being reported. However in my view, in the fail section, both
>>>>> local_err and s->error will be the same.
>>>>>
>>>>> I appreciate your patience as I try to understand this better.
>>>>
>>>> I indeed suspect the possibility of a race condition between
>>>> migration_set_error() and error_report_err(). The code implies an error can
>>>> be concurrently reported when migration_incoming_state_destroy() is called,
>>>> and synchronized with WITH_QEMU_LOCK_GUARD(&s->error_mutex).
>>>>
>>>> This code fragment is useless if there is no such concurrency at all and
>>>> should look like the following:
>>>>
>>>>       migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>>>>                         MIGRATION_STATUS_FAILED);
>>>>       error_report_err(local_err);
>>>>
>>>>       if (mis->exit_on_error) {
>>>>           exit(EXIT_FAILURE);
>>>>       }
>>>>
>>>>       migrate_set_error(s, local_err);
>>>>
>>>>       migration_incoming_state_destroy();
>>>>
>>>> But it is not how the code is written, so I suspect there is a race
>>>> condition.
>>>
>>> Thank you. That is clear to me now.
>>> We need to report s->error because it can be set somewhere else.
>>> We are creating separate paths but only adding a placeholder comment
>>> for now for the postcopy scenario. I suppose local_err can be freed outside the if clause.
>>
>> The separate paths or the if clause for error_free() and error_report_err()
>> is necessary because both functions free the error object and calling both
>> results in double-free.
> 
> Doesn't error_copy() called inside migrate_set_error() create a deep copy of the error
> object? Such that local_err and s->error point to separate locations in memory.
> 
> The following has more boilerplate code as it does not change the existing sequence
> of instructions but it can create confusion while reading.
> fail:
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                        MIGRATION_STATUS_FAILED);
>      migrate_set_error(s, local_err);
>      error_free(local_err);
> 
>      if (!mis->exit_on_error) {
>          /*
>           * Report the error here in case that QEMU abruptly exits when
>           * postcopy is enabled.
>           */
>          WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>              error_report_err(s->error);
>              s->error = NULL;
>          }
>      }
>      migration_incoming_state_destroy();
>      if (mis->exit_on_error) {
>          WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>              error_report_err(s->error);
>              s->error = NULL;
>          }
>          exit(EXIT_FAILURE);
>      }
> 
> To avoid the confusion we can do the following,
> fail:
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                        MIGRATION_STATUS_FAILED);
>      migrate_set_error(s, local_err);
>      error_free(local_err);
> 
>      migration_incoming_state_destroy();
> 
>      if (mis->exit_on_error) {
>          WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>              error_report_err(s->error);
>              s->error = NULL;
>          }
>          exit(EXIT_FAILURE);
>      } else {
>          /*
>           * Report the error here in case that QEMU abruptly exits when
>           * postcopy is enabled.
>           */
>          WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
>              error_report_err(s->error);
>              s->error = NULL;
>          }
>      }
> 
> This way, we are freeing local_err, and reporting s->error, solving regression 2
> We also have 2 code paths to deal with the postcopy scenario later (regression 1).
> reporting the error and setting it to NULL requires a lock guard as it is a
> shared object.

I forgot the possibility that s->error may be set when postcopy is 
enabled. The code looks good to me.

Regards,
Akihiko Odaki