[PATCH V4 6/8] migration: cpr-exec mode

Steve Sistare posted 8 patches 6 days, 1 hour ago
Maintainers: Steve Sistare <steven.sistare@oracle.com>, Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>, "Dr. David Alan Gilbert" <dave@treblig.org>, Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, Eric Blake <eblake@redhat.com>, Markus Armbruster <armbru@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>, Stefan Weil <sw@weilnetz.de>
[PATCH V4 6/8] migration: cpr-exec mode
Posted by Steve Sistare 6 days, 1 hour ago
Add the cpr-exec migration mode.  Usage:
  qemu-system-$arch -machine aux-ram-share=on ...
  migrate_set_parameter mode cpr-exec
  migrate_set_parameter cpr-exec-command \
    <arg1> <arg2> ... -incoming <uri-1>
  migrate -d <uri-1>

The migrate command stops the VM, saves state to uri-1,
directly exec's a new version of QEMU on the same host,
replacing the original process while retaining its PID, and
loads state from uri-1.  Guest RAM is preserved in place,
albeit with new virtual addresses.

The new QEMU process is started by exec'ing the command
specified by the @cpr-exec-command parameter.  The first word of
the command is the binary, and the remaining words are its
arguments.  The command may be a direct invocation of new QEMU,
or may be a non-QEMU command that exec's the new QEMU binary.

This mode creates a second migration channel that is not visible
to the user.  At the start of migration, old QEMU saves CPR state
to the second channel, and at the end of migration, it tells the
main loop to call cpr_exec.  New QEMU loads CPR state early, before
objects are created.

Because old QEMU terminates when new QEMU starts, one cannot
stream data between the two, so uri-1 must be a type,
such as a file, that accepts all data before old QEMU exits.
Otherwise, old QEMU may quietly block writing to the channel.

Memory-backend objects must have the share=on attribute, but
memory-backend-epc is not supported.  The VM must be started with
the '-machine aux-ram-share=on' option, which allows anonymous
memory to be transferred in place to the new process.  The memfds
are kept open across exec by clearing the close-on-exec flag, their
values are saved in CPR state, and they are mmap'd in new QEMU.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
---
 qapi/migration.json       | 25 +++++++++++++-
 include/migration/cpr.h   |  1 +
 migration/cpr-exec.c      | 84 +++++++++++++++++++++++++++++++++++++++++++++++
 migration/cpr.c           | 28 ++++++++++++++--
 migration/migration.c     | 10 +++++-
 migration/ram.c           |  1 +
 migration/vmstate-types.c |  8 +++++
 system/vl.c               |  4 ++-
 migration/trace-events    |  1 +
 9 files changed, 157 insertions(+), 5 deletions(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 2be8fa1..be0f3fc 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -694,9 +694,32 @@
 #     until you issue the `migrate-incoming` command.
 #
 #     (since 10.0)
+#
+# @cpr-exec: The migrate command stops the VM, saves state to the
+#     migration channel, directly exec's a new version of QEMU on the
+#     same host, replacing the original process while retaining its
+#     PID, and loads state from the channel.  Guest RAM is preserved
+#     in place.  Devices and their pinned pages are also preserved for
+#     VFIO and IOMMUFD.
+#
+#     Old QEMU starts new QEMU by exec'ing the command specified by
+#     the @cpr-exec-command parameter.  The command may be a direct
+#     invocation of new QEMU, or may be a wrapper that exec's the new
+#     QEMU binary.
+#
+#     Because old QEMU terminates when new QEMU starts, one cannot
+#     stream data between the two, so the channel must be a type,
+#     such as a file, that accepts all data before old QEMU exits.
+#     Otherwise, old QEMU may quietly block writing to the channel.
+#
+#     Memory-backend objects must have the share=on attribute, but
+#     memory-backend-epc is not supported.  The VM must be started
+#     with the '-machine aux-ram-share=on' option.
+#
+#     (since 10.2)
 ##
 { 'enum': 'MigMode',
-  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
+  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer', 'cpr-exec' ] }
 
 ##
 # @ZeroPageDetection:
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index b84389f..beed392 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -53,6 +53,7 @@ int cpr_get_fd_param(const char *name, const char *fdname, int index,
 QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
 QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
 
+void cpr_exec_init(void);
 QEMUFile *cpr_exec_output(Error **errp);
 QEMUFile *cpr_exec_input(Error **errp);
 void cpr_exec_persist_state(QEMUFile *f);
diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c
index 2c32e9c..8cf55a3 100644
--- a/migration/cpr-exec.c
+++ b/migration/cpr-exec.c
@@ -6,15 +6,21 @@
 
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
+#include "qemu/error-report.h"
 #include "qemu/memfd.h"
 #include "qapi/error.h"
+#include "qapi/type-helpers.h"
 #include "io/channel-file.h"
 #include "io/channel-socket.h"
+#include "block/block-global-state.h"
+#include "qemu/main-loop.h"
 #include "migration/cpr.h"
 #include "migration/qemu-file.h"
+#include "migration/migration.h"
 #include "migration/misc.h"
 #include "migration/vmstate.h"
 #include "system/runstate.h"
+#include "trace.h"
 
 #define CPR_EXEC_STATE_NAME "QEMU_CPR_EXEC_STATE"
 
@@ -92,3 +98,81 @@ QEMUFile *cpr_exec_input(Error **errp)
     lseek(mfd, 0, SEEK_SET);
     return qemu_file_new_fd_input(mfd, CPR_EXEC_STATE_NAME);
 }
+
+static bool preserve_fd(int fd)
+{
+    qemu_clear_cloexec(fd);
+    return true;
+}
+
+static bool unpreserve_fd(int fd)
+{
+    qemu_set_cloexec(fd);
+    return true;
+}
+
+static void cpr_exec_cb(void *opaque)
+{
+    MigrationState *s = migrate_get_current();
+    char **argv = strv_from_str_list(s->parameters.cpr_exec_command);
+    Error *err = NULL;
+
+    /*
+     * Clear the close-on-exec flag for all preserved fd's.  We cannot do so
+     * earlier because they should not persist across miscellaneous fork and
+     * exec calls that are performed during normal operation.
+     */
+    cpr_walk_fd(preserve_fd);
+
+    trace_cpr_exec();
+    execvp(argv[0], argv);
+
+    /*
+     * exec failed: argv[0] is bogus or unpermitted, or resources are short.
+     * Build err first, while errno is intact and argv[0] is still allocated.
+     */
+    error_setg_errno(&err, errno, "execvp %s failed", argv[0]);
+    g_strfreev(argv);
+    cpr_walk_fd(unpreserve_fd);
+
+    error_report_err(error_copy(err));
+    migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
+    migrate_set_error(s, err);
+
+    migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
+
+    err = NULL;
+    if (!migration_block_activate(&err)) {
+        /* error was already reported */
+        return;
+    }
+
+    if (runstate_is_live(s->vm_old_state)) {
+        vm_start();
+    }
+}
+
+static int cpr_exec_notifier(NotifierWithReturn *notifier, MigrationEvent *e,
+                             Error **errp)
+{
+    MigrationState *s = migrate_get_current();
+
+    if (e->type == MIG_EVENT_PRECOPY_DONE) {
+        QEMUBH *cpr_exec_bh = qemu_bh_new(cpr_exec_cb, NULL);
+        assert(s->state == MIGRATION_STATUS_COMPLETED);
+        qemu_bh_schedule(cpr_exec_bh);
+        qemu_notify_event();
+
+    } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
+        cpr_exec_unpersist_state();
+    }
+    return 0;
+}
+
+void cpr_exec_init(void)
+{
+    static NotifierWithReturn exec_notifier;
+
+    migration_add_notifier_mode(&exec_notifier, cpr_exec_notifier,
+                                MIG_MODE_CPR_EXEC);
+}
diff --git a/migration/cpr.c b/migration/cpr.c
index d3e370e..eea3773 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -185,6 +185,8 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
     if (mode == MIG_MODE_CPR_TRANSFER) {
         g_assert(channel);
         f = cpr_transfer_output(channel, errp);
+    } else if (mode == MIG_MODE_CPR_EXEC) {
+        f = cpr_exec_output(errp);
     } else {
         return 0;
     }
@@ -202,6 +204,10 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
         return ret;
     }
 
+    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
+        cpr_exec_persist_state(f);
+    }
+
     /*
      * Close the socket only partially so we can later detect when the other
      * end closes by getting a HUP event.
@@ -213,6 +219,12 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
     return 0;
 }
 
+static bool unpreserve_fd(int fd)
+{
+    qemu_set_cloexec(fd);
+    return true;
+}
+
 int cpr_state_load(MigrationChannel *channel, Error **errp)
 {
     int ret;
@@ -220,7 +232,13 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
     QEMUFile *f;
     MigMode mode = 0;
 
-    if (channel) {
+    if (cpr_exec_has_state()) {
+        mode = MIG_MODE_CPR_EXEC;
+        f = cpr_exec_input(errp);
+        if (channel) {
+            warn_report("ignoring cpr channel for migration mode cpr-exec");
+        }
+    } else if (channel) {
         mode = MIG_MODE_CPR_TRANSFER;
         cpr_set_incoming_mode(mode);
         f = cpr_transfer_input(channel, errp);
@@ -232,6 +250,7 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
     }
 
     trace_cpr_state_load(MigMode_str(mode));
+    cpr_set_incoming_mode(mode);
 
     v = qemu_get_be32(f);
     if (v != QEMU_CPR_FILE_MAGIC) {
@@ -253,6 +272,11 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
         return ret;
     }
 
+    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
+        /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */
+        cpr_walk_fd(unpreserve_fd);
+    }
+
     /*
      * Let the caller decide when to close the socket (and generate a HUP event
      * for the sending side).
@@ -273,7 +297,7 @@ void cpr_state_close(void)
 bool cpr_incoming_needed(void *opaque)
 {
     MigMode mode = migrate_mode();
-    return mode == MIG_MODE_CPR_TRANSFER;
+    return mode == MIG_MODE_CPR_TRANSFER || mode == MIG_MODE_CPR_EXEC;
 }
 
 /*
diff --git a/migration/migration.c b/migration/migration.c
index 08a98f7..2515bec 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -333,6 +333,7 @@ void migration_object_init(void)
 
     ram_mig_init();
     dirty_bitmap_mig_init();
+    cpr_exec_init();
 
     /* Initialize cpu throttle timers */
     cpu_throttle_init();
@@ -1796,7 +1797,8 @@ bool migrate_mode_is_cpr(MigrationState *s)
 {
     MigMode mode = s->parameters.mode;
     return mode == MIG_MODE_CPR_REBOOT ||
-           mode == MIG_MODE_CPR_TRANSFER;
+           mode == MIG_MODE_CPR_TRANSFER ||
+           mode == MIG_MODE_CPR_EXEC;
 }
 
 int migrate_init(MigrationState *s, Error **errp)
@@ -2145,6 +2147,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
         return false;
     }
 
+    if (migrate_mode() == MIG_MODE_CPR_EXEC &&
+        !s->parameters.has_cpr_exec_command) {
+        error_setg(errp, "cpr-exec mode requires setting cpr-exec-command");
+        return false;
+    }
+
     if (migration_is_blocked(errp)) {
         return false;
     }
diff --git a/migration/ram.c b/migration/ram.c
index 7208bc1..6730a41 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -228,6 +228,7 @@ bool migrate_ram_is_ignored(RAMBlock *block)
     MigMode mode = migrate_mode();
     return !qemu_ram_is_migratable(block) ||
            mode == MIG_MODE_CPR_TRANSFER ||
+           mode == MIG_MODE_CPR_EXEC ||
            (migrate_ignore_shared() && qemu_ram_is_shared(block)
                                     && qemu_ram_is_named_file(block));
 }
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index 741a588..1aa0573 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -321,6 +321,10 @@ static int get_fd(QEMUFile *f, void *pv, size_t size,
                   const VMStateField *field)
 {
     int32_t *v = pv;
+    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
+        qemu_get_sbe32s(f, v);
+        return 0;
+    }
     *v = qemu_file_get_fd(f);
     return 0;
 }
@@ -329,6 +333,10 @@ static int put_fd(QEMUFile *f, void *pv, size_t size,
                   const VMStateField *field, JSONWriter *vmdesc)
 {
     int32_t *v = pv;
+    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
+        qemu_put_sbe32s(f, v);
+        return 0;
+    }
     return qemu_file_put_fd(f, *v);
 }
 
diff --git a/system/vl.c b/system/vl.c
index 4c24073..f395d04 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -3867,6 +3867,8 @@ void qemu_init(int argc, char **argv)
     }
     qemu_init_displays();
     accel_setup_post(current_machine);
-    os_setup_post();
+    if (migrate_mode() != MIG_MODE_CPR_EXEC) {
+        os_setup_post();
+    }
     resume_mux_open();
 }
diff --git a/migration/trace-events b/migration/trace-events
index 706db97..e8edd1f 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -354,6 +354,7 @@ cpr_state_save(const char *mode) "%s mode"
 cpr_state_load(const char *mode) "%s mode"
 cpr_transfer_input(const char *path) "%s"
 cpr_transfer_output(const char *path) "%s"
+cpr_exec(void) ""
 
 # block-dirty-bitmap.c
 send_bitmap_header_enter(void) ""
-- 
1.8.3.1
Re: [PATCH V4 6/8] migration: cpr-exec mode
Posted by Cédric Le Goater 6 days ago
On 9/22/25 15:49, Steve Sistare wrote:
> Add the cpr-exec migration mode.  Usage:
>    qemu-system-$arch -machine aux-ram-share=on ...
>    migrate_set_parameter mode cpr-exec
>    migrate_set_parameter cpr-exec-command \
>      <arg1> <arg2> ... -incoming <uri-1> \
>    migrate -d <uri-1>
> 
> The migrate command stops the VM, saves state to uri-1,
> directly exec's a new version of QEMU on the same host,
> replacing the original process while retaining its PID, and
> loads state from uri-1.  Guest RAM is preserved in place,
> albeit with new virtual addresses.
> 
> The new QEMU process is started by exec'ing the command
> specified by the @cpr-exec-command parameter.  The first word of
> the command is the binary, and the remaining words are its
> arguments.  The command may be a direct invocation of new QEMU,
> or may be a non-QEMU command that exec's the new QEMU binary.
> 
> This mode creates a second migration channel that is not visible
> to the user.  At the start of migration, old QEMU saves CPR state
> to the second channel, and at the end of migration, it tells the
> main loop to call cpr_exec.  New QEMU loads CPR state early, before
> objects are created.
> 
> Because old QEMU terminates when new QEMU starts, one cannot
> stream data between the two, so uri-1 must be a type,
> such as a file, that accepts all data before old QEMU exits.
> Otherwise, old QEMU may quietly block writing to the channel.
> 
> Memory-backend objects must have the share=on attribute, but
> memory-backend-epc is not supported.  The VM must be started with
> the '-machine aux-ram-share=on' option, which allows anonymous
> memory to be transferred in place to the new process.  The memfds
> are kept open across exec by clearing the close-on-exec flag, their
> values are saved in CPR state, and they are mmap'd in new QEMU.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> Acked-by: Markus Armbruster <armbru@redhat.com>
> ---
>   qapi/migration.json       | 25 +++++++++++++-
>   include/migration/cpr.h   |  1 +
>   migration/cpr-exec.c      | 84 +++++++++++++++++++++++++++++++++++++++++++++++
>   migration/cpr.c           | 28 ++++++++++++++--
>   migration/migration.c     | 10 +++++-
>   migration/ram.c           |  1 +
>   migration/vmstate-types.c |  8 +++++
>   system/vl.c               |  4 ++-
>   migration/trace-events    |  1 +
>   9 files changed, 157 insertions(+), 5 deletions(-)
> 
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 2be8fa1..be0f3fc 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -694,9 +694,32 @@
>   #     until you issue the `migrate-incoming` command.
>   #
>   #     (since 10.0)
> +#
> +# @cpr-exec: The migrate command stops the VM, saves state to the
> +#     migration channel, directly exec's a new version of QEMU on the
> +#     same host, replacing the original process while retaining its
> +#     PID, and loads state from the channel.  Guest RAM is preserved
> +#     in place.  Devices and their pinned pages are also preserved for
> +#     VFIO and IOMMUFD.
> +#
> +#     Old QEMU starts new QEMU by exec'ing the command specified by
> +#     the @cpr-exec-command parameter.  The command may be a direct
> +#     invocation of new QEMU, or may be a wrapper that exec's the new
> +#     QEMU binary.
> +#
> +#     Because old QEMU terminates when new QEMU starts, one cannot
> +#     stream data between the two, so the channel must be a type,
> +#     such as a file, that accepts all data before old QEMU exits.
> +#     Otherwise, old QEMU may quietly block writing to the channel.
> +#
> +#     Memory-backend objects must have the share=on attribute, but
> +#     memory-backend-epc is not supported.  The VM must be started
> +#     with the '-machine aux-ram-share=on' option.
> +#
> +#     (since 10.2)
>   ##
>   { 'enum': 'MigMode',
> -  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
> +  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer', 'cpr-exec' ] }
>   
>   ##
>   # @ZeroPageDetection:
> diff --git a/include/migration/cpr.h b/include/migration/cpr.h
> index b84389f..beed392 100644
> --- a/include/migration/cpr.h
> +++ b/include/migration/cpr.h
> @@ -53,6 +53,7 @@ int cpr_get_fd_param(const char *name, const char *fdname, int index,
>   QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
>   QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
>   
> +void cpr_exec_init(void);
>   QEMUFile *cpr_exec_output(Error **errp);
>   QEMUFile *cpr_exec_input(Error **errp);
>   void cpr_exec_persist_state(QEMUFile *f);
> diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c
> index 2c32e9c..8cf55a3 100644
> --- a/migration/cpr-exec.c
> +++ b/migration/cpr-exec.c
> @@ -6,15 +6,21 @@
>   
>   #include "qemu/osdep.h"
>   #include "qemu/cutils.h"
> +#include "qemu/error-report.h"
>   #include "qemu/memfd.h"
>   #include "qapi/error.h"
> +#include "qapi/type-helpers.h"
>   #include "io/channel-file.h"
>   #include "io/channel-socket.h"
> +#include "block/block-global-state.h"
> +#include "qemu/main-loop.h"
>   #include "migration/cpr.h"
>   #include "migration/qemu-file.h"
> +#include "migration/migration.h"
>   #include "migration/misc.h"
>   #include "migration/vmstate.h"
>   #include "system/runstate.h"
> +#include "trace.h"
>   
>   #define CPR_EXEC_STATE_NAME "QEMU_CPR_EXEC_STATE"
>   
> @@ -92,3 +98,81 @@ QEMUFile *cpr_exec_input(Error **errp)
>       lseek(mfd, 0, SEEK_SET);
>       return qemu_file_new_fd_input(mfd, CPR_EXEC_STATE_NAME);
>   }
> +
> +static bool preserve_fd(int fd)
> +{
> +    qemu_clear_cloexec(fd);
> +    return true;
> +}
> +
> +static bool unpreserve_fd(int fd)
> +{
> +    qemu_set_cloexec(fd);
> +    return true;
> +}
> +
> +static void cpr_exec_cb(void *opaque)
> +{
> +    MigrationState *s = migrate_get_current();
> +    char **argv = strv_from_str_list(s->parameters.cpr_exec_command);
> +    Error *err = NULL;
> +
> +    /*
> +     * Clear the close-on-exec flag for all preserved fd's.  We cannot do so
> +     * earlier because they should not persist across miscellaneous fork and
> +     * exec calls that are performed during normal operation.
> +     */
> +    cpr_walk_fd(preserve_fd);
> +
> +    trace_cpr_exec();
> +    execvp(argv[0], argv);
> +
> +    /*
> +     * exec should only fail if argv[0] is bogus, or has a permissions problem,
> +     * or the system is very short on resources.
> +     */
> +    g_strfreev(argv);
> +    cpr_walk_fd(unpreserve_fd);
> +
> +    error_setg_errno(&err, errno, "execvp %s failed", argv[0]);
> +    error_report_err(error_copy(err));
> +    migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
> +    migrate_set_error(s, err);
> +
> +    migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
> +
> +    err = NULL;
> +    if (!migration_block_activate(&err)) {
> +        /* error was already reported */
> +        return;
> +    }
> +
> +    if (runstate_is_live(s->vm_old_state)) {
> +        vm_start();
> +    }
> +}
> +
> +static int cpr_exec_notifier(NotifierWithReturn *notifier, MigrationEvent *e,
> +                             Error **errp)
> +{
> +    MigrationState *s = migrate_get_current();
> +
> +    if (e->type == MIG_EVENT_PRECOPY_DONE) {
> +        QEMUBH *cpr_exec_bh = qemu_bh_new(cpr_exec_cb, NULL);
> +        assert(s->state == MIGRATION_STATUS_COMPLETED);
> +        qemu_bh_schedule(cpr_exec_bh);
> +        qemu_notify_event();
> +
> +    } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
> +        cpr_exec_unpersist_state();
> +    }
> +    return 0;
> +}
> +
> +void cpr_exec_init(void)
> +{
> +    static NotifierWithReturn exec_notifier;
> +
> +    migration_add_notifier_mode(&exec_notifier, cpr_exec_notifier,
> +                                MIG_MODE_CPR_EXEC);
> +}
> diff --git a/migration/cpr.c b/migration/cpr.c
> index d3e370e..eea3773 100644
> --- a/migration/cpr.c
> +++ b/migration/cpr.c
> @@ -185,6 +185,8 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>       if (mode == MIG_MODE_CPR_TRANSFER) {
>           g_assert(channel);
>           f = cpr_transfer_output(channel, errp);
> +    } else if (mode == MIG_MODE_CPR_EXEC) {
> +        f = cpr_exec_output(errp);
>       } else {
>           return 0;
>       }
> @@ -202,6 +204,10 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>           return ret;
>       }
>   
> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
> +        cpr_exec_persist_state(f);
> +    }
> +
>       /*
>        * Close the socket only partially so we can later detect when the other
>        * end closes by getting a HUP event.
> @@ -213,6 +219,12 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>       return 0;
>   }
>   
> +static bool unpreserve_fd(int fd)
> +{
> +    qemu_set_cloexec(fd);
> +    return true;
> +}
> +
>   int cpr_state_load(MigrationChannel *channel, Error **errp)
>   {
>       int ret;
> @@ -220,7 +232,13 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>       QEMUFile *f;
>       MigMode mode = 0;
>   
> -    if (channel) {
> +    if (cpr_exec_has_state()) {
> +        mode = MIG_MODE_CPR_EXEC;
> +        f = cpr_exec_input(errp);
> +        if (channel) {
> +            warn_report("ignoring cpr channel for migration mode cpr-exec");

migration/cpr.c does not include "qemu/error-report.h"

C.

> +        }
> +    } else if (channel) {
>           mode = MIG_MODE_CPR_TRANSFER;
>           cpr_set_incoming_mode(mode);
>           f = cpr_transfer_input(channel, errp);
> @@ -232,6 +250,7 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>       }
>   
>       trace_cpr_state_load(MigMode_str(mode));
> +    cpr_set_incoming_mode(mode);
>   
>       v = qemu_get_be32(f);
>       if (v != QEMU_CPR_FILE_MAGIC) {
> @@ -253,6 +272,11 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>           return ret;
>       }
>   
> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
> +        /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */
> +        cpr_walk_fd(unpreserve_fd);
> +    }
> +
>       /*
>        * Let the caller decide when to close the socket (and generate a HUP event
>        * for the sending side).
> @@ -273,7 +297,7 @@ void cpr_state_close(void)
>   bool cpr_incoming_needed(void *opaque)
>   {
>       MigMode mode = migrate_mode();
> -    return mode == MIG_MODE_CPR_TRANSFER;
> +    return mode == MIG_MODE_CPR_TRANSFER || mode == MIG_MODE_CPR_EXEC;
>   }
>   
>   /*
> diff --git a/migration/migration.c b/migration/migration.c
> index 08a98f7..2515bec 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -333,6 +333,7 @@ void migration_object_init(void)
>   
>       ram_mig_init();
>       dirty_bitmap_mig_init();
> +    cpr_exec_init();
>   
>       /* Initialize cpu throttle timers */
>       cpu_throttle_init();
> @@ -1796,7 +1797,8 @@ bool migrate_mode_is_cpr(MigrationState *s)
>   {
>       MigMode mode = s->parameters.mode;
>       return mode == MIG_MODE_CPR_REBOOT ||
> -           mode == MIG_MODE_CPR_TRANSFER;
> +           mode == MIG_MODE_CPR_TRANSFER ||
> +           mode == MIG_MODE_CPR_EXEC;
>   }
>   
>   int migrate_init(MigrationState *s, Error **errp)
> @@ -2145,6 +2147,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
>           return false;
>       }
>   
> +    if (migrate_mode() == MIG_MODE_CPR_EXEC &&
> +        !s->parameters.has_cpr_exec_command) {
> +        error_setg(errp, "cpr-exec mode requires setting cpr-exec-command");
> +        return false;
> +    }
> +
>       if (migration_is_blocked(errp)) {
>           return false;
>       }
> diff --git a/migration/ram.c b/migration/ram.c
> index 7208bc1..6730a41 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -228,6 +228,7 @@ bool migrate_ram_is_ignored(RAMBlock *block)
>       MigMode mode = migrate_mode();
>       return !qemu_ram_is_migratable(block) ||
>              mode == MIG_MODE_CPR_TRANSFER ||
> +           mode == MIG_MODE_CPR_EXEC ||
>              (migrate_ignore_shared() && qemu_ram_is_shared(block)
>                                       && qemu_ram_is_named_file(block));
>   }
> diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
> index 741a588..1aa0573 100644
> --- a/migration/vmstate-types.c
> +++ b/migration/vmstate-types.c
> @@ -321,6 +321,10 @@ static int get_fd(QEMUFile *f, void *pv, size_t size,
>                     const VMStateField *field)
>   {
>       int32_t *v = pv;
> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
> +        qemu_get_sbe32s(f, v);
> +        return 0;
> +    }
>       *v = qemu_file_get_fd(f);
>       return 0;
>   }
> @@ -329,6 +333,10 @@ static int put_fd(QEMUFile *f, void *pv, size_t size,
>                     const VMStateField *field, JSONWriter *vmdesc)
>   {
>       int32_t *v = pv;
> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
> +        qemu_put_sbe32s(f, v);
> +        return 0;
> +    }
>       return qemu_file_put_fd(f, *v);
>   }
>   
> diff --git a/system/vl.c b/system/vl.c
> index 4c24073..f395d04 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -3867,6 +3867,8 @@ void qemu_init(int argc, char **argv)
>       }
>       qemu_init_displays();
>       accel_setup_post(current_machine);
> -    os_setup_post();
> +    if (migrate_mode() != MIG_MODE_CPR_EXEC) {
> +        os_setup_post();
> +    }
>       resume_mux_open();
>   }
> diff --git a/migration/trace-events b/migration/trace-events
> index 706db97..e8edd1f 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -354,6 +354,7 @@ cpr_state_save(const char *mode) "%s mode"
>   cpr_state_load(const char *mode) "%s mode"
>   cpr_transfer_input(const char *path) "%s"
>   cpr_transfer_output(const char *path) "%s"
> +cpr_exec(void) ""
>   
>   # block-dirty-bitmap.c
>   send_bitmap_header_enter(void) ""
Re: [PATCH V4 6/8] migration: cpr-exec mode
Posted by Steven Sistare 3 days, 21 hours ago
On 9/22/2025 11:28 AM, Cédric Le Goater wrote:
> On 9/22/25 15:49, Steve Sistare wrote:
>> Add the cpr-exec migration mode.  Usage:
>>    qemu-system-$arch -machine aux-ram-share=on ...
>>    migrate_set_parameter mode cpr-exec
>>    migrate_set_parameter cpr-exec-command \
>>      <arg1> <arg2> ... -incoming <uri-1> \
>>    migrate -d <uri-1>
>>
>> The migrate command stops the VM, saves state to uri-1,
>> directly exec's a new version of QEMU on the same host,
>> replacing the original process while retaining its PID, and
>> loads state from uri-1.  Guest RAM is preserved in place,
>> albeit with new virtual addresses.
>>
>> The new QEMU process is started by exec'ing the command
>> specified by the @cpr-exec-command parameter.  The first word of
>> the command is the binary, and the remaining words are its
>> arguments.  The command may be a direct invocation of new QEMU,
>> or may be a non-QEMU command that exec's the new QEMU binary.
>>
>> This mode creates a second migration channel that is not visible
>> to the user.  At the start of migration, old QEMU saves CPR state
>> to the second channel, and at the end of migration, it tells the
>> main loop to call cpr_exec.  New QEMU loads CPR state early, before
>> objects are created.
>>
>> Because old QEMU terminates when new QEMU starts, one cannot
>> stream data between the two, so uri-1 must be a type,
>> such as a file, that accepts all data before old QEMU exits.
>> Otherwise, old QEMU may quietly block writing to the channel.
>>
>> Memory-backend objects must have the share=on attribute, but
>> memory-backend-epc is not supported.  The VM must be started with
>> the '-machine aux-ram-share=on' option, which allows anonymous
>> memory to be transferred in place to the new process.  The memfds
>> are kept open across exec by clearing the close-on-exec flag, their
>> values are saved in CPR state, and they are mmap'd in new QEMU.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> Acked-by: Markus Armbruster <armbru@redhat.com>
>> ---
>>   qapi/migration.json       | 25 +++++++++++++-
>>   include/migration/cpr.h   |  1 +
>>   migration/cpr-exec.c      | 84 +++++++++++++++++++++++++++++++++++++++++++++++
>>   migration/cpr.c           | 28 ++++++++++++++--
>>   migration/migration.c     | 10 +++++-
>>   migration/ram.c           |  1 +
>>   migration/vmstate-types.c |  8 +++++
>>   system/vl.c               |  4 ++-
>>   migration/trace-events    |  1 +
>>   9 files changed, 157 insertions(+), 5 deletions(-)
>>
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index 2be8fa1..be0f3fc 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -694,9 +694,32 @@
>>   #     until you issue the `migrate-incoming` command.
>>   #
>>   #     (since 10.0)
>> +#
>> +# @cpr-exec: The migrate command stops the VM, saves state to the
>> +#     migration channel, directly exec's a new version of QEMU on the
>> +#     same host, replacing the original process while retaining its
>> +#     PID, and loads state from the channel.  Guest RAM is preserved
>> +#     in place.  Devices and their pinned pages are also preserved for
>> +#     VFIO and IOMMUFD.
>> +#
>> +#     Old QEMU starts new QEMU by exec'ing the command specified by
>> +#     the @cpr-exec-command parameter.  The command may be a direct
>> +#     invocation of new QEMU, or may be a wrapper that exec's the new
>> +#     QEMU binary.
>> +#
>> +#     Because old QEMU terminates when new QEMU starts, one cannot
>> +#     stream data between the two, so the channel must be a type,
>> +#     such as a file, that accepts all data before old QEMU exits.
>> +#     Otherwise, old QEMU may quietly block writing to the channel.
>> +#
>> +#     Memory-backend objects must have the share=on attribute, but
>> +#     memory-backend-epc is not supported.  The VM must be started
>> +#     with the '-machine aux-ram-share=on' option.
>> +#
>> +#     (since 10.2)
>>   ##
>>   { 'enum': 'MigMode',
>> -  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
>> +  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer', 'cpr-exec' ] }
>>   ##
>>   # @ZeroPageDetection:
>> diff --git a/include/migration/cpr.h b/include/migration/cpr.h
>> index b84389f..beed392 100644
>> --- a/include/migration/cpr.h
>> +++ b/include/migration/cpr.h
>> @@ -53,6 +53,7 @@ int cpr_get_fd_param(const char *name, const char *fdname, int index,
>>   QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
>>   QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
>> +void cpr_exec_init(void);
>>   QEMUFile *cpr_exec_output(Error **errp);
>>   QEMUFile *cpr_exec_input(Error **errp);
>>   void cpr_exec_persist_state(QEMUFile *f);
>> diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c
>> index 2c32e9c..8cf55a3 100644
>> --- a/migration/cpr-exec.c
>> +++ b/migration/cpr-exec.c
>> @@ -6,15 +6,21 @@
>>   #include "qemu/osdep.h"
>>   #include "qemu/cutils.h"
>> +#include "qemu/error-report.h"
>>   #include "qemu/memfd.h"
>>   #include "qapi/error.h"
>> +#include "qapi/type-helpers.h"
>>   #include "io/channel-file.h"
>>   #include "io/channel-socket.h"
>> +#include "block/block-global-state.h"
>> +#include "qemu/main-loop.h"
>>   #include "migration/cpr.h"
>>   #include "migration/qemu-file.h"
>> +#include "migration/migration.h"
>>   #include "migration/misc.h"
>>   #include "migration/vmstate.h"
>>   #include "system/runstate.h"
>> +#include "trace.h"
>>   #define CPR_EXEC_STATE_NAME "QEMU_CPR_EXEC_STATE"
>> @@ -92,3 +98,81 @@ QEMUFile *cpr_exec_input(Error **errp)
>>       lseek(mfd, 0, SEEK_SET);
>>       return qemu_file_new_fd_input(mfd, CPR_EXEC_STATE_NAME);
>>   }
>> +
>> +static bool preserve_fd(int fd)
>> +{
>> +    qemu_clear_cloexec(fd);
>> +    return true;
>> +}
>> +
>> +static bool unpreserve_fd(int fd)
>> +{
>> +    qemu_set_cloexec(fd);
>> +    return true;
>> +}
>> +
>> +static void cpr_exec_cb(void *opaque)
>> +{
>> +    MigrationState *s = migrate_get_current();
>> +    char **argv = strv_from_str_list(s->parameters.cpr_exec_command);
>> +    Error *err = NULL;
>> +
>> +    /*
>> +     * Clear the close-on-exec flag for all preserved fd's.  We cannot do so
>> +     * earlier because they should not persist across miscellaneous fork and
>> +     * exec calls that are performed during normal operation.
>> +     */
>> +    cpr_walk_fd(preserve_fd);
>> +
>> +    trace_cpr_exec();
>> +    execvp(argv[0], argv);
>> +
>> +    /*
>> +     * exec should only fail if argv[0] is bogus, or has a permissions problem,
>> +     * or the system is very short on resources.
>> +     */
>> +    g_strfreev(argv);
>> +    cpr_walk_fd(unpreserve_fd);
>> +
>> +    error_setg_errno(&err, errno, "execvp %s failed", argv[0]);
>> +    error_report_err(error_copy(err));
>> +    migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
>> +    migrate_set_error(s, err);
>> +
>> +    migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
>> +
>> +    err = NULL;
>> +    if (!migration_block_activate(&err)) {
>> +        /* error was already reported */
>> +        return;
>> +    }
>> +
>> +    if (runstate_is_live(s->vm_old_state)) {
>> +        vm_start();
>> +    }
>> +}
>> +
>> +static int cpr_exec_notifier(NotifierWithReturn *notifier, MigrationEvent *e,
>> +                             Error **errp)
>> +{
>> +    MigrationState *s = migrate_get_current();
>> +
>> +    if (e->type == MIG_EVENT_PRECOPY_DONE) {
>> +        QEMUBH *cpr_exec_bh = qemu_bh_new(cpr_exec_cb, NULL);
>> +        assert(s->state == MIGRATION_STATUS_COMPLETED);
>> +        qemu_bh_schedule(cpr_exec_bh);
>> +        qemu_notify_event();
>> +
>> +    } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
>> +        cpr_exec_unpersist_state();
>> +    }
>> +    return 0;
>> +}
>> +
>> +void cpr_exec_init(void)
>> +{
>> +    static NotifierWithReturn exec_notifier;
>> +
>> +    migration_add_notifier_mode(&exec_notifier, cpr_exec_notifier,
>> +                                MIG_MODE_CPR_EXEC);
>> +}
>> diff --git a/migration/cpr.c b/migration/cpr.c
>> index d3e370e..eea3773 100644
>> --- a/migration/cpr.c
>> +++ b/migration/cpr.c
>> @@ -185,6 +185,8 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>>       if (mode == MIG_MODE_CPR_TRANSFER) {
>>           g_assert(channel);
>>           f = cpr_transfer_output(channel, errp);
>> +    } else if (mode == MIG_MODE_CPR_EXEC) {
>> +        f = cpr_exec_output(errp);
>>       } else {
>>           return 0;
>>       }
>> @@ -202,6 +204,10 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>>           return ret;
>>       }
>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>> +        cpr_exec_persist_state(f);
>> +    }
>> +
>>       /*
>>        * Close the socket only partially so we can later detect when the other
>>        * end closes by getting a HUP event.
>> @@ -213,6 +219,12 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>>       return 0;
>>   }
>> +static bool unpreserve_fd(int fd)
>> +{
>> +    qemu_set_cloexec(fd);
>> +    return true;
>> +}
>> +
>>   int cpr_state_load(MigrationChannel *channel, Error **errp)
>>   {
>>       int ret;
>> @@ -220,7 +232,13 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>>       QEMUFile *f;
>>       MigMode mode = 0;
>> -    if (channel) {
>> +    if (cpr_exec_has_state()) {
>> +        mode = MIG_MODE_CPR_EXEC;
>> +        f = cpr_exec_input(errp);
>> +        if (channel) {
>> +            warn_report("ignoring cpr channel for migration mode cpr-exec");
> 
> migration/cpr.c does not include "qemu/error-report.h"

It builds just fine because it is included indirectly, but I will include it
directly.

- Steve
>> +        }
>> +    } else if (channel) {
>>           mode = MIG_MODE_CPR_TRANSFER;
>>           cpr_set_incoming_mode(mode);
>>           f = cpr_transfer_input(channel, errp);
>> @@ -232,6 +250,7 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>>       }
>>       trace_cpr_state_load(MigMode_str(mode));
>> +    cpr_set_incoming_mode(mode);
>>       v = qemu_get_be32(f);
>>       if (v != QEMU_CPR_FILE_MAGIC) {
>> @@ -253,6 +272,11 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>>           return ret;
>>       }
>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>> +        /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */
>> +        cpr_walk_fd(unpreserve_fd);
>> +    }
>> +
>>       /*
>>        * Let the caller decide when to close the socket (and generate a HUP event
>>        * for the sending side).
>> @@ -273,7 +297,7 @@ void cpr_state_close(void)
>>   bool cpr_incoming_needed(void *opaque)
>>   {
>>       MigMode mode = migrate_mode();
>> -    return mode == MIG_MODE_CPR_TRANSFER;
>> +    return mode == MIG_MODE_CPR_TRANSFER || mode == MIG_MODE_CPR_EXEC;
>>   }
>>   /*
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 08a98f7..2515bec 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -333,6 +333,7 @@ void migration_object_init(void)
>>       ram_mig_init();
>>       dirty_bitmap_mig_init();
>> +    cpr_exec_init();
>>       /* Initialize cpu throttle timers */
>>       cpu_throttle_init();
>> @@ -1796,7 +1797,8 @@ bool migrate_mode_is_cpr(MigrationState *s)
>>   {
>>       MigMode mode = s->parameters.mode;
>>       return mode == MIG_MODE_CPR_REBOOT ||
>> -           mode == MIG_MODE_CPR_TRANSFER;
>> +           mode == MIG_MODE_CPR_TRANSFER ||
>> +           mode == MIG_MODE_CPR_EXEC;
>>   }
>>   int migrate_init(MigrationState *s, Error **errp)
>> @@ -2145,6 +2147,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
>>           return false;
>>       }
>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC &&
>> +        !s->parameters.has_cpr_exec_command) {
>> +        error_setg(errp, "cpr-exec mode requires setting cpr-exec-command");
>> +        return false;
>> +    }
>> +
>>       if (migration_is_blocked(errp)) {
>>           return false;
>>       }
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 7208bc1..6730a41 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -228,6 +228,7 @@ bool migrate_ram_is_ignored(RAMBlock *block)
>>       MigMode mode = migrate_mode();
>>       return !qemu_ram_is_migratable(block) ||
>>              mode == MIG_MODE_CPR_TRANSFER ||
>> +           mode == MIG_MODE_CPR_EXEC ||
>>              (migrate_ignore_shared() && qemu_ram_is_shared(block)
>>                                       && qemu_ram_is_named_file(block));
>>   }
>> diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
>> index 741a588..1aa0573 100644
>> --- a/migration/vmstate-types.c
>> +++ b/migration/vmstate-types.c
>> @@ -321,6 +321,10 @@ static int get_fd(QEMUFile *f, void *pv, size_t size,
>>                     const VMStateField *field)
>>   {
>>       int32_t *v = pv;
>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>> +        qemu_get_sbe32s(f, v);
>> +        return 0;
>> +    }
>>       *v = qemu_file_get_fd(f);
>>       return 0;
>>   }
>> @@ -329,6 +333,10 @@ static int put_fd(QEMUFile *f, void *pv, size_t size,
>>                     const VMStateField *field, JSONWriter *vmdesc)
>>   {
>>       int32_t *v = pv;
>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>> +        qemu_put_sbe32s(f, v);
>> +        return 0;
>> +    }
>>       return qemu_file_put_fd(f, *v);
>>   }
>> diff --git a/system/vl.c b/system/vl.c
>> index 4c24073..f395d04 100644
>> --- a/system/vl.c
>> +++ b/system/vl.c
>> @@ -3867,6 +3867,8 @@ void qemu_init(int argc, char **argv)
>>       }
>>       qemu_init_displays();
>>       accel_setup_post(current_machine);
>> -    os_setup_post();
>> +    if (migrate_mode() != MIG_MODE_CPR_EXEC) {
>> +        os_setup_post();
>> +    }
>>       resume_mux_open();
>>   }
>> diff --git a/migration/trace-events b/migration/trace-events
>> index 706db97..e8edd1f 100644
>> --- a/migration/trace-events
>> +++ b/migration/trace-events
>> @@ -354,6 +354,7 @@ cpr_state_save(const char *mode) "%s mode"
>>   cpr_state_load(const char *mode) "%s mode"
>>   cpr_transfer_input(const char *path) "%s"
>>   cpr_transfer_output(const char *path) "%s"
>> +cpr_exec(void) ""
>>   # block-dirty-bitmap.c
>>   send_bitmap_header_enter(void) ""
> 


Re: [PATCH V4 6/8] migration: cpr-exec mode
Posted by Cédric Le Goater 3 days, 8 hours ago
On 9/24/25 20:16, Steven Sistare wrote:
> On 9/22/2025 11:28 AM, Cédric Le Goater wrote:
>> On 9/22/25 15:49, Steve Sistare wrote:
>>> Add the cpr-exec migration mode.  Usage:
>>>    qemu-system-$arch -machine aux-ram-share=on ...
>>>    migrate_set_parameter mode cpr-exec
>>>    migrate_set_parameter cpr-exec-command \
>>>      <arg1> <arg2> ... -incoming <uri-1> \
>>>    migrate -d <uri-1>
>>>
>>> The migrate command stops the VM, saves state to uri-1,
>>> directly exec's a new version of QEMU on the same host,
>>> replacing the original process while retaining its PID, and
>>> loads state from uri-1.  Guest RAM is preserved in place,
>>> albeit with new virtual addresses.
>>>
>>> The new QEMU process is started by exec'ing the command
>>> specified by the @cpr-exec-command parameter.  The first word of
>>> the command is the binary, and the remaining words are its
>>> arguments.  The command may be a direct invocation of new QEMU,
>>> or may be a non-QEMU command that exec's the new QEMU binary.
>>>
>>> This mode creates a second migration channel that is not visible
>>> to the user.  At the start of migration, old QEMU saves CPR state
>>> to the second channel, and at the end of migration, it tells the
>>> main loop to call cpr_exec.  New QEMU loads CPR state early, before
>>> objects are created.
>>>
>>> Because old QEMU terminates when new QEMU starts, one cannot
>>> stream data between the two, so uri-1 must be a type,
>>> such as a file, that accepts all data before old QEMU exits.
>>> Otherwise, old QEMU may quietly block writing to the channel.
>>>
>>> Memory-backend objects must have the share=on attribute, but
>>> memory-backend-epc is not supported.  The VM must be started with
>>> the '-machine aux-ram-share=on' option, which allows anonymous
>>> memory to be transferred in place to the new process.  The memfds
>>> are kept open across exec by clearing the close-on-exec flag, their
>>> values are saved in CPR state, and they are mmap'd in new QEMU.
>>>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>> Acked-by: Markus Armbruster <armbru@redhat.com>
>>> ---
>>>   qapi/migration.json       | 25 +++++++++++++-
>>>   include/migration/cpr.h   |  1 +
>>>   migration/cpr-exec.c      | 84 +++++++++++++++++++++++++++++++++++++++++++++++
>>>   migration/cpr.c           | 28 ++++++++++++++--
>>>   migration/migration.c     | 10 +++++-
>>>   migration/ram.c           |  1 +
>>>   migration/vmstate-types.c |  8 +++++
>>>   system/vl.c               |  4 ++-
>>>   migration/trace-events    |  1 +
>>>   9 files changed, 157 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>> index 2be8fa1..be0f3fc 100644
>>> --- a/qapi/migration.json
>>> +++ b/qapi/migration.json
>>> @@ -694,9 +694,32 @@
>>>   #     until you issue the `migrate-incoming` command.
>>>   #
>>>   #     (since 10.0)
>>> +#
>>> +# @cpr-exec: The migrate command stops the VM, saves state to the
>>> +#     migration channel, directly exec's a new version of QEMU on the
>>> +#     same host, replacing the original process while retaining its
>>> +#     PID, and loads state from the channel.  Guest RAM is preserved
>>> +#     in place.  Devices and their pinned pages are also preserved for
>>> +#     VFIO and IOMMUFD.
>>> +#
>>> +#     Old QEMU starts new QEMU by exec'ing the command specified by
>>> +#     the @cpr-exec-command parameter.  The command may be a direct
>>> +#     invocation of new QEMU, or may be a wrapper that exec's the new
>>> +#     QEMU binary.
>>> +#
>>> +#     Because old QEMU terminates when new QEMU starts, one cannot
>>> +#     stream data between the two, so the channel must be a type,
>>> +#     such as a file, that accepts all data before old QEMU exits.
>>> +#     Otherwise, old QEMU may quietly block writing to the channel.
>>> +#
>>> +#     Memory-backend objects must have the share=on attribute, but
>>> +#     memory-backend-epc is not supported.  The VM must be started
>>> +#     with the '-machine aux-ram-share=on' option.
>>> +#
>>> +#     (since 10.2)
>>>   ##
>>>   { 'enum': 'MigMode',
>>> -  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
>>> +  'data': [ 'normal', 'cpr-reboot', 'cpr-transfer', 'cpr-exec' ] }
>>>   ##
>>>   # @ZeroPageDetection:
>>> diff --git a/include/migration/cpr.h b/include/migration/cpr.h
>>> index b84389f..beed392 100644
>>> --- a/include/migration/cpr.h
>>> +++ b/include/migration/cpr.h
>>> @@ -53,6 +53,7 @@ int cpr_get_fd_param(const char *name, const char *fdname, int index,
>>>   QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
>>>   QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
>>> +void cpr_exec_init(void);
>>>   QEMUFile *cpr_exec_output(Error **errp);
>>>   QEMUFile *cpr_exec_input(Error **errp);
>>>   void cpr_exec_persist_state(QEMUFile *f);
>>> diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c
>>> index 2c32e9c..8cf55a3 100644
>>> --- a/migration/cpr-exec.c
>>> +++ b/migration/cpr-exec.c
>>> @@ -6,15 +6,21 @@
>>>   #include "qemu/osdep.h"
>>>   #include "qemu/cutils.h"
>>> +#include "qemu/error-report.h"
>>>   #include "qemu/memfd.h"
>>>   #include "qapi/error.h"
>>> +#include "qapi/type-helpers.h"
>>>   #include "io/channel-file.h"
>>>   #include "io/channel-socket.h"
>>> +#include "block/block-global-state.h"
>>> +#include "qemu/main-loop.h"
>>>   #include "migration/cpr.h"
>>>   #include "migration/qemu-file.h"
>>> +#include "migration/migration.h"
>>>   #include "migration/misc.h"
>>>   #include "migration/vmstate.h"
>>>   #include "system/runstate.h"
>>> +#include "trace.h"
>>>   #define CPR_EXEC_STATE_NAME "QEMU_CPR_EXEC_STATE"
>>> @@ -92,3 +98,81 @@ QEMUFile *cpr_exec_input(Error **errp)
>>>       lseek(mfd, 0, SEEK_SET);
>>>       return qemu_file_new_fd_input(mfd, CPR_EXEC_STATE_NAME);
>>>   }
>>> +
>>> +static bool preserve_fd(int fd)
>>> +{
>>> +    qemu_clear_cloexec(fd);
>>> +    return true;
>>> +}
>>> +
>>> +static bool unpreserve_fd(int fd)
>>> +{
>>> +    qemu_set_cloexec(fd);
>>> +    return true;
>>> +}
>>> +
>>> +static void cpr_exec_cb(void *opaque)
>>> +{
>>> +    MigrationState *s = migrate_get_current();
>>> +    char **argv = strv_from_str_list(s->parameters.cpr_exec_command);
>>> +    Error *err = NULL;
>>> +
>>> +    /*
>>> +     * Clear the close-on-exec flag for all preserved fd's.  We cannot do so
>>> +     * earlier because they should not persist across miscellaneous fork and
>>> +     * exec calls that are performed during normal operation.
>>> +     */
>>> +    cpr_walk_fd(preserve_fd);
>>> +
>>> +    trace_cpr_exec();
>>> +    execvp(argv[0], argv);
>>> +
>>> +    /*
>>> +     * exec should only fail if argv[0] is bogus, or has a permissions problem,
>>> +     * or the system is very short on resources.
>>> +     */
>>> +    g_strfreev(argv);
>>> +    cpr_walk_fd(unpreserve_fd);
>>> +
>>> +    error_setg_errno(&err, errno, "execvp %s failed", argv[0]);
>>> +    error_report_err(error_copy(err));
>>> +    migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
>>> +    migrate_set_error(s, err);
>>> +
>>> +    migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
>>> +
>>> +    err = NULL;
>>> +    if (!migration_block_activate(&err)) {
>>> +        /* error was already reported */
>>> +        return;
>>> +    }
>>> +
>>> +    if (runstate_is_live(s->vm_old_state)) {
>>> +        vm_start();
>>> +    }
>>> +}
>>> +
>>> +static int cpr_exec_notifier(NotifierWithReturn *notifier, MigrationEvent *e,
>>> +                             Error **errp)
>>> +{
>>> +    MigrationState *s = migrate_get_current();
>>> +
>>> +    if (e->type == MIG_EVENT_PRECOPY_DONE) {
>>> +        QEMUBH *cpr_exec_bh = qemu_bh_new(cpr_exec_cb, NULL);
>>> +        assert(s->state == MIGRATION_STATUS_COMPLETED);
>>> +        qemu_bh_schedule(cpr_exec_bh);
>>> +        qemu_notify_event();
>>> +
>>> +    } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
>>> +        cpr_exec_unpersist_state();
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +void cpr_exec_init(void)
>>> +{
>>> +    static NotifierWithReturn exec_notifier;
>>> +
>>> +    migration_add_notifier_mode(&exec_notifier, cpr_exec_notifier,
>>> +                                MIG_MODE_CPR_EXEC);
>>> +}
>>> diff --git a/migration/cpr.c b/migration/cpr.c
>>> index d3e370e..eea3773 100644
>>> --- a/migration/cpr.c
>>> +++ b/migration/cpr.c
>>> @@ -185,6 +185,8 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>>>       if (mode == MIG_MODE_CPR_TRANSFER) {
>>>           g_assert(channel);
>>>           f = cpr_transfer_output(channel, errp);
>>> +    } else if (mode == MIG_MODE_CPR_EXEC) {
>>> +        f = cpr_exec_output(errp);
>>>       } else {
>>>           return 0;
>>>       }
>>> @@ -202,6 +204,10 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>>>           return ret;
>>>       }
>>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>>> +        cpr_exec_persist_state(f);
>>> +    }
>>> +
>>>       /*
>>>        * Close the socket only partially so we can later detect when the other
>>>        * end closes by getting a HUP event.
>>> @@ -213,6 +219,12 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
>>>       return 0;
>>>   }
>>> +static bool unpreserve_fd(int fd)
>>> +{
>>> +    qemu_set_cloexec(fd);
>>> +    return true;
>>> +}
>>> +
>>>   int cpr_state_load(MigrationChannel *channel, Error **errp)
>>>   {
>>>       int ret;
>>> @@ -220,7 +232,13 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>>>       QEMUFile *f;
>>>       MigMode mode = 0;
>>> -    if (channel) {
>>> +    if (cpr_exec_has_state()) {
>>> +        mode = MIG_MODE_CPR_EXEC;
>>> +        f = cpr_exec_input(errp);
>>> +        if (channel) {
>>> +            warn_report("ignoring cpr channel for migration mode cpr-exec");
>>
>> migration/cpr.c does not include "qemu/error-report.h"
> 
> It builds just fine because it is included indirectly, but I will include it
> directly.


The build broke on my tree, although I do have other patches that move code
around.  Either way, it is better to be explicit.


Thanks,

C.


> 
> - Steve
>>> +        }
>>> +    } else if (channel) {
>>>           mode = MIG_MODE_CPR_TRANSFER;
>>>           cpr_set_incoming_mode(mode);
>>>           f = cpr_transfer_input(channel, errp);
>>> @@ -232,6 +250,7 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>>>       }
>>>       trace_cpr_state_load(MigMode_str(mode));
>>> +    cpr_set_incoming_mode(mode);
>>>       v = qemu_get_be32(f);
>>>       if (v != QEMU_CPR_FILE_MAGIC) {
>>> @@ -253,6 +272,11 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
>>>           return ret;
>>>       }
>>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>>> +        /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */
>>> +        cpr_walk_fd(unpreserve_fd);
>>> +    }
>>> +
>>>       /*
>>>        * Let the caller decide when to close the socket (and generate a HUP event
>>>        * for the sending side).
>>> @@ -273,7 +297,7 @@ void cpr_state_close(void)
>>>   bool cpr_incoming_needed(void *opaque)
>>>   {
>>>       MigMode mode = migrate_mode();
>>> -    return mode == MIG_MODE_CPR_TRANSFER;
>>> +    return mode == MIG_MODE_CPR_TRANSFER || mode == MIG_MODE_CPR_EXEC;
>>>   }
>>>   /*
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index 08a98f7..2515bec 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -333,6 +333,7 @@ void migration_object_init(void)
>>>       ram_mig_init();
>>>       dirty_bitmap_mig_init();
>>> +    cpr_exec_init();
>>>       /* Initialize cpu throttle timers */
>>>       cpu_throttle_init();
>>> @@ -1796,7 +1797,8 @@ bool migrate_mode_is_cpr(MigrationState *s)
>>>   {
>>>       MigMode mode = s->parameters.mode;
>>>       return mode == MIG_MODE_CPR_REBOOT ||
>>> -           mode == MIG_MODE_CPR_TRANSFER;
>>> +           mode == MIG_MODE_CPR_TRANSFER ||
>>> +           mode == MIG_MODE_CPR_EXEC;
>>>   }
>>>   int migrate_init(MigrationState *s, Error **errp)
>>> @@ -2145,6 +2147,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
>>>           return false;
>>>       }
>>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC &&
>>> +        !s->parameters.has_cpr_exec_command) {
>>> +        error_setg(errp, "cpr-exec mode requires setting cpr-exec-command");
>>> +        return false;
>>> +    }
>>> +
>>>       if (migration_is_blocked(errp)) {
>>>           return false;
>>>       }
>>> diff --git a/migration/ram.c b/migration/ram.c
>>> index 7208bc1..6730a41 100644
>>> --- a/migration/ram.c
>>> +++ b/migration/ram.c
>>> @@ -228,6 +228,7 @@ bool migrate_ram_is_ignored(RAMBlock *block)
>>>       MigMode mode = migrate_mode();
>>>       return !qemu_ram_is_migratable(block) ||
>>>              mode == MIG_MODE_CPR_TRANSFER ||
>>> +           mode == MIG_MODE_CPR_EXEC ||
>>>              (migrate_ignore_shared() && qemu_ram_is_shared(block)
>>>                                       && qemu_ram_is_named_file(block));
>>>   }
>>> diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
>>> index 741a588..1aa0573 100644
>>> --- a/migration/vmstate-types.c
>>> +++ b/migration/vmstate-types.c
>>> @@ -321,6 +321,10 @@ static int get_fd(QEMUFile *f, void *pv, size_t size,
>>>                     const VMStateField *field)
>>>   {
>>>       int32_t *v = pv;
>>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>>> +        qemu_get_sbe32s(f, v);
>>> +        return 0;
>>> +    }
>>>       *v = qemu_file_get_fd(f);
>>>       return 0;
>>>   }
>>> @@ -329,6 +333,10 @@ static int put_fd(QEMUFile *f, void *pv, size_t size,
>>>                     const VMStateField *field, JSONWriter *vmdesc)
>>>   {
>>>       int32_t *v = pv;
>>> +    if (migrate_mode() == MIG_MODE_CPR_EXEC) {
>>> +        qemu_put_sbe32s(f, v);
>>> +        return 0;
>>> +    }
>>>       return qemu_file_put_fd(f, *v);
>>>   }
>>> diff --git a/system/vl.c b/system/vl.c
>>> index 4c24073..f395d04 100644
>>> --- a/system/vl.c
>>> +++ b/system/vl.c
>>> @@ -3867,6 +3867,8 @@ void qemu_init(int argc, char **argv)
>>>       }
>>>       qemu_init_displays();
>>>       accel_setup_post(current_machine);
>>> -    os_setup_post();
>>> +    if (migrate_mode() != MIG_MODE_CPR_EXEC) {
>>> +        os_setup_post();
>>> +    }
>>>       resume_mux_open();
>>>   }
>>> diff --git a/migration/trace-events b/migration/trace-events
>>> index 706db97..e8edd1f 100644
>>> --- a/migration/trace-events
>>> +++ b/migration/trace-events
>>> @@ -354,6 +354,7 @@ cpr_state_save(const char *mode) "%s mode"
>>>   cpr_state_load(const char *mode) "%s mode"
>>>   cpr_transfer_input(const char *path) "%s"
>>>   cpr_transfer_output(const char *path) "%s"
>>> +cpr_exec(void) ""
>>>   # block-dirty-bitmap.c
>>>   send_bitmap_header_enter(void) ""
>>
>