[PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread

Maciej S. Szmigiero posted 36 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Maciej S. Szmigiero 1 month, 1 week ago
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>

Since it's important to finish loading device state transferred via the
main migration channel (via save_live_iterate SaveVMHandler) before
starting loading the data asynchronously transferred via multifd, the thread
doing the actual loading of the multifd transferred data is only started
from switchover_start SaveVMHandler.

switchover_start handler is called when MIG_CMD_SWITCHOVER_START
sub-command of QEMU_VM_COMMAND is received via the main migration channel.

This sub-command is only sent after all save_live_iterate data have already
been posted so it is safe to commence loading of the multifd-transferred
device state upon receiving it - loading of save_live_iterate data happens
synchronously in the main migration thread (much like the processing of
MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
processed all the preceding data must have already been loaded.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
---
 hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
 hw/vfio/migration-multifd.h |   2 +
 hw/vfio/migration.c         |  12 ++
 hw/vfio/trace-events        |   5 +
 4 files changed, 244 insertions(+)

diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
index 5d5ee1393674..b3a88c062769 100644
--- a/hw/vfio/migration-multifd.c
+++ b/hw/vfio/migration-multifd.c
@@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
 } VFIOStateBuffer;
 
 typedef struct VFIOMultifd {
+    QemuThread load_bufs_thread;
+    bool load_bufs_thread_running;
+    bool load_bufs_thread_want_exit;
+
     VFIOStateBuffers load_bufs;
     QemuCond load_bufs_buffer_ready_cond;
+    QemuCond load_bufs_thread_finished_cond;
     QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
     uint32_t load_buf_idx;
     uint32_t load_buf_idx_last;
@@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
     return true;
 }
 
+static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
+{
+    return -EINVAL;
+}
+
+static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
+{
+    VFIOStateBuffer *lb;
+    guint bufs_len;
+
+    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
+    if (multifd->load_buf_idx >= bufs_len) {
+        assert(multifd->load_buf_idx == bufs_len);
+        return NULL;
+    }
+
+    lb = vfio_state_buffers_at(&multifd->load_bufs,
+                               multifd->load_buf_idx);
+    if (!lb->is_present) {
+        return NULL;
+    }
+
+    return lb;
+}
+
+static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
+                                         VFIOStateBuffer *lb,
+                                         Error **errp)
+{
+    VFIOMigration *migration = vbasedev->migration;
+    VFIOMultifd *multifd = migration->multifd;
+    g_autofree char *buf = NULL;
+    char *buf_cur;
+    size_t buf_len;
+
+    if (!lb->len) {
+        return true;
+    }
+
+    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
+                                                   multifd->load_buf_idx);
+
+    /* lb might become re-allocated when we drop the lock */
+    buf = g_steal_pointer(&lb->data);
+    buf_cur = buf;
+    buf_len = lb->len;
+    while (buf_len > 0) {
+        ssize_t wr_ret;
+        int errno_save;
+
+        /*
+         * Loading data to the device takes a while,
+         * drop the lock during this process.
+         */
+        qemu_mutex_unlock(&multifd->load_bufs_mutex);
+        wr_ret = write(migration->data_fd, buf_cur, buf_len);
+        errno_save = errno;
+        qemu_mutex_lock(&multifd->load_bufs_mutex);
+
+        if (wr_ret < 0) {
+            error_setg(errp,
+                       "writing state buffer %" PRIu32 " failed: %d",
+                       multifd->load_buf_idx, errno_save);
+            return false;
+        }
+
+        assert(wr_ret <= buf_len);
+        buf_len -= wr_ret;
+        buf_cur += wr_ret;
+    }
+
+    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
+                                                 multifd->load_buf_idx);
+
+    return true;
+}
+
+static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
+                                            bool *should_quit)
+{
+    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
+}
+
+/*
+ * This thread is spawned by vfio_multifd_switchover_start() which gets
+ * called upon encountering the switchover point marker in main migration
+ * stream.
+ *
+ * It exits after either:
+ * * completing loading the remaining device state and device config, OR:
+ * * encountering some error while doing the above, OR:
+ * * being forcefully aborted by the migration core by it setting should_quit
+ *   or by vfio_load_cleanup_load_bufs_thread() setting
+ *   multifd->load_bufs_thread_want_exit.
+ */
+static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    VFIOMultifd *multifd = migration->multifd;
+    bool ret = true;
+    int config_ret;
+
+    assert(multifd);
+    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
+
+    assert(multifd->load_bufs_thread_running);
+
+    while (true) {
+        VFIOStateBuffer *lb;
+
+        /*
+         * Always check cancellation first after the buffer_ready wait below in
+         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
+         */
+        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
+            error_setg(errp, "operation cancelled");
+            ret = false;
+            goto ret_signal;
+        }
+
+        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
+
+        lb = vfio_load_state_buffer_get(multifd);
+        if (!lb) {
+            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
+                                                        multifd->load_buf_idx);
+            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
+                           &multifd->load_bufs_mutex);
+            continue;
+        }
+
+        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
+            break;
+        }
+
+        if (multifd->load_buf_idx == 0) {
+            trace_vfio_load_state_device_buffer_start(vbasedev->name);
+        }
+
+        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
+            ret = false;
+            goto ret_signal;
+        }
+
+        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
+            trace_vfio_load_state_device_buffer_end(vbasedev->name);
+        }
+
+        multifd->load_buf_idx++;
+    }
+
+    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
+    if (config_ret) {
+        error_setg(errp, "load config state failed: %d", config_ret);
+        ret = false;
+    }
+
+ret_signal:
+    /*
+     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
+     * this thread is exiting.
+     */
+    multifd->load_bufs_thread_running = false;
+    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
+
+    return ret;
+}
+
 VFIOMultifd *vfio_multifd_new(void)
 {
     VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
@@ -191,11 +365,42 @@ VFIOMultifd *vfio_multifd_new(void)
     multifd->load_buf_idx_last = UINT32_MAX;
     qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
 
+    multifd->load_bufs_thread_running = false;
+    multifd->load_bufs_thread_want_exit = false;
+    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
+
     return multifd;
 }
 
+/*
+ * Terminates vfio_load_bufs_thread by setting
+ * multifd->load_bufs_thread_want_exit and signalling all the conditions
+ * the thread could be blocked on.
+ *
+ * Waits for the thread to signal that it had finished.
+ */
+static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
+{
+    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
+    bql_unlock();
+    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
+        while (multifd->load_bufs_thread_running) {
+            multifd->load_bufs_thread_want_exit = true;
+
+            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
+            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
+                           &multifd->load_bufs_mutex);
+        }
+    }
+    bql_lock();
+}
+
 void vfio_multifd_free(VFIOMultifd *multifd)
 {
+    vfio_load_cleanup_load_bufs_thread(multifd);
+
+    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
+    vfio_state_buffers_destroy(&multifd->load_bufs);
     qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
     qemu_mutex_destroy(&multifd->load_bufs_mutex);
 
@@ -225,3 +430,23 @@ bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp)
 
     return true;
 }
+
+int vfio_multifd_switchover_start(VFIODevice *vbasedev)
+{
+    VFIOMigration *migration = vbasedev->migration;
+    VFIOMultifd *multifd = migration->multifd;
+
+    assert(multifd);
+
+    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
+    bql_unlock();
+    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
+        assert(!multifd->load_bufs_thread_running);
+        multifd->load_bufs_thread_running = true;
+    }
+    bql_lock();
+
+    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);
+
+    return 0;
+}
diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
index d5ab7d6f85f5..09cbb437d9d1 100644
--- a/hw/vfio/migration-multifd.h
+++ b/hw/vfio/migration-multifd.h
@@ -25,4 +25,6 @@ bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp);
 bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
                             Error **errp);
 
+int vfio_multifd_switchover_start(VFIODevice *vbasedev);
+
 #endif
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index abaf4d08d4a9..85f54cb22df2 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -793,6 +793,17 @@ static bool vfio_switchover_ack_needed(void *opaque)
     return vfio_precopy_supported(vbasedev);
 }
 
+static int vfio_switchover_start(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    if (vfio_multifd_transfer_enabled(vbasedev)) {
+        return vfio_multifd_switchover_start(vbasedev);
+    }
+
+    return 0;
+}
+
 static const SaveVMHandlers savevm_vfio_handlers = {
     .save_prepare = vfio_save_prepare,
     .save_setup = vfio_save_setup,
@@ -808,6 +819,7 @@ static const SaveVMHandlers savevm_vfio_handlers = {
     .load_state = vfio_load_state,
     .load_state_buffer = vfio_load_state_buffer,
     .switchover_ack_needed = vfio_switchover_ack_needed,
+    .switchover_start = vfio_switchover_start,
 };
 
 /* ---------------------------------------------------------------------- */
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 042a3dc54a33..418b378ebd29 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -154,6 +154,11 @@ vfio_load_device_config_state_end(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
 vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
+vfio_load_state_device_buffer_start(const char *name) " (%s)"
+vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32
+vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32
+vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32
+vfio_load_state_device_buffer_end(const char *name) " (%s)"
 vfio_migration_realize(const char *name) " (%s)"
 vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
 vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Avihai Horon 1 month ago
On 19/02/2025 22:34, Maciej S. Szmigiero wrote:
> External email: Use caution opening links or attachments
>
>
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>

Maybe add a sentence talking about the load thread itself first? E.g.:

Add a thread which loads the VFIO device state buffers that were 
received via multifd.
Each VFIO device that has multifd device state transfer enabled has one 
such thread, which is created using migration core API 
qemu_loadvm_start_load_thread().

Since it's important to finish...

> Since it's important to finish loading device state transferred via the
> main migration channel (via save_live_iterate SaveVMHandler) before
> starting loading the data asynchronously transferred via multifd the thread
> doing the actual loading of the multifd transferred data is only started
> from switchover_start SaveVMHandler.
>
> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
>
> This sub-command is only sent after all save_live_iterate data have already
> been posted so it is safe to commence loading of the multifd-transferred
> device state upon receiving it - loading of save_live_iterate data happens
> synchronously in the main migration thread (much like the processing of
> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
> processed all the proceeding data must have already been loaded.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>   hw/vfio/migration-multifd.h |   2 +
>   hw/vfio/migration.c         |  12 ++
>   hw/vfio/trace-events        |   5 +
>   4 files changed, 244 insertions(+)
>
> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
> index 5d5ee1393674..b3a88c062769 100644
> --- a/hw/vfio/migration-multifd.c
> +++ b/hw/vfio/migration-multifd.c
> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>   } VFIOStateBuffer;
>
>   typedef struct VFIOMultifd {
> +    QemuThread load_bufs_thread;

This can be dropped.

> +    bool load_bufs_thread_running;
> +    bool load_bufs_thread_want_exit;
> +
>       VFIOStateBuffers load_bufs;
>       QemuCond load_bufs_buffer_ready_cond;
> +    QemuCond load_bufs_thread_finished_cond;
>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>       uint32_t load_buf_idx;
>       uint32_t load_buf_idx_last;
> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>       return true;
>   }
>
> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
> +{
> +    return -EINVAL;
> +}
> +
> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
> +{
> +    VFIOStateBuffer *lb;
> +    guint bufs_len;
> +
> +    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
> +    if (multifd->load_buf_idx >= bufs_len) {
> +        assert(multifd->load_buf_idx == bufs_len);
> +        return NULL;
> +    }
> +
> +    lb = vfio_state_buffers_at(&multifd->load_bufs,
> +                               multifd->load_buf_idx);
> +    if (!lb->is_present) {
> +        return NULL;
> +    }
> +
> +    return lb;
> +}
> +
> +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
> +                                         VFIOStateBuffer *lb,
> +                                         Error **errp)
> +{
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIOMultifd *multifd = migration->multifd;
> +    g_autofree char *buf = NULL;
> +    char *buf_cur;
> +    size_t buf_len;
> +
> +    if (!lb->len) {
> +        return true;
> +    }
> +
> +    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
> +                                                   multifd->load_buf_idx);
> +
> +    /* lb might become re-allocated when we drop the lock */
> +    buf = g_steal_pointer(&lb->data);
> +    buf_cur = buf;
> +    buf_len = lb->len;
> +    while (buf_len > 0) {
> +        ssize_t wr_ret;
> +        int errno_save;
> +
> +        /*
> +         * Loading data to the device takes a while,
> +         * drop the lock during this process.
> +         */
> +        qemu_mutex_unlock(&multifd->load_bufs_mutex);
> +        wr_ret = write(migration->data_fd, buf_cur, buf_len);
> +        errno_save = errno;
> +        qemu_mutex_lock(&multifd->load_bufs_mutex);
> +
> +        if (wr_ret < 0) {
> +            error_setg(errp,
> +                       "writing state buffer %" PRIu32 " failed: %d",
> +                       multifd->load_buf_idx, errno_save);

Let's add vbasedev->name to the error message so we know which device 
caused the error.

> +            return false;
> +        }
> +
> +        assert(wr_ret <= buf_len);

I think this assert is redundant: we write buf_len bytes and by 
definition of write() wr_ret will be <= buf_len.

> +        buf_len -= wr_ret;
> +        buf_cur += wr_ret;
> +    }
> +
> +    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
> +                                                 multifd->load_buf_idx);
> +
> +    return true;
> +}
> +
> +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
> +                                            bool *should_quit)
> +{
> +    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
> +}
> +
> +/*
> + * This thread is spawned by vfio_multifd_switchover_start() which gets
> + * called upon encountering the switchover point marker in main migration
> + * stream.
> + *
> + * It exits after either:
> + * * completing loading the remaining device state and device config, OR:
> + * * encountering some error while doing the above, OR:
> + * * being forcefully aborted by the migration core by it setting should_quit
> + *   or by vfio_load_cleanup_load_bufs_thread() setting
> + *   multifd->load_bufs_thread_want_exit.
> + */
> +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIOMultifd *multifd = migration->multifd;
> +    bool ret = true;
> +    int config_ret;
> +
> +    assert(multifd);
> +    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
> +
> +    assert(multifd->load_bufs_thread_running);
> +
> +    while (true) {
> +        VFIOStateBuffer *lb;
> +
> +        /*
> +         * Always check cancellation first after the buffer_ready wait below in
> +         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
> +         */
> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
> +            error_setg(errp, "operation cancelled");
> +            ret = false;
> +            goto ret_signal;

IIUC, if vfio_load_bufs_thread_want_exit() returns true, it means that 
some other code part already failed and set migration error, no?
If so, shouldn't we return true here? After all, vfio_load_bufs_thread 
didn't really fail, it just got signal to terminate itself.

> +        }
> +
> +        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
> +
> +        lb = vfio_load_state_buffer_get(multifd);
> +        if (!lb) {
> +            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
> +                                                        multifd->load_buf_idx);
> +            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
> +                           &multifd->load_bufs_mutex);
> +            continue;
> +        }
> +
> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
> +            break;
> +        }
> +
> +        if (multifd->load_buf_idx == 0) {
> +            trace_vfio_load_state_device_buffer_start(vbasedev->name);
> +        }
> +
> +        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
> +            ret = false;
> +            goto ret_signal;
> +        }
> +
> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
> +            trace_vfio_load_state_device_buffer_end(vbasedev->name);
> +        }
> +
> +        multifd->load_buf_idx++;
> +    }
> +
> +    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
> +    if (config_ret) {
> +        error_setg(errp, "load config state failed: %d", config_ret);

Let's add vbasedev->name to the error message so we know which device 
caused the error.

> +        ret = false;
> +    }
> +
> +ret_signal:
> +    /*
> +     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
> +     * this thread is exiting.
> +     */
> +    multifd->load_bufs_thread_running = false;
> +    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
> +
> +    return ret;
> +}
> +
>   VFIOMultifd *vfio_multifd_new(void)
>   {
>       VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
> @@ -191,11 +365,42 @@ VFIOMultifd *vfio_multifd_new(void)
>       multifd->load_buf_idx_last = UINT32_MAX;
>       qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
>
> +    multifd->load_bufs_thread_running = false;
> +    multifd->load_bufs_thread_want_exit = false;
> +    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
> +
>       return multifd;
>   }
>
> +/*
> + * Terminates vfio_load_bufs_thread by setting
> + * multifd->load_bufs_thread_want_exit and signalling all the conditions
> + * the thread could be blocked on.
> + *
> + * Waits for the thread to signal that it had finished.
> + */
> +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
> +{
> +    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
> +    bql_unlock();
> +    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
> +        while (multifd->load_bufs_thread_running) {
> +            multifd->load_bufs_thread_want_exit = true;
> +
> +            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
> +            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
> +                           &multifd->load_bufs_mutex);
> +        }
> +    }
> +    bql_lock();
> +}
> +
>   void vfio_multifd_free(VFIOMultifd *multifd)
>   {
> +    vfio_load_cleanup_load_bufs_thread(multifd);
> +
> +    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
> +    vfio_state_buffers_destroy(&multifd->load_bufs);

vfio_state_buffers_destroy(&multifd->load_bufs); belongs to patch #26, no?

Thanks.

>       qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
>       qemu_mutex_destroy(&multifd->load_bufs_mutex);
>
> @@ -225,3 +430,23 @@ bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp)
>
>       return true;
>   }
> +
> +int vfio_multifd_switchover_start(VFIODevice *vbasedev)
> +{
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIOMultifd *multifd = migration->multifd;
> +
> +    assert(multifd);
> +
> +    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
> +    bql_unlock();
> +    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
> +        assert(!multifd->load_bufs_thread_running);
> +        multifd->load_bufs_thread_running = true;
> +    }
> +    bql_lock();
> +
> +    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);
> +
> +    return 0;
> +}
> diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
> index d5ab7d6f85f5..09cbb437d9d1 100644
> --- a/hw/vfio/migration-multifd.h
> +++ b/hw/vfio/migration-multifd.h
> @@ -25,4 +25,6 @@ bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp);
>   bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>                               Error **errp);
>
> +int vfio_multifd_switchover_start(VFIODevice *vbasedev);
> +
>   #endif
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index abaf4d08d4a9..85f54cb22df2 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -793,6 +793,17 @@ static bool vfio_switchover_ack_needed(void *opaque)
>       return vfio_precopy_supported(vbasedev);
>   }
>
> +static int vfio_switchover_start(void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +
> +    if (vfio_multifd_transfer_enabled(vbasedev)) {
> +        return vfio_multifd_switchover_start(vbasedev);
> +    }
> +
> +    return 0;
> +}
> +
>   static const SaveVMHandlers savevm_vfio_handlers = {
>       .save_prepare = vfio_save_prepare,
>       .save_setup = vfio_save_setup,
> @@ -808,6 +819,7 @@ static const SaveVMHandlers savevm_vfio_handlers = {
>       .load_state = vfio_load_state,
>       .load_state_buffer = vfio_load_state_buffer,
>       .switchover_ack_needed = vfio_switchover_ack_needed,
> +    .switchover_start = vfio_switchover_start,
>   };
>
>   /* ---------------------------------------------------------------------- */
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 042a3dc54a33..418b378ebd29 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -154,6 +154,11 @@ vfio_load_device_config_state_end(const char *name) " (%s)"
>   vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
>   vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
>   vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_start(const char *name) " (%s)"
> +vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_end(const char *name) " (%s)"
>   vfio_migration_realize(const char *name) " (%s)"
>   vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
>   vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Maciej S. Szmigiero 1 month ago
On 2.03.2025 15:15, Avihai Horon wrote:
> 
> On 19/02/2025 22:34, Maciej S. Szmigiero wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> 
> Maybe add a sentence talking about the load thread itself first? E.g.:
> 
> Add a thread which loads the VFIO device state buffers that were received and via multifd.
> Each VFIO device that has multifd device state transfer enabled has one such thread, which is created using migration core API qemu_loadvm_start_load_thread().
> 
> Since it's important to finish...

Added such leading text to the commit message for this patch.

>> Since it's important to finish loading device state transferred via the
>> main migration channel (via save_live_iterate SaveVMHandler) before
>> starting loading the data asynchronously transferred via multifd the thread
>> doing the actual loading of the multifd transferred data is only started
>> from switchover_start SaveVMHandler.
>>
>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
>>
>> This sub-command is only sent after all save_live_iterate data have already
>> been posted so it is safe to commence loading of the multifd-transferred
>> device state upon receiving it - loading of save_live_iterate data happens
>> synchronously in the main migration thread (much like the processing of
>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>> processed all the proceeding data must have already been loaded.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>>   hw/vfio/migration-multifd.h |   2 +
>>   hw/vfio/migration.c         |  12 ++
>>   hw/vfio/trace-events        |   5 +
>>   4 files changed, 244 insertions(+)
>>
>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>> index 5d5ee1393674..b3a88c062769 100644
>> --- a/hw/vfio/migration-multifd.c
>> +++ b/hw/vfio/migration-multifd.c
>> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>>   } VFIOStateBuffer;
>>
>>   typedef struct VFIOMultifd {
>> +    QemuThread load_bufs_thread;
> 
> This can be dropped.

Yeah - it was a leftover from the pre-load-thread days of v2.

Dropped now.

>> +    bool load_bufs_thread_running;
>> +    bool load_bufs_thread_want_exit;
>> +
>>       VFIOStateBuffers load_bufs;
>>       QemuCond load_bufs_buffer_ready_cond;
>> +    QemuCond load_bufs_thread_finished_cond;
>>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>       uint32_t load_buf_idx;
>>       uint32_t load_buf_idx_last;
>> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>>       return true;
>>   }
>>
>> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
>> +{
>> +    return -EINVAL;
>> +}
>> +
>> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
>> +{
>> +    VFIOStateBuffer *lb;
>> +    guint bufs_len;
>> +
>> +    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
>> +    if (multifd->load_buf_idx >= bufs_len) {
>> +        assert(multifd->load_buf_idx == bufs_len);
>> +        return NULL;
>> +    }
>> +
>> +    lb = vfio_state_buffers_at(&multifd->load_bufs,
>> +                               multifd->load_buf_idx);
>> +    if (!lb->is_present) {
>> +        return NULL;
>> +    }
>> +
>> +    return lb;
>> +}
>> +
>> +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
>> +                                         VFIOStateBuffer *lb,
>> +                                         Error **errp)
>> +{
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    VFIOMultifd *multifd = migration->multifd;
>> +    g_autofree char *buf = NULL;
>> +    char *buf_cur;
>> +    size_t buf_len;
>> +
>> +    if (!lb->len) {
>> +        return true;
>> +    }
>> +
>> +    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
>> +                                                   multifd->load_buf_idx);
>> +
>> +    /* lb might become re-allocated when we drop the lock */
>> +    buf = g_steal_pointer(&lb->data);
>> +    buf_cur = buf;
>> +    buf_len = lb->len;
>> +    while (buf_len > 0) {
>> +        ssize_t wr_ret;
>> +        int errno_save;
>> +
>> +        /*
>> +         * Loading data to the device takes a while,
>> +         * drop the lock during this process.
>> +         */
>> +        qemu_mutex_unlock(&multifd->load_bufs_mutex);
>> +        wr_ret = write(migration->data_fd, buf_cur, buf_len);
>> +        errno_save = errno;
>> +        qemu_mutex_lock(&multifd->load_bufs_mutex);
>> +
>> +        if (wr_ret < 0) {
>> +            error_setg(errp,
>> +                       "writing state buffer %" PRIu32 " failed: %d",
>> +                       multifd->load_buf_idx, errno_save);
> 
> Let's add vbasedev->name to the error message so we know which device caused the error.

Done.

>> +            return false;
>> +        }
>> +
>> +        assert(wr_ret <= buf_len);
> 
> I think this assert is redundant: we write buf_len bytes and by definition of write() wr_ret will be <= buf_len.

It's for catching the case where the "definition" for some reason does not match
reality, since that would result in reading well past the buffer.

That's why it's an assert, not an error return.

>> +        buf_len -= wr_ret;
>> +        buf_cur += wr_ret;
>> +    }
>> +
>> +    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
>> +                                                 multifd->load_buf_idx);
>> +
>> +    return true;
>> +}
>> +
>> +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
>> +                                            bool *should_quit)
>> +{
>> +    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
>> +}
>> +
>> +/*
>> + * This thread is spawned by vfio_multifd_switchover_start() which gets
>> + * called upon encountering the switchover point marker in main migration
>> + * stream.
>> + *
>> + * It exits after either:
>> + * * completing loading the remaining device state and device config, OR:
>> + * * encountering some error while doing the above, OR:
>> + * * being forcefully aborted by the migration core by it setting should_quit
>> + *   or by vfio_load_cleanup_load_bufs_thread() setting
>> + *   multifd->load_bufs_thread_want_exit.
>> + */
>> +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    VFIOMultifd *multifd = migration->multifd;
>> +    bool ret = true;
>> +    int config_ret;
>> +
>> +    assert(multifd);
>> +    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>> +
>> +    assert(multifd->load_bufs_thread_running);
>> +
>> +    while (true) {
>> +        VFIOStateBuffer *lb;
>> +
>> +        /*
>> +         * Always check cancellation first after the buffer_ready wait below in
>> +         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
>> +         */
>> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
>> +            error_setg(errp, "operation cancelled");
>> +            ret = false;
>> +            goto ret_signal;
> 
> IIUC, if vfio_load_bufs_thread_want_exit() returns true, it means that some other code part already failed and set migration error, no?
> If so, shouldn't we return true here? After all, vfio_load_bufs_thread didn't really fail, it just got signal to terminate itself.

The thread didn't succeed with loading all the data either, but got cancelled.

It's a similar logic as Glib's GIO returning G_IO_ERROR_CANCELLED if the operation
got cancelled.

In a GTask a pending cancellation will even overwrite any other error or value
that the task tried to return (at least by default).

>> +        }
>> +
>> +        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
>> +
>> +        lb = vfio_load_state_buffer_get(multifd);
>> +        if (!lb) {
>> +            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
>> +                                                        multifd->load_buf_idx);
>> +            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
>> +                           &multifd->load_bufs_mutex);
>> +            continue;
>> +        }
>> +
>> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
>> +            break;
>> +        }
>> +
>> +        if (multifd->load_buf_idx == 0) {
>> +            trace_vfio_load_state_device_buffer_start(vbasedev->name);
>> +        }
>> +
>> +        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
>> +            ret = false;
>> +            goto ret_signal;
>> +        }
>> +
>> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
>> +            trace_vfio_load_state_device_buffer_end(vbasedev->name);
>> +        }
>> +
>> +        multifd->load_buf_idx++;
>> +    }
>> +
>> +    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
>> +    if (config_ret) {
>> +        error_setg(errp, "load config state failed: %d", config_ret);
> 
> Let's add vbasedev->name to the error message so we know which device caused the error.

This line is not present anymore in the current version of the code,
but applied such change to all error_setg() calls in vfio_load_bufs_thread_load_config()
instead.

>> +        ret = false;
>> +    }
>> +
>> +ret_signal:
>> +    /*
>> +     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
>> +     * this thread is exiting.
>> +     */
>> +    multifd->load_bufs_thread_running = false;
>> +    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
>> +
>> +    return ret;
>> +}
>> +
>>   VFIOMultifd *vfio_multifd_new(void)
>>   {
>>       VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
>> @@ -191,11 +365,42 @@ VFIOMultifd *vfio_multifd_new(void)
>>       multifd->load_buf_idx_last = UINT32_MAX;
>>       qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
>>
>> +    multifd->load_bufs_thread_running = false;
>> +    multifd->load_bufs_thread_want_exit = false;
>> +    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
>> +
>>       return multifd;
>>   }
>>
>> +/*
>> + * Terminates vfio_load_bufs_thread by setting
>> + * multifd->load_bufs_thread_want_exit and signalling all the conditions
>> + * the thread could be blocked on.
>> + *
>> + * Waits for the thread to signal that it had finished.
>> + */
>> +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
>> +{
>> +    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
>> +    bql_unlock();
>> +    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
>> +        while (multifd->load_bufs_thread_running) {
>> +            multifd->load_bufs_thread_want_exit = true;
>> +
>> +            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
>> +            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
>> +                           &multifd->load_bufs_mutex);
>> +        }
>> +    }
>> +    bql_lock();
>> +}
>> +
>>   void vfio_multifd_free(VFIOMultifd *multifd)
>>   {
>> +    vfio_load_cleanup_load_bufs_thread(multifd);
>> +
>> +    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
>> +    vfio_state_buffers_destroy(&multifd->load_bufs);
> 
> vfio_state_buffers_destroy(&multifd->load_bufs); belongs to patch #26, no?

Yeah - moved it there.
  
> Thanks.

Thanks,
Maciej


Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Avihai Horon 1 month ago
On 04/03/2025 0:16, Maciej S. Szmigiero wrote:
> External email: Use caution opening links or attachments
>
>
> On 2.03.2025 15:15, Avihai Horon wrote:
>>
>> On 19/02/2025 22:34, Maciej S. Szmigiero wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> Maybe add a sentence talking about the load thread itself first? E.g.:
>>
>> Add a thread which loads the VFIO device state buffers that were 
>> received and via multifd.
>> Each VFIO device that has multifd device state transfer enabled has 
>> one such thread, which is created using migration core API 
>> qemu_loadvm_start_load_thread().
>>
>> Since it's important to finish...
>
> Added such leading text to the commit message for this patch.
>
>>> Since it's important to finish loading device state transferred via the
>>> main migration channel (via save_live_iterate SaveVMHandler) before
>>> starting loading the data asynchronously transferred via multifd the 
>>> thread
>>> doing the actual loading of the multifd transferred data is only 
>>> started
>>> from switchover_start SaveVMHandler.
>>>
>>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>>> sub-command of QEMU_VM_COMMAND is received via the main migration 
>>> channel.
>>>
>>> This sub-command is only sent after all save_live_iterate data have 
>>> already
>>> been posted so it is safe to commence loading of the 
>>> multifd-transferred
>>> device state upon receiving it - loading of save_live_iterate data 
>>> happens
>>> synchronously in the main migration thread (much like the processing of
>>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>>> processed all the proceeding data must have already been loaded.
>>>
>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>> ---
>>>   hw/vfio/migration-multifd.c | 225 
>>> ++++++++++++++++++++++++++++++++++++
>>>   hw/vfio/migration-multifd.h |   2 +
>>>   hw/vfio/migration.c         |  12 ++
>>>   hw/vfio/trace-events        |   5 +
>>>   4 files changed, 244 insertions(+)
>>>
>>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>>> index 5d5ee1393674..b3a88c062769 100644
>>> --- a/hw/vfio/migration-multifd.c
>>> +++ b/hw/vfio/migration-multifd.c
>>> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>>>   } VFIOStateBuffer;
>>>
>>>   typedef struct VFIOMultifd {
>>> +    QemuThread load_bufs_thread;
>>
>> This can be dropped.
>
> Yeah - it was a remainder from pre-load-thread days of v2.
>
> Dropped now.
>
>>> +    bool load_bufs_thread_running;
>>> +    bool load_bufs_thread_want_exit;
>>> +
>>>       VFIOStateBuffers load_bufs;
>>>       QemuCond load_bufs_buffer_ready_cond;
>>> +    QemuCond load_bufs_thread_finished_cond;
>>>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>>       uint32_t load_buf_idx;
>>>       uint32_t load_buf_idx_last;
>>> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char 
>>> *data, size_t data_size,
>>>       return true;
>>>   }
>>>
>>> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
>>> +{
>>> +    return -EINVAL;
>>> +}
>>> +
>>> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd 
>>> *multifd)
>>> +{
>>> +    VFIOStateBuffer *lb;
>>> +    guint bufs_len;
>>> +
>>> +    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
>>> +    if (multifd->load_buf_idx >= bufs_len) {
>>> +        assert(multifd->load_buf_idx == bufs_len);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    lb = vfio_state_buffers_at(&multifd->load_bufs,
>>> +                               multifd->load_buf_idx);
>>> +    if (!lb->is_present) {
>>> +        return NULL;
>>> +    }
>>> +
>>> +    return lb;
>>> +}
>>> +
>>> +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
>>> +                                         VFIOStateBuffer *lb,
>>> +                                         Error **errp)
>>> +{
>>> +    VFIOMigration *migration = vbasedev->migration;
>>> +    VFIOMultifd *multifd = migration->multifd;
>>> +    g_autofree char *buf = NULL;
>>> +    char *buf_cur;
>>> +    size_t buf_len;
>>> +
>>> +    if (!lb->len) {
>>> +        return true;
>>> +    }
>>> +
>>> + trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
>>> + multifd->load_buf_idx);
>>> +
>>> +    /* lb might become re-allocated when we drop the lock */
>>> +    buf = g_steal_pointer(&lb->data);
>>> +    buf_cur = buf;
>>> +    buf_len = lb->len;
>>> +    while (buf_len > 0) {
>>> +        ssize_t wr_ret;
>>> +        int errno_save;
>>> +
>>> +        /*
>>> +         * Loading data to the device takes a while,
>>> +         * drop the lock during this process.
>>> +         */
>>> +        qemu_mutex_unlock(&multifd->load_bufs_mutex);
>>> +        wr_ret = write(migration->data_fd, buf_cur, buf_len);
>>> +        errno_save = errno;
>>> +        qemu_mutex_lock(&multifd->load_bufs_mutex);
>>> +
>>> +        if (wr_ret < 0) {
>>> +            error_setg(errp,
>>> +                       "writing state buffer %" PRIu32 " failed: %d",
>>> +                       multifd->load_buf_idx, errno_save);
>>
>> Let's add vbasedev->name to the error message so we know which device 
>> caused the error.
>
> Done.
>
>>> +            return false;
>>> +        }
>>> +
>>> +        assert(wr_ret <= buf_len);
>>
>> I think this assert is redundant: we write buf_len bytes and by 
>> definition of write() wr_ret will be <= buf_len.
>
> It's for catching when the "definition" for some reason does not match 
> reality
> since this would result in a reading well past the buffer.
>
> That's why it's an assert, not an error return.

Yes, but it's highly unlikely that write() will not match reality.
But that's a minor issue, so whatever you prefer.

>
>>> +        buf_len -= wr_ret;
>>> +        buf_cur += wr_ret;
>>> +    }
>>> +
>>> + trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
>>> + multifd->load_buf_idx);
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
>>> +                                            bool *should_quit)
>>> +{
>>> +    return multifd->load_bufs_thread_want_exit || 
>>> qatomic_read(should_quit);
>>> +}
>>> +
>>> +/*
>>> + * This thread is spawned by vfio_multifd_switchover_start() which 
>>> gets
>>> + * called upon encountering the switchover point marker in main 
>>> migration
>>> + * stream.
>>> + *
>>> + * It exits after either:
>>> + * * completing loading the remaining device state and device 
>>> config, OR:
>>> + * * encountering some error while doing the above, OR:
>>> + * * being forcefully aborted by the migration core by it setting 
>>> should_quit
>>> + *   or by vfio_load_cleanup_load_bufs_thread() setting
>>> + *   multifd->load_bufs_thread_want_exit.
>>> + */
>>> +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, 
>>> Error **errp)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +    VFIOMigration *migration = vbasedev->migration;
>>> +    VFIOMultifd *multifd = migration->multifd;
>>> +    bool ret = true;
>>> +    int config_ret;
>>> +
>>> +    assert(multifd);
>>> +    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>>> +
>>> +    assert(multifd->load_bufs_thread_running);
>>> +
>>> +    while (true) {
>>> +        VFIOStateBuffer *lb;
>>> +
>>> +        /*
>>> +         * Always check cancellation first after the buffer_ready 
>>> wait below in
>>> +         * case that cond was signalled by 
>>> vfio_load_cleanup_load_bufs_thread().
>>> +         */
>>> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
>>> +            error_setg(errp, "operation cancelled");
>>> +            ret = false;
>>> +            goto ret_signal;
>>
>> IIUC, if vfio_load_bufs_thread_want_exit() returns true, it means 
>> that some other code part already failed and set migration error, no?
>> If so, shouldn't we return true here? After all, 
>> vfio_load_bufs_thread didn't really fail, it just got signal to 
>> terminate itself.
>
> The thread didn't succeed with loading all the data either, but got 
> cancelled.
>
> It's a similar logic as Glib's GIO returning G_IO_ERROR_CANCELLED if 
> the operation
> got cancelled.
>
> In a GTask a pending cancellation will even overwrite any other error 
> or value
> that the task tried to return (at least by default).

Ah I see.
I was looking at multifd_{send,recv}_thread and there they don't set an 
error if cancelled.

Anyway, what confused me is that we set an error here only so 
qemu_loadvm_load_thread() will try to migrate_set_error(), but that 
won't work because a migration error is already expected to be present.
If that's indeed so, then to me it looks a bit redundant to set this 
error here.

Thanks.


Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Cédric Le Goater 1 month ago
On 2/19/25 21:34, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
> 
> Since it's important to finish loading device state transferred via the
> main migration channel (via save_live_iterate SaveVMHandler) before
> starting loading the data asynchronously transferred via multifd the thread
> doing the actual loading of the multifd transferred data is only started
> from switchover_start SaveVMHandler.
> 
> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
> 
> This sub-command is only sent after all save_live_iterate data have already
> been posted so it is safe to commence loading of the multifd-transferred
> device state upon receiving it - loading of save_live_iterate data happens
> synchronously in the main migration thread (much like the processing of
> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
> processed all the proceeding data must have already been loaded.
> 
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>   hw/vfio/migration-multifd.h |   2 +
>   hw/vfio/migration.c         |  12 ++
>   hw/vfio/trace-events        |   5 +
>   4 files changed, 244 insertions(+)
> 
> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
> index 5d5ee1393674..b3a88c062769 100644
> --- a/hw/vfio/migration-multifd.c
> +++ b/hw/vfio/migration-multifd.c
> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>   } VFIOStateBuffer;
>   
>   typedef struct VFIOMultifd {
> +    QemuThread load_bufs_thread;
> +    bool load_bufs_thread_running;
> +    bool load_bufs_thread_want_exit;
> +
>       VFIOStateBuffers load_bufs;
>       QemuCond load_bufs_buffer_ready_cond;
> +    QemuCond load_bufs_thread_finished_cond;
>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>       uint32_t load_buf_idx;
>       uint32_t load_buf_idx_last;
> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>       return true;
>   }
>   
> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
> +{
> +    return -EINVAL;
> +}


please move to next patch.

> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
> +{
> +    VFIOStateBuffer *lb;
> +    guint bufs_len;

guint:  I guess it's ok to use here. It is not common practice in VFIO.

> +
> +    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
> +    if (multifd->load_buf_idx >= bufs_len) {
> +        assert(multifd->load_buf_idx == bufs_len);
> +        return NULL;
> +    }
> +
> +    lb = vfio_state_buffers_at(&multifd->load_bufs,
> +                               multifd->load_buf_idx);

Could be one line. minor.

> +    if (!lb->is_present) {
> +        return NULL;
> +    }
> +
> +    return lb;
> +}
> +
> +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
> +                                         VFIOStateBuffer *lb,
> +                                         Error **errp)
> +{
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIOMultifd *multifd = migration->multifd;
> +    g_autofree char *buf = NULL;
> +    char *buf_cur;
> +    size_t buf_len;
> +
> +    if (!lb->len) {
> +        return true;
> +    }
> +
> +    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
> +                                                   multifd->load_buf_idx);

I think we can move this trace event to vfio_load_bufs_thread()

> +    /* lb might become re-allocated when we drop the lock */
> +    buf = g_steal_pointer(&lb->data);
> +    buf_cur = buf;
> +    buf_len = lb->len;
> +    while (buf_len > 0) {
> +        ssize_t wr_ret;
> +        int errno_save;
> +
> +        /*
> +         * Loading data to the device takes a while,
> +         * drop the lock during this process.
> +         */
> +        qemu_mutex_unlock(&multifd->load_bufs_mutex);
> +        wr_ret = write(migration->data_fd, buf_cur, buf_len);
> +        errno_save = errno;
> +        qemu_mutex_lock(&multifd->load_bufs_mutex);
> +
> +        if (wr_ret < 0) {
> +            error_setg(errp,
> +                       "writing state buffer %" PRIu32 " failed: %d",
> +                       multifd->load_buf_idx, errno_save);
> +            return false;
> +        }
> +
> +        assert(wr_ret <= buf_len);
> +        buf_len -= wr_ret;
> +        buf_cur += wr_ret;
> +    }
> +
> +    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
> +                                                 multifd->load_buf_idx);

and drop this trace event.

In which case, we can modify the parameters of vfio_load_state_buffer_write()
to use directly a 'VFIOMultifd *multifd' and an fd instead of "migration->data_fd".

> +
> +    return true;
> +}
> +
> +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
> +                                            bool *should_quit)
> +{
> +    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
> +}
> +
> +/*
> + * This thread is spawned by vfio_multifd_switchover_start() which gets
> + * called upon encountering the switchover point marker in main migration
> + * stream.
> + *
> + * It exits after either:
> + * * completing loading the remaining device state and device config, OR:
> + * * encountering some error while doing the above, OR:
> + * * being forcefully aborted by the migration core by it setting should_quit
> + *   or by vfio_load_cleanup_load_bufs_thread() setting
> + *   multifd->load_bufs_thread_want_exit.
> + */
> +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIOMultifd *multifd = migration->multifd;
> +    bool ret = true;
> +    int config_ret;

Not needed IMO, see below.

> +
> +    assert(multifd);
> +    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
> +
> +    assert(multifd->load_bufs_thread_running);

We could add a trace event for the start and the end of the thread.

> +    while (true) {
> +        VFIOStateBuffer *lb;
> +
> +        /*
> +         * Always check cancellation first after the buffer_ready wait below in
> +         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
> +         */
> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
> +            error_setg(errp, "operation cancelled");
> +            ret = false;
> +            goto ret_signal;

goto thread_exit ?

> +        }
> +
> +        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
> +
> +        lb = vfio_load_state_buffer_get(multifd);
> +        if (!lb) {
> +            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
> +                                                        multifd->load_buf_idx);
> +            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
> +                           &multifd->load_bufs_mutex);
> +            continue;
> +        }
> +
> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
> +            break;
> +        }
> +
> +        if (multifd->load_buf_idx == 0) {
> +            trace_vfio_load_state_device_buffer_start(vbasedev->name);
> +        }
> +
> +        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
> +            ret = false;
> +            goto ret_signal;
> +        }
> +
> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
> +            trace_vfio_load_state_device_buffer_end(vbasedev->name);
> +        }
> +
> +        multifd->load_buf_idx++;
> +    }

if ret is assigned to true here, the "ret = false" can be dropped

> +    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
> +    if (config_ret) {
> +        error_setg(errp, "load config state failed: %d", config_ret);
> +        ret = false;
> +    }

please move to next patch. This is adding nothing to this patch
since it's returning -EINVAL.


Thanks,

C.



> +ret_signal:
> +    /*
> +     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
> +     * this thread is exiting.
> +     */
> +    multifd->load_bufs_thread_running = false;
> +    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
> +
> +    return ret;
> +}
> +
>   VFIOMultifd *vfio_multifd_new(void)
>   {
>       VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
> @@ -191,11 +365,42 @@ VFIOMultifd *vfio_multifd_new(void)
>       multifd->load_buf_idx_last = UINT32_MAX;
>       qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
>   
> +    multifd->load_bufs_thread_running = false;
> +    multifd->load_bufs_thread_want_exit = false;
> +    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
> +
>       return multifd;
>   }
>   
> +/*
> + * Terminates vfio_load_bufs_thread by setting
> + * multifd->load_bufs_thread_want_exit and signalling all the conditions
> + * the thread could be blocked on.
> + *
> + * Waits for the thread to signal that it had finished.
> + */
> +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
> +{
> +    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
> +    bql_unlock();
> +    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
> +        while (multifd->load_bufs_thread_running) {
> +            multifd->load_bufs_thread_want_exit = true;
> +
> +            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
> +            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
> +                           &multifd->load_bufs_mutex);
> +        }
> +    }
> +    bql_lock();
> +}
> +
>   void vfio_multifd_free(VFIOMultifd *multifd)
>   {
> +    vfio_load_cleanup_load_bufs_thread(multifd);
> +
> +    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
> +    vfio_state_buffers_destroy(&multifd->load_bufs);
>       qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
>       qemu_mutex_destroy(&multifd->load_bufs_mutex);
>   
> @@ -225,3 +430,23 @@ bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp)
>   
>       return true;
>   }
> +
> +int vfio_multifd_switchover_start(VFIODevice *vbasedev)
> +{
> +    VFIOMigration *migration = vbasedev->migration;
> +    VFIOMultifd *multifd = migration->multifd;
> +
> +    assert(multifd);
> +
> +    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
> +    bql_unlock();
> +    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
> +        assert(!multifd->load_bufs_thread_running);
> +        multifd->load_bufs_thread_running = true;
> +    }
> +    bql_lock();
> +
> +    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);
> +
> +    return 0;
> +}
> diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
> index d5ab7d6f85f5..09cbb437d9d1 100644
> --- a/hw/vfio/migration-multifd.h
> +++ b/hw/vfio/migration-multifd.h
> @@ -25,4 +25,6 @@ bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp);
>   bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>                               Error **errp);
>   
> +int vfio_multifd_switchover_start(VFIODevice *vbasedev);
> +
>   #endif
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index abaf4d08d4a9..85f54cb22df2 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -793,6 +793,17 @@ static bool vfio_switchover_ack_needed(void *opaque)
>       return vfio_precopy_supported(vbasedev);
>   }
>   
> +static int vfio_switchover_start(void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +
> +    if (vfio_multifd_transfer_enabled(vbasedev)) {
> +        return vfio_multifd_switchover_start(vbasedev);
> +    }
> +
> +    return 0;
> +}
> +
>   static const SaveVMHandlers savevm_vfio_handlers = {
>       .save_prepare = vfio_save_prepare,
>       .save_setup = vfio_save_setup,
> @@ -808,6 +819,7 @@ static const SaveVMHandlers savevm_vfio_handlers = {
>       .load_state = vfio_load_state,
>       .load_state_buffer = vfio_load_state_buffer,
>       .switchover_ack_needed = vfio_switchover_ack_needed,
> +    .switchover_start = vfio_switchover_start,
>   };
>   
>   /* ---------------------------------------------------------------------- */
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 042a3dc54a33..418b378ebd29 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -154,6 +154,11 @@ vfio_load_device_config_state_end(const char *name) " (%s)"
>   vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
>   vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
>   vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_start(const char *name) " (%s)"
> +vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> +vfio_load_state_device_buffer_end(const char *name) " (%s)"
>   vfio_migration_realize(const char *name) " (%s)"
>   vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
>   vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
>
Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Avihai Horon 1 month ago
On 26/02/2025 15:49, Cédric Le Goater wrote:
> External email: Use caution opening links or attachments
>
>
> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> Since it's important to finish loading device state transferred via the
>> main migration channel (via save_live_iterate SaveVMHandler) before
>> starting loading the data asynchronously transferred via multifd the 
>> thread
>> doing the actual loading of the multifd transferred data is only started
>> from switchover_start SaveVMHandler.
>>
>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>> sub-command of QEMU_VM_COMMAND is received via the main migration 
>> channel.
>>
>> This sub-command is only sent after all save_live_iterate data have 
>> already
>> been posted so it is safe to commence loading of the multifd-transferred
>> device state upon receiving it - loading of save_live_iterate data 
>> happens
>> synchronously in the main migration thread (much like the processing of
>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>> processed all the proceeding data must have already been loaded.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>>   hw/vfio/migration-multifd.h |   2 +
>>   hw/vfio/migration.c         |  12 ++
>>   hw/vfio/trace-events        |   5 +
>>   4 files changed, 244 insertions(+)
>>
>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>> index 5d5ee1393674..b3a88c062769 100644
>> --- a/hw/vfio/migration-multifd.c
>> +++ b/hw/vfio/migration-multifd.c
>> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>>   } VFIOStateBuffer;
>>
>>   typedef struct VFIOMultifd {
>> +    QemuThread load_bufs_thread;
>> +    bool load_bufs_thread_running;
>> +    bool load_bufs_thread_want_exit;
>> +
>>       VFIOStateBuffers load_bufs;
>>       QemuCond load_bufs_buffer_ready_cond;
>> +    QemuCond load_bufs_thread_finished_cond;
>>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>       uint32_t load_buf_idx;
>>       uint32_t load_buf_idx_last;
>> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char 
>> *data, size_t data_size,
>>       return true;
>>   }
>>
>> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
>> +{
>> +    return -EINVAL;
>> +}
>
>
> please move to next patch.
>
>> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd 
>> *multifd)
>> +{
>> +    VFIOStateBuffer *lb;
>> +    guint bufs_len;
>
> guint:  I guess it's ok to use here. It is not common practice in VFIO.

Glib documentation says that in new code unsigned int is preferred over 
guint [1].

Thanks.

[1] https://docs.gtk.org/glib/types.html#guint


Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Maciej S. Szmigiero 1 month ago
On 2.03.2025 15:19, Avihai Horon wrote:
> 
> On 26/02/2025 15:49, Cédric Le Goater wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>
>>> Since it's important to finish loading device state transferred via the
>>> main migration channel (via save_live_iterate SaveVMHandler) before
>>> starting loading the data asynchronously transferred via multifd the thread
>>> doing the actual loading of the multifd transferred data is only started
>>> from switchover_start SaveVMHandler.
>>>
>>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>>> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
>>>
>>> This sub-command is only sent after all save_live_iterate data have already
>>> been posted so it is safe to commence loading of the multifd-transferred
>>> device state upon receiving it - loading of save_live_iterate data happens
>>> synchronously in the main migration thread (much like the processing of
>>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>>> processed all the preceding data must have already been loaded.
>>>
>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>> ---
>>>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>>>   hw/vfio/migration-multifd.h |   2 +
>>>   hw/vfio/migration.c         |  12 ++
>>>   hw/vfio/trace-events        |   5 +
>>>   4 files changed, 244 insertions(+)
>>>
>>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>>> index 5d5ee1393674..b3a88c062769 100644
>>> --- a/hw/vfio/migration-multifd.c
>>> +++ b/hw/vfio/migration-multifd.c
>>> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>>>   } VFIOStateBuffer;
>>>
>>>   typedef struct VFIOMultifd {
>>> +    QemuThread load_bufs_thread;
>>> +    bool load_bufs_thread_running;
>>> +    bool load_bufs_thread_want_exit;
>>> +
>>>       VFIOStateBuffers load_bufs;
>>>       QemuCond load_bufs_buffer_ready_cond;
>>> +    QemuCond load_bufs_thread_finished_cond;
>>>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>>       uint32_t load_buf_idx;
>>>       uint32_t load_buf_idx_last;
>>> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>>>       return true;
>>>   }
>>>
>>> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
>>> +{
>>> +    return -EINVAL;
>>> +}
>>
>>
>> please move to next patch.
>>
>>> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
>>> +{
>>> +    VFIOStateBuffer *lb;
>>> +    guint bufs_len;
>>
>> guint:  I guess it's ok to use here. It is not common practice in VFIO.
> 
> Glib documentation says that in new code unsigned int is preferred over guint [1].

I turned guints into unsigned ints where I spotted them in this patch set.
  
> Thanks.

Thanks,
Maciej

> [1] https://docs.gtk.org/glib/types.html#guint
> 


Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Maciej S. Szmigiero 1 month ago
On 26.02.2025 14:49, Cédric Le Goater wrote:
> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> Since it's important to finish loading device state transferred via the
>> main migration channel (via save_live_iterate SaveVMHandler) before
>> starting loading the data asynchronously transferred via multifd the thread
>> doing the actual loading of the multifd transferred data is only started
>> from switchover_start SaveVMHandler.
>>
>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
>>
>> This sub-command is only sent after all save_live_iterate data have already
>> been posted so it is safe to commence loading of the multifd-transferred
>> device state upon receiving it - loading of save_live_iterate data happens
>> synchronously in the main migration thread (much like the processing of
>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>> processed all the preceding data must have already been loaded.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>>   hw/vfio/migration-multifd.h |   2 +
>>   hw/vfio/migration.c         |  12 ++
>>   hw/vfio/trace-events        |   5 +
>>   4 files changed, 244 insertions(+)
>>
>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>> index 5d5ee1393674..b3a88c062769 100644
>> --- a/hw/vfio/migration-multifd.c
>> +++ b/hw/vfio/migration-multifd.c
>> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>>   } VFIOStateBuffer;
>>   typedef struct VFIOMultifd {
>> +    QemuThread load_bufs_thread;
>> +    bool load_bufs_thread_running;
>> +    bool load_bufs_thread_want_exit;
>> +
>>       VFIOStateBuffers load_bufs;
>>       QemuCond load_bufs_buffer_ready_cond;
>> +    QemuCond load_bufs_thread_finished_cond;
>>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>       uint32_t load_buf_idx;
>>       uint32_t load_buf_idx_last;
>> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>>       return true;
>>   }
>> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
>> +{
>> +    return -EINVAL;
>> +}
> 
> 
> please move to next patch.

As I wrote on the previous version of the patch set at
https://lore.kernel.org/qemu-devel/4f335de0-ba9f-4537-b230-2cf8af1c160b@maciej.szmigiero.name/:
> The dummy call has to be there, otherwise the code at the
> previous commit time wouldn't compile since that
> vfio_load_bufs_thread_load_config() call is a part of
> vfio_load_bufs_thread().
> 
> This is an artifact of splitting the whole load operation in
> multiple commits.

I think adding empty dummy implementations is the typical way
to do this - much like you asked today to leave
vfio_multifd_transfer_setup() returning true unconditionally
before being filled with true implementation in later patch.

See also my response at the end of this e-mail message, below
the call to vfio_load_bufs_thread_load_config().

>> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
>> +{
>> +    VFIOStateBuffer *lb;
>> +    guint bufs_len;
> 
> guint:  I guess it's ok to use here. It is not common practice in VFIO.
> 
>> +
>> +    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
>> +    if (multifd->load_buf_idx >= bufs_len) {
>> +        assert(multifd->load_buf_idx == bufs_len);
>> +        return NULL;
>> +    }
>> +
>> +    lb = vfio_state_buffers_at(&multifd->load_bufs,
>> +                               multifd->load_buf_idx);
> 
> Could be one line. minor.
> 
>> +    if (!lb->is_present) {
>> +        return NULL;
>> +    }
>> +
>> +    return lb;
>> +}
>> +
>> +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
>> +                                         VFIOStateBuffer *lb,
>> +                                         Error **errp)
>> +{
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    VFIOMultifd *multifd = migration->multifd;
>> +    g_autofree char *buf = NULL;
>> +    char *buf_cur;
>> +    size_t buf_len;
>> +
>> +    if (!lb->len) {
>> +        return true;
>> +    }
>> +
>> +    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
>> +                                                   multifd->load_buf_idx);
> 
> I think we can move this trace event to vfio_load_bufs_thread()

It would get messy since we don't load empty buffers,
so we don't print this trace point (and its _end sibling)
for empty buffers.

If we print this in vfio_load_bufs_thread() then it would
need to duplicate that !lb->len check.

>> +    /* lb might become re-allocated when we drop the lock */
>> +    buf = g_steal_pointer(&lb->data);
>> +    buf_cur = buf;
>> +    buf_len = lb->len;
>> +    while (buf_len > 0) {
>> +        ssize_t wr_ret;
>> +        int errno_save;
>> +
>> +        /*
>> +         * Loading data to the device takes a while,
>> +         * drop the lock during this process.
>> +         */
>> +        qemu_mutex_unlock(&multifd->load_bufs_mutex);
>> +        wr_ret = write(migration->data_fd, buf_cur, buf_len);
>> +        errno_save = errno;
>> +        qemu_mutex_lock(&multifd->load_bufs_mutex);
>> +
>> +        if (wr_ret < 0) {
>> +            error_setg(errp,
>> +                       "writing state buffer %" PRIu32 " failed: %d",
>> +                       multifd->load_buf_idx, errno_save);
>> +            return false;
>> +        }
>> +
>> +        assert(wr_ret <= buf_len);
>> +        buf_len -= wr_ret;
>> +        buf_cur += wr_ret;
>> +    }
>> +
>> +    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
>> +                                                 multifd->load_buf_idx);
> 
> and drop this trace event.

That's important data since it provides for how long it took to load that
buffer (_end - _start).

It's not the same information as _start(next buffer) - _start(current buffer)
since the next buffer might not have arrived yet so its loading won't
start immediately after the end of loading of the previous one.

> In which case, we can modify the parameters of vfio_load_state_buffer_write()
> to use directly a 'VFIOMultifd *multifd' and an fd instead of "migration->data_fd".
> 
>> +
>> +    return true;
>> +}
>> +
>> +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
>> +                                            bool *should_quit)
>> +{
>> +    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
>> +}
>> +
>> +/*
>> + * This thread is spawned by vfio_multifd_switchover_start() which gets
>> + * called upon encountering the switchover point marker in main migration
>> + * stream.
>> + *
>> + * It exits after either:
>> + * * completing loading the remaining device state and device config, OR:
>> + * * encountering some error while doing the above, OR:
>> + * * being forcefully aborted by the migration core by it setting should_quit
>> + *   or by vfio_load_cleanup_load_bufs_thread() setting
>> + *   multifd->load_bufs_thread_want_exit.
>> + */
>> +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    VFIOMultifd *multifd = migration->multifd;
>> +    bool ret = true;
>> +    int config_ret;
> 
> No needed IMO. see below.
> 
>> +
>> +    assert(multifd);
>> +    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>> +
>> +    assert(multifd->load_bufs_thread_running);
> 
> We could add a trace event for the start and the end of the thread.

Added vfio_load_bufs_thread_{start,end} trace events now.

>> +    while (true) {
>> +        VFIOStateBuffer *lb;
>> +
>> +        /*
>> +         * Always check cancellation first after the buffer_ready wait below in
>> +         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
>> +         */
>> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
>> +            error_setg(errp, "operation cancelled");
>> +            ret = false;
>> +            goto ret_signal;
> 
> goto thread_exit ?

I'm not sure that I fully understand this comment.
Do you mean to rename ret_signal label to thread_exit?

>> +        }
>> +
>> +        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
>> +
>> +        lb = vfio_load_state_buffer_get(multifd);
>> +        if (!lb) {
>> +            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
>> +                                                        multifd->load_buf_idx);
>> +            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
>> +                           &multifd->load_bufs_mutex);
>> +            continue;
>> +        }
>> +
>> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
>> +            break;
>> +        }
>> +
>> +        if (multifd->load_buf_idx == 0) {
>> +            trace_vfio_load_state_device_buffer_start(vbasedev->name);
>> +        }
>> +
>> +        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
>> +            ret = false;
>> +            goto ret_signal;
>> +        }
>> +
>> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
>> +            trace_vfio_load_state_device_buffer_end(vbasedev->name);
>> +        }
>> +
>> +        multifd->load_buf_idx++;
>> +    }
> 
> if ret is assigned to true here, the "ret = false" can be dropped

I inverted the "ret" logic here now - initialized ret to false
at definition, removed "ret = false" at every failure/early exit block
and added "ret = true" just before the "ret_signal" label.

>> +    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
>> +    if (config_ret) {
>> +        error_setg(errp, "load config state failed: %d", config_ret);
>> +        ret = false;
>> +    }
> 
> please move to next patch. This is adding nothing to this patch
> since it's returning -EINVAL.
> 

That's the whole point - if someone were to accidentally enable this
(for example by forgetting to apply the next patch when backporting
the series) it would fail safely with EINVAL instead of having a
half-broken implementation.

Another option would be to simply integrate the next patch into this
one as these are two parts of the same single operation and I think
splitting them in two in the end brings little value.

> Thanks,
> 
> C.

Thanks,
Maciej


Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Cédric Le Goater 1 month ago
On 2/26/25 22:05, Maciej S. Szmigiero wrote:
> On 26.02.2025 14:49, Cédric Le Goater wrote:
>> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>
>>> Since it's important to finish loading device state transferred via the
>>> main migration channel (via save_live_iterate SaveVMHandler) before
>>> starting loading the data asynchronously transferred via multifd the thread
>>> doing the actual loading of the multifd transferred data is only started
>>> from switchover_start SaveVMHandler.
>>>
>>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>>> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
>>>
>>> This sub-command is only sent after all save_live_iterate data have already
>>> been posted so it is safe to commence loading of the multifd-transferred
>>> device state upon receiving it - loading of save_live_iterate data happens
>>> synchronously in the main migration thread (much like the processing of
>>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>>> processed all the preceding data must have already been loaded.
>>>
>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>> ---
>>>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>>>   hw/vfio/migration-multifd.h |   2 +
>>>   hw/vfio/migration.c         |  12 ++
>>>   hw/vfio/trace-events        |   5 +
>>>   4 files changed, 244 insertions(+)
>>>
>>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>>> index 5d5ee1393674..b3a88c062769 100644
>>> --- a/hw/vfio/migration-multifd.c
>>> +++ b/hw/vfio/migration-multifd.c
>>> @@ -42,8 +42,13 @@ typedef struct VFIOStateBuffer {
>>>   } VFIOStateBuffer;
>>>   typedef struct VFIOMultifd {
>>> +    QemuThread load_bufs_thread;
>>> +    bool load_bufs_thread_running;
>>> +    bool load_bufs_thread_want_exit;
>>> +
>>>       VFIOStateBuffers load_bufs;
>>>       QemuCond load_bufs_buffer_ready_cond;
>>> +    QemuCond load_bufs_thread_finished_cond;
>>>       QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>>       uint32_t load_buf_idx;
>>>       uint32_t load_buf_idx_last;
>>> @@ -179,6 +184,175 @@ bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>>>       return true;
>>>   }
>>> +static int vfio_load_bufs_thread_load_config(VFIODevice *vbasedev)
>>> +{
>>> +    return -EINVAL;
>>> +}
>>
>>
>> please move to next patch.
> 
> As I wrote on the previous version of the patch set at
> https://lore.kernel.org/qemu-devel/4f335de0-ba9f-4537-b230-2cf8af1c160b@maciej.szmigiero.name/:
>> The dummy call has to be there, otherwise the code at the
>> previous commit time wouldn't compile since that
>> vfio_load_bufs_thread_load_config() call is a part of
>> vfio_load_bufs_thread().
>>
>> This is an artifact of splitting the whole load operation in
>> multiple commits.
> 
> I think adding empty dummy implementations is the typical way
> to do this - much like you asked today to leave
> vfio_multifd_transfer_setup() returning true unconditionally
> before being filled with true implementation in later patch.
> 
> See also my response at the end of this e-mail message, below
> the call to vfio_load_bufs_thread_load_config().
> 
>>> +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
>>> +{
>>> +    VFIOStateBuffer *lb;
>>> +    guint bufs_len;
>>
>> guint:  I guess it's ok to use here. It is not common practice in VFIO.
>>
>>> +
>>> +    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
>>> +    if (multifd->load_buf_idx >= bufs_len) {
>>> +        assert(multifd->load_buf_idx == bufs_len);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    lb = vfio_state_buffers_at(&multifd->load_bufs,
>>> +                               multifd->load_buf_idx);
>>
>> Could be one line. minor.
>>
>>> +    if (!lb->is_present) {
>>> +        return NULL;
>>> +    }
>>> +
>>> +    return lb;
>>> +}
>>> +
>>> +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
>>> +                                         VFIOStateBuffer *lb,
>>> +                                         Error **errp)
>>> +{
>>> +    VFIOMigration *migration = vbasedev->migration;
>>> +    VFIOMultifd *multifd = migration->multifd;
>>> +    g_autofree char *buf = NULL;
>>> +    char *buf_cur;
>>> +    size_t buf_len;
>>> +
>>> +    if (!lb->len) {
>>> +        return true;
>>> +    }
>>> +
>>> +    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
>>> +                                                   multifd->load_buf_idx);
>>
>> I think we can move this trace event to vfio_load_bufs_thread()
> 
> It would get messy since we don't load empty buffers,
> so we don't print this trace point (and its _end sibling)
> for empty buffers.
> 
> If we print this in vfio_load_bufs_thread() then it would
> need to duplicate that !lb->len check.
> 
>>> +    /* lb might become re-allocated when we drop the lock */
>>> +    buf = g_steal_pointer(&lb->data);
>>> +    buf_cur = buf;
>>> +    buf_len = lb->len;
>>> +    while (buf_len > 0) {
>>> +        ssize_t wr_ret;
>>> +        int errno_save;
>>> +
>>> +        /*
>>> +         * Loading data to the device takes a while,
>>> +         * drop the lock during this process.
>>> +         */
>>> +        qemu_mutex_unlock(&multifd->load_bufs_mutex);
>>> +        wr_ret = write(migration->data_fd, buf_cur, buf_len);
>>> +        errno_save = errno;
>>> +        qemu_mutex_lock(&multifd->load_bufs_mutex);
>>> +
>>> +        if (wr_ret < 0) {
>>> +            error_setg(errp,
>>> +                       "writing state buffer %" PRIu32 " failed: %d",
>>> +                       multifd->load_buf_idx, errno_save);
>>> +            return false;
>>> +        }
>>> +
>>> +        assert(wr_ret <= buf_len);
>>> +        buf_len -= wr_ret;
>>> +        buf_cur += wr_ret;
>>> +    }
>>> +
>>> +    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
>>> +                                                 multifd->load_buf_idx);
>>
>> and drop this trace event.
> 
> That's important data since it provides for how long it took to load that
> buffer (_end - _start).
> 
> It's not the same information as _start(next buffer) - _start(current buffer)
> since the next buffer might not have arrived yet so its loading won't
> start immediately after the end of loading of the previous one.
> 
>> In which case, we can modify the parameters of vfio_load_state_buffer_write()
>> to use directly a 'VFIOMultifd *multifd' and an fd instead of "migration->data_fd".
>>
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
>>> +                                            bool *should_quit)
>>> +{
>>> +    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
>>> +}
>>> +
>>> +/*
>>> + * This thread is spawned by vfio_multifd_switchover_start() which gets
>>> + * called upon encountering the switchover point marker in main migration
>>> + * stream.
>>> + *
>>> + * It exits after either:
>>> + * * completing loading the remaining device state and device config, OR:
>>> + * * encountering some error while doing the above, OR:
>>> + * * being forcefully aborted by the migration core by it setting should_quit
>>> + *   or by vfio_load_cleanup_load_bufs_thread() setting
>>> + *   multifd->load_bufs_thread_want_exit.
>>> + */
>>> +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +    VFIOMigration *migration = vbasedev->migration;
>>> +    VFIOMultifd *multifd = migration->multifd;
>>> +    bool ret = true;
>>> +    int config_ret;
>>
>> No needed IMO. see below.
>>
>>> +
>>> +    assert(multifd);
>>> +    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>>> +
>>> +    assert(multifd->load_bufs_thread_running);
>>
>> We could add a trace event for the start and the end of the thread.
> 
> Added vfio_load_bufs_thread_{start,end} trace events now.
> 
>>> +    while (true) {
>>> +        VFIOStateBuffer *lb;
>>> +
>>> +        /*
>>> +         * Always check cancellation first after the buffer_ready wait below in
>>> +         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
>>> +         */
>>> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
>>> +            error_setg(errp, "operation cancelled");
>>> +            ret = false;
>>> +            goto ret_signal;
>>
>> goto thread_exit ?
> 
> I'm not sure that I fully understand this comment.
> Do you mean to rename ret_signal label to thread_exit?


Yes. I find label 'thread_exit' more meaningful. This is minor since
there is only one 'exit' label.

> 
>>> +        }
>>> +
>>> +        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
>>> +
>>> +        lb = vfio_load_state_buffer_get(multifd);
>>> +        if (!lb) {
>>> +            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
>>> +                                                        multifd->load_buf_idx);
>>> +            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
>>> +                           &multifd->load_bufs_mutex);
>>> +            continue;
>>> +        }
>>> +
>>> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
>>> +            break;
>>> +        }
>>> +
>>> +        if (multifd->load_buf_idx == 0) {
>>> +            trace_vfio_load_state_device_buffer_start(vbasedev->name);
>>> +        }
>>> +
>>> +        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
>>> +            ret = false;
>>> +            goto ret_signal;
>>> +        }
>>> +
>>> +        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
>>> +            trace_vfio_load_state_device_buffer_end(vbasedev->name);
>>> +        }
>>> +
>>> +        multifd->load_buf_idx++;
>>> +    }
>>
>> if ret is assigned to true here, the "ret = false" can be dropped
> 
> I inverted the "ret" logic here now - initialized ret to false
> at definition, removed "ret = false" at every failure/early exit block
> and added "ret = true" just before the "ret_signal" label.
> 
>>> +    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
>>> +    if (config_ret) {
>>> +        error_setg(errp, "load config state failed: %d", config_ret);
>>> +        ret = false;
>>> +    }
>>
>> please move to next patch. This is adding nothing to this patch
>> since it's returning -EINVAL.
>>
> 
> That's the whole point - if someone were to accidentally enable this
> (for example by forgetting to apply the next patch when backporting
> the series) it would fail safely with EINVAL instead of having a
> half-broken implementation.

OK. Let's keep it that way.


Thanks,

C.


> 
> Another option would be to simply integrate the next patch into this
> one as these are two parts of the same single operation and I think
> splitting them in two in the end brings little value.
> 
>> Thanks,
>>
>> C.
> 
> Thanks,
> Maciej
> 


Re: [PATCH v5 27/36] vfio/migration: Multifd device state transfer support - load thread
Posted by Maciej S. Szmigiero 1 month ago
On 28.02.2025 10:11, Cédric Le Goater wrote:
> On 2/26/25 22:05, Maciej S. Szmigiero wrote:
>> On 26.02.2025 14:49, Cédric Le Goater wrote:
>>> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>
>>>> Since it's important to finish loading device state transferred via the
>>>> main migration channel (via save_live_iterate SaveVMHandler) before
>>>> starting loading the data asynchronously transferred via multifd the thread
>>>> doing the actual loading of the multifd transferred data is only started
>>>> from switchover_start SaveVMHandler.
>>>>
>>>> switchover_start handler is called when MIG_CMD_SWITCHOVER_START
>>>> sub-command of QEMU_VM_COMMAND is received via the main migration channel.
>>>>
>>>> This sub-command is only sent after all save_live_iterate data have already
>>>> been posted so it is safe to commence loading of the multifd-transferred
>>>> device state upon receiving it - loading of save_live_iterate data happens
>>>> synchronously in the main migration thread (much like the processing of
>>>> MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is
>>>> processed all the preceding data must have already been loaded.
>>>>
>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>> ---
>>>>   hw/vfio/migration-multifd.c | 225 ++++++++++++++++++++++++++++++++++++
>>>>   hw/vfio/migration-multifd.h |   2 +
>>>>   hw/vfio/migration.c         |  12 ++
>>>>   hw/vfio/trace-events        |   5 +
>>>>   4 files changed, 244 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>>>> index 5d5ee1393674..b3a88c062769 100644
>>>> --- a/hw/vfio/migration-multifd.c
>>>> +++ b/hw/vfio/migration-multifd.c
(..)
>>>> +    while (true) {
>>>> +        VFIOStateBuffer *lb;
>>>> +
>>>> +        /*
>>>> +         * Always check cancellation first after the buffer_ready wait below in
>>>> +         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
>>>> +         */
>>>> +        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
>>>> +            error_setg(errp, "operation cancelled");
>>>> +            ret = false;
>>>> +            goto ret_signal;
>>>
>>> goto thread_exit ?
>>
>> I'm not sure that I fully understand this comment.
>> Do you mean to rename ret_signal label to thread_exit?
> 
> 
> Yes. I find label 'thread_exit' more meaningful. This is minor since
> there is only one 'exit' label.
> 

Renamed ret_signal to thread_exit then.

(..)
>>>> +    config_ret = vfio_load_bufs_thread_load_config(vbasedev);
>>>> +    if (config_ret) {
>>>> +        error_setg(errp, "load config state failed: %d", config_ret);
>>>> +        ret = false;
>>>> +    }
>>>
>>> please move to next patch. This is adding nothing to this patch
>>> since it's returning -EINVAL.
>>>
>>
>> That's the whole point - if someone were to accidentally enable this
>> (for example by forgetting to apply the next patch when backporting
>> the series) it would fail safely with EINVAL instead of having a
>> half-broken implementation.
> 
> OK. Let's keep it that way.
> 
> 
> Thanks,
> 
> C.

Thanks,
Maciej