[v2] migration: File based migration with multifd and fixed-ram

[PATCH v2 21/29] migration/multifd: Add pages to the receiving side

Posted by Fabiano Rosas 2 years, 3 months ago

Currently multifd does not need to have knowledge of pages on the
receiving side because all the information needed is within the
packets that come in the stream.

We're about to add support to fixed-ram migration, which cannot use
packets because it expects the ramblock section in the migration file
to contain only the guest pages data.

Add a pointer to MultiFDPages in the multifd_recv_state and use the
pages similarly to what we already do on the sending side. The pages
are used to transfer data between the ram migration code in the main
migration thread and the multifd receiving threads.

Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
 migration/multifd.c | 107 ++++++++++++++++++++++++++++++++++++++++++++
 migration/multifd.h |  13 +++++-
 2 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index ad51210f13..20e8635740 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -992,6 +992,8 @@ int multifd_save_setup(Error **errp)
 
 struct {
     MultiFDRecvParams *params;
+    /* array of pages to receive */
+    MultiFDPages_t *pages;
     /* number of created threads */
     int count;
     /* syncs main thread and channels */
@@ -1002,6 +1004,75 @@ struct {
     MultiFDMethods *ops;
 } *multifd_recv_state;
 
+static int multifd_recv_pages(QEMUFile *f)
+{
+    int i;
+    static int next_recv_channel;
+    MultiFDRecvParams *p = NULL;
+    MultiFDPages_t *pages = multifd_recv_state->pages;
+
+    /*
+     * next_channel can remain from a previous migration that was
+     * using more channels, so ensure it doesn't overflow if the
+     * limit is lower now.
+     */
+    next_recv_channel %= migrate_multifd_channels();
+    for (i = next_recv_channel;; i = (i + 1) % migrate_multifd_channels()) {
+        p = &multifd_recv_state->params[i];
+
+        qemu_mutex_lock(&p->mutex);
+        if (p->quit) {
+            error_report("%s: channel %d has already quit!", __func__, i);
+            qemu_mutex_unlock(&p->mutex);
+            return -1;
+        }
+        if (!p->pending_job) {
+            p->pending_job++;
+            next_recv_channel = (i + 1) % migrate_multifd_channels();
+            break;
+        }
+        qemu_mutex_unlock(&p->mutex);
+    }
+
+    multifd_recv_state->pages = p->pages;
+    p->pages = pages;
+    qemu_mutex_unlock(&p->mutex);
+    qemu_sem_post(&p->sem);
+
+    return 1;
+}
+
+int multifd_recv_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
+{
+    MultiFDPages_t *pages = multifd_recv_state->pages;
+    bool changed = false;
+
+    if (!pages->block) {
+        pages->block = block;
+    }
+
+    if (pages->block == block) {
+        pages->offset[pages->num] = offset;
+        pages->num++;
+
+        if (pages->num < pages->allocated) {
+            return 1;
+        }
+    } else {
+        changed = true;
+    }
+
+    if (multifd_recv_pages(f) < 0) {
+        return -1;
+    }
+
+    if (changed) {
+        return multifd_recv_queue_page(f, block, offset);
+    }
+
+    return 1;
+}
+
 static void multifd_recv_terminate_threads(Error *err)
 {
     int i;
@@ -1023,6 +1094,7 @@ static void multifd_recv_terminate_threads(Error *err)
 
         qemu_mutex_lock(&p->mutex);
         p->quit = true;
+        qemu_sem_post(&p->sem);
         /*
          * We could arrive here for two reasons:
          *  - normal quit, i.e. everything went fine, just finished
@@ -1072,8 +1144,11 @@ void multifd_load_cleanup(void)
         p->c = NULL;
         qemu_mutex_destroy(&p->mutex);
         qemu_sem_destroy(&p->sem_sync);
+        qemu_sem_destroy(&p->sem);
         g_free(p->name);
         p->name = NULL;
+        multifd_pages_clear(p->pages);
+        p->pages = NULL;
         p->packet_len = 0;
         g_free(p->packet);
         p->packet = NULL;
@@ -1086,6 +1161,8 @@ void multifd_load_cleanup(void)
     qemu_sem_destroy(&multifd_recv_state->sem_sync);
     g_free(multifd_recv_state->params);
     multifd_recv_state->params = NULL;
+    multifd_pages_clear(multifd_recv_state->pages);
+    multifd_recv_state->pages = NULL;
     g_free(multifd_recv_state);
     multifd_recv_state = NULL;
 }
@@ -1148,6 +1225,25 @@ static void *multifd_recv_thread(void *opaque)
                 break;
             }
             p->num_packets++;
+        } else {
+            /*
+             * No packets, so we need to wait for the vmstate code to
+             * queue pages.
+             */
+            qemu_sem_wait(&p->sem);
+            qemu_mutex_lock(&p->mutex);
+            if (!p->pending_job) {
+                qemu_mutex_unlock(&p->mutex);
+                break;
+            }
+
+            for (int i = 0; i < p->pages->num; i++) {
+                p->normal[p->normal_num] = p->pages->offset[i];
+                p->normal_num++;
+            }
+
+            p->pages->num = 0;
+            p->host = p->pages->block->host;
         }
 
         flags = p->flags;
@@ -1170,6 +1266,13 @@ static void *multifd_recv_thread(void *opaque)
             qemu_sem_post(&multifd_recv_state->sem_sync);
             qemu_sem_wait(&p->sem_sync);
         }
+
+        if (!use_packets) {
+            qemu_mutex_lock(&p->mutex);
+            p->pending_job--;
+            p->pages->block = NULL;
+            qemu_mutex_unlock(&p->mutex);
+        }
     }
 
     if (local_err) {
@@ -1204,6 +1307,7 @@ int multifd_load_setup(Error **errp)
     thread_count = migrate_multifd_channels();
     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
+    multifd_recv_state->pages = multifd_pages_init(page_count);
     qatomic_set(&multifd_recv_state->count, 0);
     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
     multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()];
@@ -1213,8 +1317,11 @@ int multifd_load_setup(Error **errp)
 
         qemu_mutex_init(&p->mutex);
         qemu_sem_init(&p->sem_sync, 0);
+        qemu_sem_init(&p->sem, 0);
         p->quit = false;
+        p->pending_job = 0;
         p->id = i;
+        p->pages = multifd_pages_init(page_count);
 
         if (use_packets) {
             p->packet_len = sizeof(MultiFDPacket_t)
diff --git a/migration/multifd.h b/migration/multifd.h
index a112ec7ac6..b571b1e4a2 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -24,6 +24,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
 void multifd_recv_sync_main(void);
 int multifd_send_sync_main(QEMUFile *f);
 int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);
+int multifd_recv_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);
 
 /* Multifd Compression flags */
 #define MULTIFD_FLAG_SYNC (1 << 0)
@@ -153,9 +154,13 @@ typedef struct {
     uint32_t page_size;
     /* number of pages in a full packet */
     uint32_t page_count;
+    /* multifd flags for receiving ram */
+    int read_flags;
 
     /* syncs main thread and channels */
     QemuSemaphore sem_sync;
+    /* sem where to wait for more work */
+    QemuSemaphore sem;
 
     /* this mutex protects the following parameters */
     QemuMutex mutex;
@@ -167,6 +172,13 @@ typedef struct {
     uint32_t flags;
     /* global number of generated multifd packets */
     uint64_t packet_num;
+    int pending_job;
+    /* array of pages to sent.
+     * The owner of 'pages' depends of 'pending_job' value:
+     * pending_job == 0 -> migration_thread can use it.
+     * pending_job != 0 -> multifd_channel can use it.
+     */
+    MultiFDPages_t *pages;
 
     /* thread local variables. No locking required */
 
@@ -210,4 +222,3 @@ typedef struct {
 void multifd_register_ops(int method, MultiFDMethods *ops);
 
 #endif
-
-- 
2.35.3

Re: [PATCH v2 21/29] migration/multifd: Add pages to the receiving side

Posted by Peter Xu 2 years, 3 months ago

On Mon, Oct 23, 2023 at 05:36:00PM -0300, Fabiano Rosas wrote:
> Currently multifd does not need to have knowledge of pages on the
> receiving side because all the information needed is within the
> packets that come in the stream.
> 
> We're about to add support to fixed-ram migration, which cannot use
> packets because it expects the ramblock section in the migration file
> to contain only the guest pages data.
> 
> Add a pointer to MultiFDPages in the multifd_recv_state and use the
> pages similarly to what we already do on the sending side. The pages
> are used to transfer data between the ram migration code in the main
> migration thread and the multifd receiving threads.
> 
> Signed-off-by: Fabiano Rosas <farosas@suse.de>

If it'll be new code to maintain anyway, I think we don't necessarily
always use multifd structs, right?

Rather than introducing MultiFDPages_t into recv side, can we allow pages
to be distributed in chunks of (ramblock, start_offset, end_offset) tuples?
That'll be much more efficient than per-page.  We don't need page granule
here on recv side, we want to load chunks of mem fast.

We don't even need page granule on sender side, but since only myself cared
about perf.. and obviously the plan is to even drop auto-pause, then VM can
be running there, so sender must do that per-page for now.  But now on recv
side VM must be stopped before all ram loaded, so there's no such problem.
And since we'll introduce new code anyway, IMHO we can decide how to do
that even if we want to reuse multifd.

Main thread can assign these (ramblock, start_offset, end_offset) jobs to
recv threads.  If ramblock is too small (e.g. 1M), assign it anyway to one
thread.  If ramblock is >512MB, cut it into slices and feed them to multifd
threads one by one.  All the rest can be the same.

Would that be better?  I would expect measurable loading speed difference
with much larger chunks and with that range-based tuples.

Thanks,

-- 
Peter Xu

Re: [PATCH v2 21/29] migration/multifd: Add pages to the receiving side

Posted by Fabiano Rosas 2 years, 3 months ago

Peter Xu <peterx@redhat.com> writes:

> On Mon, Oct 23, 2023 at 05:36:00PM -0300, Fabiano Rosas wrote:
>> Currently multifd does not need to have knowledge of pages on the
>> receiving side because all the information needed is within the
>> packets that come in the stream.
>> 
>> We're about to add support to fixed-ram migration, which cannot use
>> packets because it expects the ramblock section in the migration file
>> to contain only the guest pages data.
>> 
>> Add a pointer to MultiFDPages in the multifd_recv_state and use the
>> pages similarly to what we already do on the sending side. The pages
>> are used to transfer data between the ram migration code in the main
>> migration thread and the multifd receiving threads.
>> 
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>
> If it'll be new code to maintain anyway, I think we don't necessarily
> always use multifd structs, right?
>

For the sending side, unrelated to this series, I'm experimenting with
defining a generic structure to be passed into multifd:

struct MultiFDData_t {
    void *opaque;
    size_t size;
    bool ready;
    void (*cleanup_fn)(void *);
};

The client code (ram.c) would use the opaque field to put whatever it
wants in it. Maybe we could have a similar concept on the receiving
side?

Here's a PoC I'm writing, if you're interested:

https://github.com/farosas/qemu/commits/multifd-packet-cleanups

(I'm delaying sending this to the list because we already have a
reasonable backlog of features and refactorings to merge.)

> Rather than introducing MultiFDPages_t into recv side, can we allow pages
> to be distributed in chunks of (ramblock, start_offset, end_offset) tuples?
> That'll be much more efficient than per-page.  We don't need page granule
> here on recv side, we want to load chunks of mem fast.
>
> We don't even need page granule on sender side, but since only myself cared
> about perf.. and obviously the plan is to even drop auto-pause, then VM can
> be running there, so sender must do that per-page for now.  But now on recv
> side VM must be stopped before all ram loaded, so there's no such problem.
> And since we'll introduce new code anyway, IMHO we can decide how to do
> that even if we want to reuse multifd.
>
> Main thread can assign these (ramblock, start_offset, end_offset) jobs to
> recv threads.  If ramblock is too small (e.g. 1M), assign it anyway to one
> thread.  If ramblock is >512MB, cut it into slices and feed them to multifd
> threads one by one.  All the rest can be the same.
>
> Would that be better?  I would expect measurable loading speed difference
> with much larger chunks and with that range-based tuples.

I need to check how that would interact with the existing recv_thread
code. Hopefully there's nothing there preventing us from using a
different data structure.