[v2] vfio-user server in QEMU

[PATCH RFC server v2 10/11] vfio-user: register handlers to facilitate migration

Posted by Jagannathan Raman 4 years, 5 months ago

Store and load the device's state during migration. use libvfio-user's
handlers for this purpose

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 migration/savevm.h        |   2 +
 hw/remote/vfio-user-obj.c | 313 ++++++++++++++++++++++++++++++++++++++++++++++
 migration/savevm.c        |  73 +++++++++++
 3 files changed, 388 insertions(+)

diff --git a/migration/savevm.h b/migration/savevm.h
index 6461342..8007064 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -67,5 +67,7 @@ int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
         bool in_postcopy, bool inactivate_disks);
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev);
+int qemu_remote_loadvm(QEMUFile *f);
 
 #endif
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 92605ed..16cf515 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -45,6 +45,10 @@
 #include "hw/boards.h"
 #include "hw/remote/iohub.h"
 #include "hw/remote/machine.h"
+#include "migration/qemu-file.h"
+#include "migration/savevm.h"
+#include "migration/global_state.h"
+#include "block/block.h"
 
 #define TYPE_VFU_OBJECT "vfio-user"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -72,6 +76,33 @@ struct VfuObject {
     PCIDevice *pci_dev;
 
     int vfu_poll_fd;
+
+    /*
+     * vfu_mig_buf holds the migration data. In the remote server, this
+     * buffer replaces the role of an IO channel which links the source
+     * and the destination.
+     *
+     * Whenever the client QEMU process initiates migration, the remote
+     * server gets notified via libvfio-user callbacks. The remote server
+     * sets up a QEMUFile object using this buffer as backend. The remote
+     * server passes this object to its migration subsystem, which slurps
+     * the VMSD of the device ('devid' above) referenced by this object
+     * and stores the VMSD in this buffer.
+     *
+     * The client subsequetly asks the remote server for any data that
+     * needs to be moved over to the destination via libvfio-user
+     * library's vfu_migration_callbacks_t callbacks. The remote hands
+     * over this buffer as data at this time.
+     *
+     * A reverse of this process happens at the destination.
+     */
+    uint8_t *vfu_mig_buf;
+
+    uint64_t vfu_mig_buf_size;
+
+    uint64_t vfu_mig_buf_pending;
+
+    QEMUFile *vfu_mig_file;
 };
 
 static void vfu_object_set_socket(Object *obj, const char *str, Error **errp)
@@ -96,6 +127,250 @@ static void vfu_object_set_devid(Object *obj, const char *str, Error **errp)
     trace_vfu_prop("devid", str);
 }
 
+/**
+ * Migration helper functions
+ *
+ * vfu_mig_buf_read & vfu_mig_buf_write are used by QEMU's migration
+ * subsystem - qemu_remote_loadvm & qemu_remote_savevm. loadvm/savevm
+ * call these functions via QEMUFileOps to load/save the VMSD of a
+ * device into vfu_mig_buf
+ *
+ */
+static ssize_t vfu_mig_buf_read(void *opaque, uint8_t *buf, int64_t pos,
+                                size_t size, Error **errp)
+{
+    VfuObject *o = opaque;
+
+    if (pos > o->vfu_mig_buf_size) {
+        size = 0;
+    } else if ((pos + size) > o->vfu_mig_buf_size) {
+        size = o->vfu_mig_buf_size;
+    }
+
+    memcpy(buf, (o->vfu_mig_buf + pos), size);
+
+    o->vfu_mig_buf_size -= size;
+
+    return size;
+}
+
+static ssize_t vfu_mig_buf_write(void *opaque, struct iovec *iov, int iovcnt,
+                                 int64_t pos, Error **errp)
+{
+    VfuObject *o = opaque;
+    uint64_t end = pos + iov_size(iov, iovcnt);
+    int i;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+    }
+
+    for (i = 0; i < iovcnt; i++) {
+        memcpy((o->vfu_mig_buf + o->vfu_mig_buf_size), iov[i].iov_base,
+               iov[i].iov_len);
+        o->vfu_mig_buf_size += iov[i].iov_len;
+        o->vfu_mig_buf_pending += iov[i].iov_len;
+    }
+
+    return iov_size(iov, iovcnt);
+}
+
+static int vfu_mig_buf_shutdown(void *opaque, bool rd, bool wr, Error **errp)
+{
+    VfuObject *o = opaque;
+
+    o->vfu_mig_buf_size = 0;
+
+    g_free(o->vfu_mig_buf);
+
+    return 0;
+}
+
+static const QEMUFileOps vfu_mig_fops_save = {
+    .writev_buffer  = vfu_mig_buf_write,
+    .shut_down      = vfu_mig_buf_shutdown,
+};
+
+static const QEMUFileOps vfu_mig_fops_load = {
+    .get_buffer     = vfu_mig_buf_read,
+    .shut_down      = vfu_mig_buf_shutdown,
+};
+
+/**
+ * handlers for vfu_migration_callbacks_t
+ *
+ * The libvfio-user library accesses these handlers to drive the migration
+ * at the remote end, and also to transport the data stored in vfu_mig_buf
+ *
+ */
+static void vfu_mig_state_precopy(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    int ret;
+
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_save, false);
+    }
+
+    ret = qemu_remote_savevm(o->vfu_mig_file, DEVICE(o->pci_dev));
+    if (ret) {
+        qemu_file_shutdown(o->vfu_mig_file);
+        return;
+    }
+
+    qemu_fflush(o->vfu_mig_file);
+}
+
+static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+    Error *local_err = NULL;
+    int ret;
+
+    ret = qemu_remote_loadvm(o->vfu_mig_file);
+    if (ret) {
+        error_setg(&error_abort, "vfu: failed to restore device state");
+        return;
+    }
+
+    if (++migrated_devs == k->nr_devs) {
+        bdrv_invalidate_cache_all(&local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return;
+        }
+
+        vm_start();
+    }
+}
+
+static void vfu_mig_state_stop(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+
+    /**
+     * note: calling bdrv_inactivate_all() is not the best approach.
+     *
+     *  Ideally, we would identify the block devices (if any) indirectly
+     *  linked (such as via a scs-hd device) to each of the migrated devices,
+     *  and inactivate them individually. This is essential while operating
+     *  the server in a storage daemon mode, with devices from different VMs.
+     *
+     *  However, we currently don't have this capability. As such, we need to
+     *  inactivate all devices at the same time when migration is completed.
+     */
+    if (++migrated_devs == k->nr_devs) {
+        bdrv_inactivate_all();
+    }
+}
+
+static int vfu_mig_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
+{
+    switch (state) {
+    case VFU_MIGR_STATE_RESUME:
+    case VFU_MIGR_STATE_STOP_AND_COPY:
+        break;
+    case VFU_MIGR_STATE_STOP:
+        vfu_mig_state_stop(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_PRE_COPY:
+        vfu_mig_state_precopy(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_RUNNING:
+        if (!runstate_is_running()) {
+            vfu_mig_state_running(vfu_ctx);
+        }
+        break;
+    default:
+        warn_report("vfu: Unknown migration state %d", state);
+    }
+
+    return 0;
+}
+
+static uint64_t vfu_mig_get_pending_bytes(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    return o->vfu_mig_buf_pending;
+}
+
+static int vfu_mig_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset,
+                                uint64_t *size)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (offset) {
+        *offset = 0;
+    }
+
+    if (size) {
+        *size = o->vfu_mig_buf_size;
+    }
+
+    return 0;
+}
+
+static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
+                                 uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (offset > o->vfu_mig_buf_size) {
+        return -1;
+    }
+
+    if ((offset + size) > o->vfu_mig_buf_size) {
+        warn_report("vfu: buffer overflow - check pending_bytes");
+        size = o->vfu_mig_buf_size - offset;
+    }
+
+    memcpy(buf, (o->vfu_mig_buf + offset), size);
+
+    o->vfu_mig_buf_pending -= size;
+
+    return size;
+}
+
+static ssize_t vfu_mig_write_data(vfu_ctx_t *vfu_ctx, void *data,
+                                  uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint64_t end = offset + size;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+        o->vfu_mig_buf_size = end;
+    }
+
+    memcpy((o->vfu_mig_buf + offset), data, size);
+
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
+    }
+
+    return size;
+}
+
+static int vfu_mig_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
+{
+    return 0;
+}
+
+static const vfu_migration_callbacks_t vfu_mig_cbs = {
+    .version = VFU_MIGR_CALLBACKS_VERS,
+    .transition = &vfu_mig_transition,
+    .get_pending_bytes = &vfu_mig_get_pending_bytes,
+    .prepare_data = &vfu_mig_prepare_data,
+    .read_data = &vfu_mig_read_data,
+    .data_written = &vfu_mig_data_written,
+    .write_data = &vfu_mig_write_data,
+};
+
 static void vfu_object_ctx_run(void *opaque)
 {
     VfuObject *o = opaque;
@@ -340,6 +615,7 @@ static void vfu_object_machine_done(Notifier *notifier, void *data)
 {
     VfuObject *o = container_of(notifier, VfuObject, machine_done);
     DeviceState *dev = NULL;
+    size_t migr_area_size;
     QemuThread thread;
     int ret;
 
@@ -401,6 +677,35 @@ static void vfu_object_machine_done(Notifier *notifier, void *data)
         return;
     }
 
+    /*
+     * TODO: The 0x20000 number used below is a temporary. We are working on
+     *     a cleaner fix for this.
+     *
+     *     The libvfio-user library assumes that the remote knows the size of
+     *     the data to be migrated at boot time, but that is not the case with
+     *     VMSDs, as it can contain a variable-size buffer. 0x20000 is used
+     *     as a sufficiently large buffer to demonstrate migration, but that
+     *     cannot be used as a solution.
+     *
+     */
+    ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
+                           0x20000, NULL,
+                           VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
+    if (ret < 0) {
+        error_setg(&error_abort, "vfu: Failed to register migration BAR %s- %s",
+                   o->devid, strerror(errno));
+        return;
+    }
+
+    migr_area_size = vfu_get_migr_register_area_size();
+    ret = vfu_setup_device_migration_callbacks(o->vfu_ctx, &vfu_mig_cbs,
+                                               migr_area_size);
+    if (ret < 0) {
+        error_setg(&error_abort, "vfu: Failed to setup migration %s- %s",
+                   o->devid, strerror(errno));
+        return;
+    }
+
     ret = vfu_realize_ctx(o->vfu_ctx);
     if (ret < 0) {
         error_setg(&error_abort, "vfu: Failed to realize device %s- %s",
@@ -435,6 +740,14 @@ static void vfu_object_init(Object *obj)
     qemu_add_machine_init_done_notifier(&o->machine_done);
 
     o->vfu_poll_fd = -1;
+
+    o->vfu_mig_file = NULL;
+
+    o->vfu_mig_buf = NULL;
+
+    o->vfu_mig_buf_size = 0;
+
+    o->vfu_mig_buf_pending = 0;
 }
 
 static void vfu_object_finalize(Object *obj)
diff --git a/migration/savevm.c b/migration/savevm.c
index 7b7b64b..341fde7 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1604,6 +1604,49 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
     return ret;
 }
 
+static SaveStateEntry *find_se_from_dev(DeviceState *dev)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (se->opaque == dev) {
+            return se;
+        }
+    }
+
+    return NULL;
+}
+
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev)
+{
+    SaveStateEntry *se;
+    int ret = 0;
+
+    se = find_se_from_dev(dev);
+    if (!se) {
+        return -ENODEV;
+    }
+
+    if (!se->vmsd || !vmstate_save_needed(se->vmsd, se->opaque)) {
+        return ret;
+    }
+
+    save_section_header(f, se, QEMU_VM_SECTION_FULL);
+
+    ret = vmstate_save(f, se, NULL);
+    if (ret) {
+        qemu_file_set_error(f, ret);
+        return ret;
+    }
+
+    save_section_footer(f, se);
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+
+    return 0;
+}
+
 void qemu_savevm_live_state(QEMUFile *f)
 {
     /* save QEMU_VM_SECTION_END section */
@@ -2444,6 +2487,36 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
     return 0;
 }
 
+int qemu_remote_loadvm(QEMUFile *f)
+{
+    uint8_t section_type;
+    int ret = 0;
+
+    while (true) {
+        section_type = qemu_get_byte(f);
+
+        ret = qemu_file_get_error(f);
+        if (ret) {
+            break;
+        }
+
+        switch (section_type) {
+        case QEMU_VM_SECTION_FULL:
+            ret = qemu_loadvm_section_start_full(f, NULL);
+            if (ret < 0) {
+                break;
+            }
+            break;
+        case QEMU_VM_EOF:
+            return ret;
+        default:
+            return -EINVAL;
+        }
+    }
+
+    return ret;
+}
+
 static int
 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
 {
-- 
1.8.3.1

Re: [PATCH RFC server v2 10/11] vfio-user: register handlers to facilitate migration

Posted by Stefan Hajnoczi 4 years, 5 months ago

On Fri, Aug 27, 2021 at 01:53:29PM -0400, Jagannathan Raman wrote:
> +static ssize_t vfu_mig_buf_read(void *opaque, uint8_t *buf, int64_t pos,
> +                                size_t size, Error **errp)
> +{
> +    VfuObject *o = opaque;
> +
> +    if (pos > o->vfu_mig_buf_size) {
> +        size = 0;
> +    } else if ((pos + size) > o->vfu_mig_buf_size) {
> +        size = o->vfu_mig_buf_size;
> +    }
> +
> +    memcpy(buf, (o->vfu_mig_buf + pos), size);
> +
> +    o->vfu_mig_buf_size -= size;

This looks strange. pos increases each time we're called. We seem to be
truncating the buffer on each read. Should this line be dropped? Did you
test live migration (maybe this code needs more debugging)?

> +
> +    return size;
> +}
> +
> +static ssize_t vfu_mig_buf_write(void *opaque, struct iovec *iov, int iovcnt,
> +                                 int64_t pos, Error **errp)
> +{
> +    VfuObject *o = opaque;
> +    uint64_t end = pos + iov_size(iov, iovcnt);
> +    int i;
> +
> +    if (end > o->vfu_mig_buf_size) {
> +        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
> +    }
> +
> +    for (i = 0; i < iovcnt; i++) {
> +        memcpy((o->vfu_mig_buf + o->vfu_mig_buf_size), iov[i].iov_base,
> +               iov[i].iov_len);
> +        o->vfu_mig_buf_size += iov[i].iov_len;
> +        o->vfu_mig_buf_pending += iov[i].iov_len;
> +    }
> +
> +    return iov_size(iov, iovcnt);
> +}
> +
> +static int vfu_mig_buf_shutdown(void *opaque, bool rd, bool wr, Error **errp)
> +{
> +    VfuObject *o = opaque;
> +
> +    o->vfu_mig_buf_size = 0;
> +
> +    g_free(o->vfu_mig_buf);
> +
> +    return 0;
> +}
> +
> +static const QEMUFileOps vfu_mig_fops_save = {
> +    .writev_buffer  = vfu_mig_buf_write,
> +    .shut_down      = vfu_mig_buf_shutdown,
> +};
> +
> +static const QEMUFileOps vfu_mig_fops_load = {
> +    .get_buffer     = vfu_mig_buf_read,
> +    .shut_down      = vfu_mig_buf_shutdown,
> +};
> +
> +/**
> + * handlers for vfu_migration_callbacks_t
> + *
> + * The libvfio-user library accesses these handlers to drive the migration
> + * at the remote end, and also to transport the data stored in vfu_mig_buf
> + *
> + */
> +static void vfu_mig_state_precopy(vfu_ctx_t *vfu_ctx)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +    int ret;
> +
> +    if (!o->vfu_mig_file) {
> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_save, false);
> +    }
> +
> +    ret = qemu_remote_savevm(o->vfu_mig_file, DEVICE(o->pci_dev));
> +    if (ret) {
> +        qemu_file_shutdown(o->vfu_mig_file);
> +        return;
> +    }
> +
> +    qemu_fflush(o->vfu_mig_file);
> +}

Are you sure pre-copy is the state where you want to serialize the
savevm data? IIUC pre-copy is the iterative state while the device is
still running (e.g. when copying RAM but before devices are stopped). I
expected savevm to happen when we reach stop-and-copy.

The reason why this matters is that we're saving the state of the device
while the guest is still running and possibly interacting with the
device. The destination won't have the final state of the device, it
will have an earlier state of the device when we started migrating RAM!

Maybe I'm wrong, please double-check, but this looks like a bug.

> +
> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
> +    static int migrated_devs;
> +    Error *local_err = NULL;
> +    int ret;
> +
> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
> +    if (ret) {
> +        error_setg(&error_abort, "vfu: failed to restore device state");
> +        return;
> +    }
> +
> +    if (++migrated_devs == k->nr_devs) {
> +        bdrv_invalidate_cache_all(&local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return;
> +        }
> +
> +        vm_start();
> +    }
> +}

This looks like it's intended for the destination side. Does this code
work on the source side if the device is transitioned back into the
running state?

> +
> +static void vfu_mig_state_stop(vfu_ctx_t *vfu_ctx)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
> +    static int migrated_devs;
> +
> +    /**
> +     * note: calling bdrv_inactivate_all() is not the best approach.
> +     *
> +     *  Ideally, we would identify the block devices (if any) indirectly
> +     *  linked (such as via a scs-hd device) to each of the migrated devices,
> +     *  and inactivate them individually. This is essential while operating
> +     *  the server in a storage daemon mode, with devices from different VMs.
> +     *
> +     *  However, we currently don't have this capability. As such, we need to
> +     *  inactivate all devices at the same time when migration is completed.
> +     */
> +    if (++migrated_devs == k->nr_devs) {
> +        bdrv_inactivate_all();
> +    }
> +}
> +
> +static int vfu_mig_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
> +{
> +    switch (state) {
> +    case VFU_MIGR_STATE_RESUME:
> +    case VFU_MIGR_STATE_STOP_AND_COPY:
> +        break;
> +    case VFU_MIGR_STATE_STOP:
> +        vfu_mig_state_stop(vfu_ctx);
> +        break;
> +    case VFU_MIGR_STATE_PRE_COPY:
> +        vfu_mig_state_precopy(vfu_ctx);
> +        break;
> +    case VFU_MIGR_STATE_RUNNING:
> +        if (!runstate_is_running()) {
> +            vfu_mig_state_running(vfu_ctx);
> +        }
> +        break;
> +    default:
> +        warn_report("vfu: Unknown migration state %d", state);
> +    }
> +
> +    return 0;
> +}
> +
> +static uint64_t vfu_mig_get_pending_bytes(vfu_ctx_t *vfu_ctx)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +
> +    return o->vfu_mig_buf_pending;
> +}
> +
> +static int vfu_mig_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset,
> +                                uint64_t *size)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +
> +    if (offset) {
> +        *offset = 0;
> +    }
> +
> +    if (size) {
> +        *size = o->vfu_mig_buf_size;
> +    }
> +
> +    return 0;
> +}
> +
> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
> +                                 uint64_t size, uint64_t offset)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +
> +    if (offset > o->vfu_mig_buf_size) {
> +        return -1;
> +    }
> +
> +    if ((offset + size) > o->vfu_mig_buf_size) {
> +        warn_report("vfu: buffer overflow - check pending_bytes");
> +        size = o->vfu_mig_buf_size - offset;
> +    }
> +
> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
> +
> +    o->vfu_mig_buf_pending -= size;
> +
> +    return size;
> +}
> +
> +static ssize_t vfu_mig_write_data(vfu_ctx_t *vfu_ctx, void *data,
> +                                  uint64_t size, uint64_t offset)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +    uint64_t end = offset + size;
> +
> +    if (end > o->vfu_mig_buf_size) {
> +        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
> +        o->vfu_mig_buf_size = end;
> +    }
> +
> +    memcpy((o->vfu_mig_buf + offset), data, size);
> +
> +    if (!o->vfu_mig_file) {
> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
> +    }

Why open the file here where it's not accessed? I expected this to
happen at the point where data has been fully written and we call
qemu_remote_loadvm().

> +
> +    return size;
> +}
> +
> +static int vfu_mig_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
> +{
> +    return 0;
> +}
> +
> +static const vfu_migration_callbacks_t vfu_mig_cbs = {
> +    .version = VFU_MIGR_CALLBACKS_VERS,
> +    .transition = &vfu_mig_transition,
> +    .get_pending_bytes = &vfu_mig_get_pending_bytes,
> +    .prepare_data = &vfu_mig_prepare_data,
> +    .read_data = &vfu_mig_read_data,
> +    .data_written = &vfu_mig_data_written,
> +    .write_data = &vfu_mig_write_data,
> +};
> +
>  static void vfu_object_ctx_run(void *opaque)
>  {
>      VfuObject *o = opaque;
> @@ -340,6 +615,7 @@ static void vfu_object_machine_done(Notifier *notifier, void *data)
>  {
>      VfuObject *o = container_of(notifier, VfuObject, machine_done);
>      DeviceState *dev = NULL;
> +    size_t migr_area_size;
>      QemuThread thread;
>      int ret;
>  
> @@ -401,6 +677,35 @@ static void vfu_object_machine_done(Notifier *notifier, void *data)
>          return;
>      }
>  
> +    /*
> +     * TODO: The 0x20000 number used below is a temporary. We are working on
> +     *     a cleaner fix for this.
> +     *
> +     *     The libvfio-user library assumes that the remote knows the size of
> +     *     the data to be migrated at boot time, but that is not the case with
> +     *     VMSDs, as it can contain a variable-size buffer. 0x20000 is used
> +     *     as a sufficiently large buffer to demonstrate migration, but that
> +     *     cannot be used as a solution.
> +     *
> +     */

libvfio-user has the vfu_migration_callbacks_t interface that allows the
device to save/load more data regardless of the size of the migration
region. I don't see the issue here since the region doesn't need to be
sized to fit the savevm data?