From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
The multifd received data needs to be reassembled since device state
packets sent via different multifd channels can arrive out-of-order.
Therefore, each VFIO device state packet carries a header indicating its
position in the stream.
The raw device state data is saved into a VFIOStateBuffer for later
in-order loading into the device.
The last such VFIO device state packet should have
VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
---
hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
hw/vfio/migration-multifd.h | 3 ++
hw/vfio/migration.c | 1 +
hw/vfio/trace-events | 1 +
4 files changed, 108 insertions(+)
diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
index c2defc0efef0..5d5ee1393674 100644
--- a/hw/vfio/migration-multifd.c
+++ b/hw/vfio/migration-multifd.c
@@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
} VFIOStateBuffer;
typedef struct VFIOMultifd {
+ VFIOStateBuffers load_bufs;
+ QemuCond load_bufs_buffer_ready_cond;
+ QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
+ uint32_t load_buf_idx;
+ uint32_t load_buf_idx_last;
} VFIOMultifd;
static void vfio_state_buffer_clear(gpointer data)
@@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
+static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
+ VFIODeviceStatePacket *packet,
+ size_t packet_total_size,
+ Error **errp)
+{
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ VFIOStateBuffer *lb;
+
+ vfio_state_buffers_assert_init(&multifd->load_bufs);
+ if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
+ vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
+ }
+
+ lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
+ if (lb->is_present) {
+ error_setg(errp, "state buffer %" PRIu32 " already filled",
+ packet->idx);
+ return false;
+ }
+
+ assert(packet->idx >= multifd->load_buf_idx);
+
+ lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
+ lb->len = packet_total_size - sizeof(*packet);
+ lb->is_present = true;
+
+ return true;
+}
+
+bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
+ Error **errp)
+{
+ VFIODevice *vbasedev = opaque;
+ VFIOMigration *migration = vbasedev->migration;
+ VFIOMultifd *multifd = migration->multifd;
+ VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
+
+ /*
+ * Holding BQL here would violate the lock order and can cause
+ * a deadlock once we attempt to lock load_bufs_mutex below.
+ */
+ assert(!bql_locked());
+
+ if (!vfio_multifd_transfer_enabled(vbasedev)) {
+ error_setg(errp,
+ "got device state packet but not doing multifd transfer");
+ return false;
+ }
+
+ assert(multifd);
+
+ if (data_size < sizeof(*packet)) {
+ error_setg(errp, "packet too short at %zu (min is %zu)",
+ data_size, sizeof(*packet));
+ return false;
+ }
+
+ if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
+ error_setg(errp, "packet has unknown version %" PRIu32,
+ packet->version);
+ return false;
+ }
+
+ if (packet->idx == UINT32_MAX) {
+ error_setg(errp, "packet has too high idx");
+ return false;
+ }
+
+ trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
+
+ QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
+
+ /* config state packet should be the last one in the stream */
+ if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
+ multifd->load_buf_idx_last = packet->idx;
+ }
+
+ if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, errp)) {
+ return false;
+ }
+
+ qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
+
+ return true;
+}
+
VFIOMultifd *vfio_multifd_new(void)
{
VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
+ vfio_state_buffers_init(&multifd->load_bufs);
+
+ qemu_mutex_init(&multifd->load_bufs_mutex);
+
+ multifd->load_buf_idx = 0;
+ multifd->load_buf_idx_last = UINT32_MAX;
+ qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
+
return multifd;
}
void vfio_multifd_free(VFIOMultifd *multifd)
{
+ qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
+ qemu_mutex_destroy(&multifd->load_bufs_mutex);
+
g_free(multifd);
}
diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
index 1eefba3b2eed..d5ab7d6f85f5 100644
--- a/hw/vfio/migration-multifd.h
+++ b/hw/vfio/migration-multifd.h
@@ -22,4 +22,7 @@ bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev);
bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp);
+bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
+ Error **errp);
+
#endif
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 4311de763885..abaf4d08d4a9 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -806,6 +806,7 @@ static const SaveVMHandlers savevm_vfio_handlers = {
.load_setup = vfio_load_setup,
.load_cleanup = vfio_load_cleanup,
.load_state = vfio_load_state,
+ .load_state_buffer = vfio_load_state_buffer,
.switchover_ack_needed = vfio_switchover_ack_needed,
};
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 1bebe9877d88..042a3dc54a33 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -153,6 +153,7 @@ vfio_load_device_config_state_start(const char *name) " (%s)"
vfio_load_device_config_state_end(const char *name) " (%s)"
vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
+vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_migration_realize(const char *name) " (%s)"
vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
On 19/02/2025 22:34, Maciej S. Szmigiero wrote:
> External email: Use caution opening links or attachments
>
>
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>
> The multifd received data needs to be reassembled since device state
> packets sent via different multifd channels can arrive out-of-order.
>
> Therefore, each VFIO device state packet carries a header indicating its
> position in the stream.
> The raw device state data is saved into a VFIOStateBuffer for later
> in-order loading into the device.
>
> The last such VFIO device state packet should have
> VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
> hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
> hw/vfio/migration-multifd.h | 3 ++
> hw/vfio/migration.c | 1 +
> hw/vfio/trace-events | 1 +
> 4 files changed, 108 insertions(+)
>
> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
> index c2defc0efef0..5d5ee1393674 100644
> --- a/hw/vfio/migration-multifd.c
> +++ b/hw/vfio/migration-multifd.c
> @@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
> } VFIOStateBuffer;
>
> typedef struct VFIOMultifd {
> + VFIOStateBuffers load_bufs;
> + QemuCond load_bufs_buffer_ready_cond;
> + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
> + uint32_t load_buf_idx;
> + uint32_t load_buf_idx_last;
> } VFIOMultifd;
>
> static void vfio_state_buffer_clear(gpointer data)
> @@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
> return &g_array_index(bufs->array, VFIOStateBuffer, idx);
> }
>
> +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
> + VFIODeviceStatePacket *packet,
> + size_t packet_total_size,
> + Error **errp)
> +{
> + VFIOMigration *migration = vbasedev->migration;
> + VFIOMultifd *multifd = migration->multifd;
> + VFIOStateBuffer *lb;
> +
> + vfio_state_buffers_assert_init(&multifd->load_bufs);
> + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
> + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
> + }
> +
> + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
> + if (lb->is_present) {
> + error_setg(errp, "state buffer %" PRIu32 " already filled",
> + packet->idx);
Let's add vbasedev->name to the error message so we know which device
caused the error.
> + return false;
> + }
> +
> + assert(packet->idx >= multifd->load_buf_idx);
> +
> + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
> + lb->len = packet_total_size - sizeof(*packet);
> + lb->is_present = true;
> +
> + return true;
> +}
> +
> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
> + Error **errp)
> +{
> + VFIODevice *vbasedev = opaque;
> + VFIOMigration *migration = vbasedev->migration;
> + VFIOMultifd *multifd = migration->multifd;
> + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
> +
> + /*
> + * Holding BQL here would violate the lock order and can cause
> + * a deadlock once we attempt to lock load_bufs_mutex below.
> + */
> + assert(!bql_locked());
To be clearer, I'd move the assert down to be just above
"QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);".
> +
> + if (!vfio_multifd_transfer_enabled(vbasedev)) {
> + error_setg(errp,
> + "got device state packet but not doing multifd transfer");
Let's add vbasedev->name to the error message so we know which device
caused the error.
> + return false;
> + }
> +
> + assert(multifd);
> +
> + if (data_size < sizeof(*packet)) {
> + error_setg(errp, "packet too short at %zu (min is %zu)",
> + data_size, sizeof(*packet));
Ditto.
> + return false;
> + }
> +
> + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
> + error_setg(errp, "packet has unknown version %" PRIu32,
> + packet->version);
Ditto.
> + return false;
> + }
> +
> + if (packet->idx == UINT32_MAX) {
> + error_setg(errp, "packet has too high idx");
Ditto.
> + return false;
> + }
> +
> + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
> +
> + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
> +
> + /* config state packet should be the last one in the stream */
> + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
> + multifd->load_buf_idx_last = packet->idx;
> + }
> +
> + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, errp)) {
> + return false;
> + }
> +
> + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
> +
> + return true;
> +}
> +
> VFIOMultifd *vfio_multifd_new(void)
> {
> VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
>
> + vfio_state_buffers_init(&multifd->load_bufs);
> +
> + qemu_mutex_init(&multifd->load_bufs_mutex);
Nit: move qemu_mutex_init() just above qemu_cond_init()?
Thanks.
> +
> + multifd->load_buf_idx = 0;
> + multifd->load_buf_idx_last = UINT32_MAX;
> + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
> +
> return multifd;
> }
>
> void vfio_multifd_free(VFIOMultifd *multifd)
> {
> + qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
> + qemu_mutex_destroy(&multifd->load_bufs_mutex);
> +
> g_free(multifd);
> }
>
> diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
> index 1eefba3b2eed..d5ab7d6f85f5 100644
> --- a/hw/vfio/migration-multifd.h
> +++ b/hw/vfio/migration-multifd.h
> @@ -22,4 +22,7 @@ bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev);
>
> bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp);
>
> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
> + Error **errp);
> +
> #endif
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 4311de763885..abaf4d08d4a9 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -806,6 +806,7 @@ static const SaveVMHandlers savevm_vfio_handlers = {
> .load_setup = vfio_load_setup,
> .load_cleanup = vfio_load_cleanup,
> .load_state = vfio_load_state,
> + .load_state_buffer = vfio_load_state_buffer,
> .switchover_ack_needed = vfio_switchover_ack_needed,
> };
>
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 1bebe9877d88..042a3dc54a33 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -153,6 +153,7 @@ vfio_load_device_config_state_start(const char *name) " (%s)"
> vfio_load_device_config_state_end(const char *name) " (%s)"
> vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
> vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
> +vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> vfio_migration_realize(const char *name) " (%s)"
> vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
> vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
On 2.03.2025 14:12, Avihai Horon wrote:
>
> On 19/02/2025 22:34, Maciej S. Szmigiero wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> The multifd received data needs to be reassembled since device state
>> packets sent via different multifd channels can arrive out-of-order.
>>
>> Therefore, each VFIO device state packet carries a header indicating its
>> position in the stream.
>> The raw device state data is saved into a VFIOStateBuffer for later
>> in-order loading into the device.
>>
>> The last such VFIO device state packet should have
>> VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>> hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
>> hw/vfio/migration-multifd.h | 3 ++
>> hw/vfio/migration.c | 1 +
>> hw/vfio/trace-events | 1 +
>> 4 files changed, 108 insertions(+)
>>
>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>> index c2defc0efef0..5d5ee1393674 100644
>> --- a/hw/vfio/migration-multifd.c
>> +++ b/hw/vfio/migration-multifd.c
>> @@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
>> } VFIOStateBuffer;
>>
>> typedef struct VFIOMultifd {
>> + VFIOStateBuffers load_bufs;
>> + QemuCond load_bufs_buffer_ready_cond;
>> + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>> + uint32_t load_buf_idx;
>> + uint32_t load_buf_idx_last;
>> } VFIOMultifd;
>>
>> static void vfio_state_buffer_clear(gpointer data)
>> @@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
>> return &g_array_index(bufs->array, VFIOStateBuffer, idx);
>> }
>>
>> +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
>> + VFIODeviceStatePacket *packet,
>> + size_t packet_total_size,
>> + Error **errp)
>> +{
>> + VFIOMigration *migration = vbasedev->migration;
>> + VFIOMultifd *multifd = migration->multifd;
>> + VFIOStateBuffer *lb;
>> +
>> + vfio_state_buffers_assert_init(&multifd->load_bufs);
>> + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
>> + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
>> + }
>> +
>> + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
>> + if (lb->is_present) {
>> + error_setg(errp, "state buffer %" PRIu32 " already filled",
>> + packet->idx);
>
> Let's add vbasedev->name to the error message so we know which device caused the error.
Done.
>> + return false;
>> + }
>> +
>> + assert(packet->idx >= multifd->load_buf_idx);
>> +
>> + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
>> + lb->len = packet_total_size - sizeof(*packet);
>> + lb->is_present = true;
>> +
>> + return true;
>> +}
>> +
>> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>> + Error **errp)
>> +{
>> + VFIODevice *vbasedev = opaque;
>> + VFIOMigration *migration = vbasedev->migration;
>> + VFIOMultifd *multifd = migration->multifd;
>> + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
>> +
>> + /*
>> + * Holding BQL here would violate the lock order and can cause
>> + * a deadlock once we attempt to lock load_bufs_mutex below.
>> + */
>> + assert(!bql_locked());
>
> To be clearer, I'd move the assert down to be just above "QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);".
Moved there.
>> +
>> + if (!vfio_multifd_transfer_enabled(vbasedev)) {
>> + error_setg(errp,
>> + "got device state packet but not doing multifd transfer");
>
> Let's add vbasedev->name to the error message so we know which device caused the error.
Done.
>> + return false;
>> + }
>> +
>> + assert(multifd);
>> +
>> + if (data_size < sizeof(*packet)) {
>> + error_setg(errp, "packet too short at %zu (min is %zu)",
>> + data_size, sizeof(*packet));
>
> Ditto.
Done.
>> + return false;
>> + }
>> +
>> + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
>> + error_setg(errp, "packet has unknown version %" PRIu32,
>> + packet->version);
>
> Ditto.
Done.
>> + return false;
>> + }
>> +
>> + if (packet->idx == UINT32_MAX) {
>> + error_setg(errp, "packet has too high idx");
>
> Ditto.
Done.
>> + return false;
>> + }
>> +
>> + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
>> +
>> + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>> +
>> + /* config state packet should be the last one in the stream */
>> + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
>> + multifd->load_buf_idx_last = packet->idx;
>> + }
>> +
>> + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, errp)) {
>> + return false;
>> + }
>> +
>> + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
>> +
>> + return true;
>> +}
>> +
>> VFIOMultifd *vfio_multifd_new(void)
>> {
>> VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
>>
>> + vfio_state_buffers_init(&multifd->load_bufs);
>> +
>> + qemu_mutex_init(&multifd->load_bufs_mutex);
>
> Nit: move qemu_mutex_init() just above qemu_cond_init()?
It's in a separate "block" because it is common for all 3
conditions in the ultimate form of code (and most of the
later variables too), rather just this single condition.
> Thanks.
Thanks,
Maciej
>> +
>> + multifd->load_buf_idx = 0;
>> + multifd->load_buf_idx_last = UINT32_MAX;
>> + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
>> +
>> return multifd;
>> }
>>
On 2/19/25 21:34, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>
> The multifd received data needs to be reassembled since device state
> packets sent via different multifd channels can arrive out-of-order.
>
> Therefore, each VFIO device state packet carries a header indicating its
> position in the stream.
> The raw device state data is saved into a VFIOStateBuffer for later
> in-order loading into the device.
>
> The last such VFIO device state packet should have
> VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
> hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
> hw/vfio/migration-multifd.h | 3 ++
> hw/vfio/migration.c | 1 +
> hw/vfio/trace-events | 1 +
> 4 files changed, 108 insertions(+)
>
> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
> index c2defc0efef0..5d5ee1393674 100644
> --- a/hw/vfio/migration-multifd.c
> +++ b/hw/vfio/migration-multifd.c
> @@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
> } VFIOStateBuffer;
>
> typedef struct VFIOMultifd {
> + VFIOStateBuffers load_bufs;
> + QemuCond load_bufs_buffer_ready_cond;
> + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
> + uint32_t load_buf_idx;
> + uint32_t load_buf_idx_last;
> } VFIOMultifd;
>
> static void vfio_state_buffer_clear(gpointer data)
> @@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
> return &g_array_index(bufs->array, VFIOStateBuffer, idx);
> }
>
this routine expects load_bufs_mutex to be locked ? May be say so.
> +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
could you pass VFIOMultifd* instead ?
> + VFIODeviceStatePacket *packet,
> + size_t packet_total_size,
> + Error **errp)
> +{
> + VFIOMigration *migration = vbasedev->migration;
> + VFIOMultifd *multifd = migration->multifd;
> + VFIOStateBuffer *lb;
> +
> + vfio_state_buffers_assert_init(&multifd->load_bufs);
> + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
> + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
> + }
> +
> + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
> + if (lb->is_present) {
> + error_setg(errp, "state buffer %" PRIu32 " already filled",
> + packet->idx);
> + return false;
> + }
> +
> + assert(packet->idx >= multifd->load_buf_idx);
> +
> + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
> + lb->len = packet_total_size - sizeof(*packet);
> + lb->is_present = true;
> +
> + return true;
> +}
> +
> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
> + Error **errp)
AFAICS, the only users of the .load_state_buffer() handlers is
multifd_device_state_recv().
Please rename to vfio_multifd_load_state_buffer().
> +{
> + VFIODevice *vbasedev = opaque;
> + VFIOMigration *migration = vbasedev->migration;
> + VFIOMultifd *multifd = migration->multifd;
> + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
> +
> + /*
> + * Holding BQL here would violate the lock order and can cause
> + * a deadlock once we attempt to lock load_bufs_mutex below.
> + */
> + assert(!bql_locked());
> +
> + if (!vfio_multifd_transfer_enabled(vbasedev)) {
> + error_setg(errp,
> + "got device state packet but not doing multifd transfer");
> + return false;
> + }
> +
> + assert(multifd);
> +
> + if (data_size < sizeof(*packet)) {
> + error_setg(errp, "packet too short at %zu (min is %zu)",
> + data_size, sizeof(*packet));
> + return false;
> + }
> +
> + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
> + error_setg(errp, "packet has unknown version %" PRIu32,
> + packet->version);
> + return false;
> + }
> +
> + if (packet->idx == UINT32_MAX) {
> + error_setg(errp, "packet has too high idx");
or "packet index is invalid" ?
> + return false;
> + }
> +
> + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
> +
> + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
Using WITH_QEMU_LOCK_GUARD() would be cleaner I think.
> +
> + /* config state packet should be the last one in the stream */
> + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
> + multifd->load_buf_idx_last = packet->idx;
> + }
> +
> + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, errp)) {
> + return false;
> + }
> +
> + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
> +
> + return true;
> +}
> +
> VFIOMultifd *vfio_multifd_new(void)
> {
> VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
>
> + vfio_state_buffers_init(&multifd->load_bufs);
> +
> + qemu_mutex_init(&multifd->load_bufs_mutex);
> +
> + multifd->load_buf_idx = 0;
> + multifd->load_buf_idx_last = UINT32_MAX;
> + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
> +
> return multifd;
> }
>
> void vfio_multifd_free(VFIOMultifd *multifd)
> {
> + qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
> + qemu_mutex_destroy(&multifd->load_bufs_mutex);
> +
> g_free(multifd);
> }
>
> diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h
> index 1eefba3b2eed..d5ab7d6f85f5 100644
> --- a/hw/vfio/migration-multifd.h
> +++ b/hw/vfio/migration-multifd.h
> @@ -22,4 +22,7 @@ bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev);
>
> bool vfio_multifd_transfer_setup(VFIODevice *vbasedev, Error **errp);
>
> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
> + Error **errp);
> +
> #endif
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 4311de763885..abaf4d08d4a9 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -806,6 +806,7 @@ static const SaveVMHandlers savevm_vfio_handlers = {
> .load_setup = vfio_load_setup,
> .load_cleanup = vfio_load_cleanup,
> .load_state = vfio_load_state,
> + .load_state_buffer = vfio_load_state_buffer,
> .switchover_ack_needed = vfio_switchover_ack_needed,> };
>
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 1bebe9877d88..042a3dc54a33 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -153,6 +153,7 @@ vfio_load_device_config_state_start(const char *name) " (%s)"
> vfio_load_device_config_state_end(const char *name) " (%s)"
> vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
> vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d"
> +vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32
> vfio_migration_realize(const char *name) " (%s)"
> vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
> vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
>
Thanks,
C.
On 26.02.2025 11:43, Cédric Le Goater wrote:
> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> The multifd received data needs to be reassembled since device state
>> packets sent via different multifd channels can arrive out-of-order.
>>
>> Therefore, each VFIO device state packet carries a header indicating its
>> position in the stream.
>> The raw device state data is saved into a VFIOStateBuffer for later
>> in-order loading into the device.
>>
>> The last such VFIO device state packet should have
>> VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>> hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
>> hw/vfio/migration-multifd.h | 3 ++
>> hw/vfio/migration.c | 1 +
>> hw/vfio/trace-events | 1 +
>> 4 files changed, 108 insertions(+)
>>
>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>> index c2defc0efef0..5d5ee1393674 100644
>> --- a/hw/vfio/migration-multifd.c
>> +++ b/hw/vfio/migration-multifd.c
>> @@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
>> } VFIOStateBuffer;
>> typedef struct VFIOMultifd {
>> + VFIOStateBuffers load_bufs;
>> + QemuCond load_bufs_buffer_ready_cond;
>> + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>> + uint32_t load_buf_idx;
>> + uint32_t load_buf_idx_last;
>> } VFIOMultifd;
>> static void vfio_state_buffer_clear(gpointer data)
>> @@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
>> return &g_array_index(bufs->array, VFIOStateBuffer, idx);
>> }
>
> this routine expects load_bufs_mutex to be locked ? May be say so.
I guess the comment above pertains to the vfio_load_state_buffer_insert()
below.
Do you mean it should have a comment that it expects to be called
under load_bufs_mutex?
>> +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
>
> could you pass VFIOMultifd* instead ?
No, it needs vbasedev->migration_max_queued_buffers too (introduced
in later patch).
Also, most of VFIO routines (besides very small helpers/wrappers)
take VFIODevice *.
>> + VFIODeviceStatePacket *packet,
>> + size_t packet_total_size,
>> + Error **errp)
>> +{
>> + VFIOMigration *migration = vbasedev->migration;
>> + VFIOMultifd *multifd = migration->multifd;
>> + VFIOStateBuffer *lb;
>> +
>> + vfio_state_buffers_assert_init(&multifd->load_bufs);
>> + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
>> + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
>> + }
>> +
>> + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
>> + if (lb->is_present) {
>> + error_setg(errp, "state buffer %" PRIu32 " already filled",
>> + packet->idx);
>> + return false;
>> + }
>> +
>> + assert(packet->idx >= multifd->load_buf_idx);
>> +
>> + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
>> + lb->len = packet_total_size - sizeof(*packet);
>> + lb->is_present = true;
>> +
>> + return true;
>> +}
>> +
>> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>> + Error **errp)
>
>
> AFAICS, the only users of the .load_state_buffer() handlers is
> multifd_device_state_recv().
>
> Please rename to vfio_multifd_load_state_buffer().
Renamed it accordingly.
>> +{
>> + VFIODevice *vbasedev = opaque;
>> + VFIOMigration *migration = vbasedev->migration;
>> + VFIOMultifd *multifd = migration->multifd;
>> + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
>> +
>> + /*
>> + * Holding BQL here would violate the lock order and can cause
>> + * a deadlock once we attempt to lock load_bufs_mutex below.
>> + */
>> + assert(!bql_locked());
>> +
>> + if (!vfio_multifd_transfer_enabled(vbasedev)) {
>> + error_setg(errp,
>> + "got device state packet but not doing multifd transfer");
>> + return false;
>> + }
>> +
>> + assert(multifd);
>> +
>> + if (data_size < sizeof(*packet)) {
>> + error_setg(errp, "packet too short at %zu (min is %zu)",
>> + data_size, sizeof(*packet));
>> + return false;
>> + }
>> +
>> + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
>> + error_setg(errp, "packet has unknown version %" PRIu32,
>> + packet->version);
>> + return false;
>> + }
>> +
>> + if (packet->idx == UINT32_MAX) {
>> + error_setg(errp, "packet has too high idx");
>
> or "packet index is invalid" ?
Changed the error message.
>> + return false;
>> + }
>> +
>> + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
>> +
>> + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>
> Using WITH_QEMU_LOCK_GUARD() would be cleaner I think.
Changed into a WITH_QEMU_LOCK_GUARD() block.
>
>
> Thanks,
>
> C.
Thanks,
Maciej
On 2/26/25 22:04, Maciej S. Szmigiero wrote:
> On 26.02.2025 11:43, Cédric Le Goater wrote:
>> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>
>>> The multifd received data needs to be reassembled since device state
>>> packets sent via different multifd channels can arrive out-of-order.
>>>
>>> Therefore, each VFIO device state packet carries a header indicating its
>>> position in the stream.
>>> The raw device state data is saved into a VFIOStateBuffer for later
>>> in-order loading into the device.
>>>
>>> The last such VFIO device state packet should have
>>> VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
>>>
>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>> ---
>>> hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
>>> hw/vfio/migration-multifd.h | 3 ++
>>> hw/vfio/migration.c | 1 +
>>> hw/vfio/trace-events | 1 +
>>> 4 files changed, 108 insertions(+)
>>>
>>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>>> index c2defc0efef0..5d5ee1393674 100644
>>> --- a/hw/vfio/migration-multifd.c
>>> +++ b/hw/vfio/migration-multifd.c
>>> @@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
>>> } VFIOStateBuffer;
>>> typedef struct VFIOMultifd {
>>> + VFIOStateBuffers load_bufs;
>>> + QemuCond load_bufs_buffer_ready_cond;
>>> + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>> + uint32_t load_buf_idx;
>>> + uint32_t load_buf_idx_last;
>>> } VFIOMultifd;
>>> static void vfio_state_buffer_clear(gpointer data)
>>> @@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
>>> return &g_array_index(bufs->array, VFIOStateBuffer, idx);
>>> }
>>
>> this routine expects load_bufs_mutex to be locked ? May be say so.
>
> I guess the comment above pertains to the vfio_load_state_buffer_insert()
> below.
>
> Do you mean it should have a comment that it expects to be called
> under load_bufs_mutex?
Just a one liner like :
/* called with load_bufs_mutex locked */
?
It's good to have for the future generations.
>
>>> +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
>>
>> could you pass VFIOMultifd* instead ?
>
> No, it needs vbasedev->migration_max_queued_buffers too (introduced
> in later patch).
> > Also, most of VFIO routines (besides very small helpers/wrappers)
> take VFIODevice *.
OK. It's minor but I prefer when parameters are limited to the minimum.
Having 'VFIODevice *' opens doors to a lot of state.
Thanks,
C.
>
>>> + VFIODeviceStatePacket *packet,
>>> + size_t packet_total_size,
>>> + Error **errp)
>>> +{
>>> + VFIOMigration *migration = vbasedev->migration;
>>> + VFIOMultifd *multifd = migration->multifd;
>>> + VFIOStateBuffer *lb;
>>> +
>>> + vfio_state_buffers_assert_init(&multifd->load_bufs);
>>> + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
>>> + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
>>> + }
>>> +
>>> + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
>>> + if (lb->is_present) {
>>> + error_setg(errp, "state buffer %" PRIu32 " already filled",
>>> + packet->idx);
>>> + return false;
>>> + }
>>> +
>>> + assert(packet->idx >= multifd->load_buf_idx);
>>> +
>>> + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
>>> + lb->len = packet_total_size - sizeof(*packet);
>>> + lb->is_present = true;
>>> +
>>> + return true;
>>> +}
>>> +
>>> +bool vfio_load_state_buffer(void *opaque, char *data, size_t data_size,
>>> + Error **errp)
>>
>>
>> AFAICS, the only users of the .load_state_buffer() handlers is
>> multifd_device_state_recv().
>>
>> Please rename to vfio_multifd_load_state_buffer().
>
> Renamed it accordingly.
>
>>> +{
>>> + VFIODevice *vbasedev = opaque;
>>> + VFIOMigration *migration = vbasedev->migration;
>>> + VFIOMultifd *multifd = migration->multifd;
>>> + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
>>> +
>>> + /*
>>> + * Holding BQL here would violate the lock order and can cause
>>> + * a deadlock once we attempt to lock load_bufs_mutex below.
>>> + */
>>> + assert(!bql_locked());
>>> +
>>> + if (!vfio_multifd_transfer_enabled(vbasedev)) {
>>> + error_setg(errp,
>>> + "got device state packet but not doing multifd transfer");
>>> + return false;
>>> + }
>>> +
>>> + assert(multifd);
>>> +
>>> + if (data_size < sizeof(*packet)) {
>>> + error_setg(errp, "packet too short at %zu (min is %zu)",
>>> + data_size, sizeof(*packet));
>>> + return false;
>>> + }
>>> +
>>> + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
>>> + error_setg(errp, "packet has unknown version %" PRIu32,
>>> + packet->version);
>>> + return false;
>>> + }
>>> +
>>> + if (packet->idx == UINT32_MAX) {
>>> + error_setg(errp, "packet has too high idx");
>>
>> or "packet index is invalid" ?
>
> Changed the error message.
>
>>> + return false;
>>> + }
>>> +
>>> + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
>>> +
>>> + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
>>
>> Using WITH_QEMU_LOCK_GUARD() would be cleaner I think.
>
> Changed into a WITH_QEMU_LOCK_GUARD() block.
>
>>
>>
>> Thanks,
>>
>> C.
>
> Thanks,
> Maciej
>
On 28.02.2025 09:09, Cédric Le Goater wrote:
> On 2/26/25 22:04, Maciej S. Szmigiero wrote:
>> On 26.02.2025 11:43, Cédric Le Goater wrote:
>>> On 2/19/25 21:34, Maciej S. Szmigiero wrote:
>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>
>>>> The multifd received data needs to be reassembled since device state
>>>> packets sent via different multifd channels can arrive out-of-order.
>>>>
>>>> Therefore, each VFIO device state packet carries a header indicating its
>>>> position in the stream.
>>>> The raw device state data is saved into a VFIOStateBuffer for later
>>>> in-order loading into the device.
>>>>
>>>> The last such VFIO device state packet should have
>>>> VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state.
>>>>
>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>> ---
>>>> hw/vfio/migration-multifd.c | 103 ++++++++++++++++++++++++++++++++++++
>>>> hw/vfio/migration-multifd.h | 3 ++
>>>> hw/vfio/migration.c | 1 +
>>>> hw/vfio/trace-events | 1 +
>>>> 4 files changed, 108 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c
>>>> index c2defc0efef0..5d5ee1393674 100644
>>>> --- a/hw/vfio/migration-multifd.c
>>>> +++ b/hw/vfio/migration-multifd.c
>>>> @@ -42,6 +42,11 @@ typedef struct VFIOStateBuffer {
>>>> } VFIOStateBuffer;
>>>> typedef struct VFIOMultifd {
>>>> + VFIOStateBuffers load_bufs;
>>>> + QemuCond load_bufs_buffer_ready_cond;
>>>> + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
>>>> + uint32_t load_buf_idx;
>>>> + uint32_t load_buf_idx_last;
>>>> } VFIOMultifd;
>>>> static void vfio_state_buffer_clear(gpointer data)
>>>> @@ -87,15 +92,113 @@ static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, guint idx)
>>>> return &g_array_index(bufs->array, VFIOStateBuffer, idx);
>>>> }
>>>
>>> this routine expects load_bufs_mutex to be locked ? May be say so.
>>
>> I guess the comment above pertains to the vfio_load_state_buffer_insert()
>> below.
>>
>> Do you mean it should have a comment that it expects to be called
>> under load_bufs_mutex?
>
> Just a one liner like :
>
> /* called with load_bufs_mutex locked */
>
> ?
>
> It's good to have for the future generations.
Okay, done.
>>
>>>> +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
>>>
>>> could you pass VFIOMultifd* instead ?
>>
>> No, it needs vbasedev->migration_max_queued_buffers too (introduced
>> in later patch).
>> > Also, most of VFIO routines (besides very small helpers/wrappers)
>> take VFIODevice *.
>
> OK. It's minor but I prefer when parameters are limited to the minimum.
> Having 'VFIODevice *' opens doors to a lot of state.
>
>
> Thanks,
>
> C.
>
Thanks.
Maciej
© 2016 - 2026 Red Hat, Inc.