[RFC v4 20/21] vfio-user: migration support
From: John Johnson <john.g.johnson@oracle.com> (sent 12 January 2022)

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
hw/vfio/user-protocol.h | 18 +++++++++++++++++
hw/vfio/migration.c | 30 +++++++++++++--------------
hw/vfio/pci.c | 7 +++++++
hw/vfio/user.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 93 insertions(+), 16 deletions(-)
diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
index 8932311..abe7002 100644
--- a/hw/vfio/user-protocol.h
+++ b/hw/vfio/user-protocol.h
@@ -193,6 +193,10 @@ typedef struct {
char data[];
} VFIOUserDMARW;
+/*
+ * VFIO_USER_DIRTY_PAGES
+ */
+
/*imported from struct vfio_bitmap */
typedef struct {
uint64_t pgsize;
@@ -200,4 +204,18 @@ typedef struct {
char data[];
} VFIOUserBitmap;
+/* imported from struct vfio_iommu_type1_dirty_bitmap_get */
+typedef struct {
+ uint64_t iova;
+ uint64_t size;
+ VFIOUserBitmap bitmap;
+} VFIOUserBitmapRange;
+
+/* imported from struct vfio_iommu_type1_dirty_bitmap */
+typedef struct {
+ VFIOUserHdr hdr;
+ uint32_t argsz;
+ uint32_t flags;
+} VFIOUserDirtyPages;
+
#endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index ff6b45d..df63f5c 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -27,6 +27,7 @@
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"
+#include "user.h"
/*
* Flags to be used as unique delimiters for VFIO devices in the migration
@@ -49,11 +50,13 @@ static int64_t bytes_transferred;
static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
off_t off, bool iswrite)
{
+ VFIORegion *region = &vbasedev->migration->region;
int ret;
- ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
- pread(vbasedev->fd, val, count, off);
- if (ret < count) {
+ ret = iswrite ?
+ VDEV_REGION_WRITE(vbasedev, region->nr, off, count, val, false) :
+ VDEV_REGION_READ(vbasedev, region->nr, off, count, val);
+ if (ret < count) {
error_report("vfio_mig_%s %d byte %s: failed at offset 0x%"
HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count,
vbasedev->name, off, strerror(errno));
@@ -111,9 +114,7 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
uint32_t value)
{
VFIOMigration *migration = vbasedev->migration;
- VFIORegion *region = &migration->region;
- off_t dev_state_off = region->fd_offset +
- VFIO_MIG_STRUCT_OFFSET(device_state);
+ off_t dev_state_off = VFIO_MIG_STRUCT_OFFSET(device_state);
uint32_t device_state;
int ret;
@@ -201,13 +202,13 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
int ret;
ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset));
+ VFIO_MIG_STRUCT_OFFSET(data_offset));
if (ret < 0) {
return ret;
}
ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size));
+ VFIO_MIG_STRUCT_OFFSET(data_size));
if (ret < 0) {
return ret;
}
@@ -233,8 +234,7 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
}
buf_allocated = true;
- ret = vfio_mig_read(vbasedev, buf, sec_size,
- region->fd_offset + data_offset);
+ ret = vfio_mig_read(vbasedev, buf, sec_size, data_offset);
if (ret < 0) {
g_free(buf);
return ret;
@@ -269,7 +269,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
do {
ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset));
+ VFIO_MIG_STRUCT_OFFSET(data_offset));
if (ret < 0) {
return ret;
}
@@ -309,8 +309,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
qemu_get_buffer(f, buf, sec_size);
if (buf_alloc) {
- ret = vfio_mig_write(vbasedev, buf, sec_size,
- region->fd_offset + data_offset);
+ ret = vfio_mig_write(vbasedev, buf, sec_size, data_offset);
g_free(buf);
if (ret < 0) {
@@ -322,7 +321,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
}
ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size));
+ VFIO_MIG_STRUCT_OFFSET(data_size));
if (ret < 0) {
return ret;
}
@@ -334,12 +333,11 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
static int vfio_update_pending(VFIODevice *vbasedev)
{
VFIOMigration *migration = vbasedev->migration;
- VFIORegion *region = &migration->region;
uint64_t pending_bytes = 0;
int ret;
ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes),
- region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes));
+ VFIO_MIG_STRUCT_OFFSET(pending_bytes));
if (ret < 0) {
migration->pending_bytes = 0;
return ret;
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d47b98e..598e9ed 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3677,6 +3677,13 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
goto out_teardown;
}
+ if (!pdev->failover_pair_id) {
+ ret = vfio_migration_probe(&vdev->vbasedev, errp);
+ if (ret) {
+ error_report("%s: Migration disabled", vdev->vbasedev.name);
+ }
+ }
+
vfio_register_err_notifier(vdev);
vfio_register_req_notifier(vdev);
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index 33d8f06..2eac62a 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -1410,6 +1410,52 @@ void vfio_user_reset(VFIOProxy *proxy)
}
}
+static int vfio_user_dirty_bitmap(VFIOProxy *proxy,
+ struct vfio_iommu_type1_dirty_bitmap *cmd,
+ struct vfio_iommu_type1_dirty_bitmap_get
+ *dbitmap)
+{
+ g_autofree struct {
+ VFIOUserDirtyPages msg;
+ VFIOUserBitmapRange range;
+ } *msgp = NULL;
+ int msize, rsize;
+
+ /*
+ * If just the command is sent, the returned bitmap isn't needed.
+ * The bitmap structs are different from the ioctl() versions,
+ * ioctl() returns the bitmap in a local VA
+ */
+ if (dbitmap != NULL) {
+ msize = sizeof(*msgp);
+ rsize = msize + dbitmap->bitmap.size;
+ msgp = g_malloc0(rsize);
+ msgp->range.iova = dbitmap->iova;
+ msgp->range.size = dbitmap->size;
+ msgp->range.bitmap.pgsize = dbitmap->bitmap.pgsize;
+ msgp->range.bitmap.size = dbitmap->bitmap.size;
+ } else {
+ msize = rsize = sizeof(VFIOUserDirtyPages);
+ msgp = g_malloc0(rsize);
+ }
+
+ vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DIRTY_PAGES, msize, 0);
+ msgp->msg.argsz = rsize - sizeof(VFIOUserHdr);
+ msgp->msg.flags = cmd->flags;
+
+ vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, false);
+ if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
+ return -msgp->msg.hdr.error_reply;
+ }
+
+ if (dbitmap != NULL) {
+ memcpy(dbitmap->bitmap.data, &msgp->range.bitmap.data,
+ dbitmap->bitmap.size);
+ }
+
+ return 0;
+}
+
/*
* Socket-based io_ops
@@ -1530,6 +1576,13 @@ static int vfio_user_io_dma_unmap(VFIOContainer *container,
container->async_ops);
}
+static int vfio_user_io_dirty_bitmap(VFIOContainer *container,
+ struct vfio_iommu_type1_dirty_bitmap *bitmap,
+ struct vfio_iommu_type1_dirty_bitmap_get *range)
+{
+ return vfio_user_dirty_bitmap(container->proxy, bitmap, range);
+}
+
static void vfio_user_io_wait_commit(VFIOContainer *container)
{
vfio_user_wait_reqs(container->proxy);
@@ -1538,5 +1591,6 @@ static void vfio_user_io_wait_commit(VFIOContainer *container)
VFIOContIO vfio_cont_io_sock = {
.dma_map = vfio_user_io_dma_map,
.dma_unmap = vfio_user_io_dma_unmap,
+ .dirty_bitmap = vfio_user_io_dirty_bitmap,
.wait_commit = vfio_user_io_wait_commit,
};
--
1.8.3.1
Reply from Thanos Makatos <thanos.makatos@nutanix.com>, 11 February 2022:

> -----Original Message-----
> From: Qemu-devel <qemu-devel-
> bounces+thanos.makatos=nutanix.com@nongnu.org> On Behalf Of John
> Johnson
> Sent: 12 January 2022 00:44
> To: qemu-devel@nongnu.org
> Subject: [RFC v4 20/21] vfio-user: migration support
>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
> index 33d8f06..2eac62a 100644
> --- a/hw/vfio/user.c
> +++ b/hw/vfio/user.c
> @@ -1410,6 +1410,52 @@ void vfio_user_reset(VFIOProxy *proxy)
> }
> }
>
> +static int vfio_user_dirty_bitmap(VFIOProxy *proxy,
> + struct vfio_iommu_type1_dirty_bitmap *cmd,
> + struct vfio_iommu_type1_dirty_bitmap_get
> + *dbitmap)
> +{
> + g_autofree struct {
> + VFIOUserDirtyPages msg;
> + VFIOUserBitmapRange range;
> + } *msgp = NULL;
> + int msize, rsize;
> +
> + /*
> + * If just the command is sent, the returned bitmap isn't needed.
> + * The bitmap structs are different from the ioctl() versions,
> + * ioctl() returns the bitmap in a local VA
> + */
> + if (dbitmap != NULL) {
> + msize = sizeof(*msgp);
> + rsize = msize + dbitmap->bitmap.size;
> + msgp = g_malloc0(rsize);
> + msgp->range.iova = dbitmap->iova;
> + msgp->range.size = dbitmap->size;
> + msgp->range.bitmap.pgsize = dbitmap->bitmap.pgsize;
> + msgp->range.bitmap.size = dbitmap->bitmap.size;
> + } else {
> + msize = rsize = sizeof(VFIOUserDirtyPages);
> + msgp = g_malloc0(rsize);
> + }
> +
> + vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DIRTY_PAGES, msize, 0);
> + msgp->msg.argsz = rsize - sizeof(VFIOUserHdr);
> + msgp->msg.flags = cmd->flags;
> +
> + vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, false);
> + if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
> + return -msgp->msg.hdr.error_reply;
> + }
We need to check argsz in the response: if the server indicates it needed more space, the client needs to retry with a larger argsz.
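Something along these lines, for example (a rough sketch only, reusing the names from the function above and assuming the server passes the size it needs back in the reply's argsz field; not part of the posted patch):

    if (msgp->msg.argsz > rsize - sizeof(VFIOUserHdr)) {
        /* Server needs a larger reply buffer: grow to the requested size. */
        rsize = sizeof(VFIOUserHdr) + msgp->msg.argsz;
        g_free(msgp);
        msgp = g_malloc0(rsize);
        /* ... re-populate the msg and range fields as above, then resend ... */
        vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, false);
    }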
> +
> + if (dbitmap != NULL) {
> + memcpy(dbitmap->bitmap.data, &msgp->range.bitmap.data,
> + dbitmap->bitmap.size);
> + }
> +
> + return 0;
> +}
> +
>
> /*
> * Socket-based io_ops
> @@ -1530,6 +1576,13 @@ static int vfio_user_io_dma_unmap(VFIOContainer *container,
> container->async_ops);
> }
>
> +static int vfio_user_io_dirty_bitmap(VFIOContainer *container,
> + struct vfio_iommu_type1_dirty_bitmap *bitmap,
> + struct vfio_iommu_type1_dirty_bitmap_get *range)
> +{
> + return vfio_user_dirty_bitmap(container->proxy, bitmap, range);
> +}
> +
> static void vfio_user_io_wait_commit(VFIOContainer *container)
> {
> vfio_user_wait_reqs(container->proxy);
> @@ -1538,5 +1591,6 @@ static void vfio_user_io_wait_commit(VFIOContainer *container)
> VFIOContIO vfio_cont_io_sock = {
> .dma_map = vfio_user_io_dma_map,
> .dma_unmap = vfio_user_io_dma_unmap,
> + .dirty_bitmap = vfio_user_io_dirty_bitmap,
> .wait_commit = vfio_user_io_wait_commit,
> };
> --
> 1.8.3.1
>
Reply from John Johnson <john.g.johnson@oracle.com>, 14 February 2022:

> On Feb 11, 2022, at 5:31 AM, Thanos Makatos <thanos.makatos@nutanix.com> wrote:
>
>
>> -----Original Message-----
>> From: Qemu-devel <qemu-devel-
>> bounces+thanos.makatos=nutanix.com@nongnu.org> On Behalf Of John
>> Johnson
>> Sent: 12 January 2022 00:44
>> To: qemu-devel@nongnu.org
>> Subject: [RFC v4 20/21] vfio-user: migration support
>>
>>
>> +static int vfio_user_dirty_bitmap(VFIOProxy *proxy,
>> + struct vfio_iommu_type1_dirty_bitmap *cmd,
>> + struct vfio_iommu_type1_dirty_bitmap_get
>> + *dbitmap)
>> +{
>> + g_autofree struct {
>> + VFIOUserDirtyPages msg;
>> + VFIOUserBitmapRange range;
>> + } *msgp = NULL;
>> + int msize, rsize;
>> +
>> + /*
>> + * If just the command is sent, the returned bitmap isn't needed.
>> + * The bitmap structs are different from the ioctl() versions,
>> + * ioctl() returns the bitmap in a local VA
>> + */
>> + if (dbitmap != NULL) {
>> + msize = sizeof(*msgp);
>> + rsize = msize + dbitmap->bitmap.size;
>> + msgp = g_malloc0(rsize);
>> + msgp->range.iova = dbitmap->iova;
>> + msgp->range.size = dbitmap->size;
>> + msgp->range.bitmap.pgsize = dbitmap->bitmap.pgsize;
>> + msgp->range.bitmap.size = dbitmap->bitmap.size;
>> + } else {
>> + msize = rsize = sizeof(VFIOUserDirtyPages);
>> + msgp = g_malloc0(rsize);
>> + }
>> +
>> + vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DIRTY_PAGES, msize, 0);
>> + msgp->msg.argsz = rsize - sizeof(VFIOUserHdr);
>> + msgp->msg.flags = cmd->flags;
>> +
>> + vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, false);
>> + if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
>> + return -msgp->msg.hdr.error_reply;
>> + }
>
> We need to check argsz in the response: if the server indicates it needed more space, the client needs to retry with a larger argsz.
>
The client needs to retry if the server reply can be larger than the client
expects, such as GET_REGION_INFO, where the client doesn’t know how many capabilities
will be returned a priori.
In this case, argsz is derived from dbitmap->bitmap.size, which was set in
vfio_get_dirty_bitmap() to be large enough to cover the entire range:
    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                         BITS_PER_BYTE;
so I’m not sure we’d ever see a case where the reply is larger than expected.
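For a concrete example (assuming 4 KiB host pages): a 1 GiB range is 262144 pages, so range->bitmap.size comes out to ROUND_UP(262144, 64) / 8 = 32768 bytes, which is already enough to hold the dirty bitmap for the whole range, so the reply can never need more space than the client allocated.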
JJ
>> +
>> + if (dbitmap != NULL) {
>> + memcpy(dbitmap->bitmap.data, &msgp->range.bitmap.data,
>> + dbitmap->bitmap.size);
>> + }
>> +
>> + return 0;
>> +}
>> +
>>
Reply from Thanos Makatos <thanos.makatos@nutanix.com>:

> -----Original Message-----
> From: John Johnson <john.g.johnson@oracle.com>
> Sent: 14 February 2022 18:50
> To: Thanos Makatos <thanos.makatos@nutanix.com>
> Cc: qemu-devel@nongnu.org
> Subject: Re: [RFC v4 20/21] vfio-user: migration support
>
>
>
> > On Feb 11, 2022, at 5:31 AM, Thanos Makatos
> <thanos.makatos@nutanix.com> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Qemu-devel <qemu-devel-
> >> bounces+thanos.makatos=nutanix.com@nongnu.org> On Behalf Of John
> >> Johnson
> >> Sent: 12 January 2022 00:44
> >> To: qemu-devel@nongnu.org
> >> Subject: [RFC v4 20/21] vfio-user: migration support
> >>
> >>
> >> +static int vfio_user_dirty_bitmap(VFIOProxy *proxy,
> >> + struct vfio_iommu_type1_dirty_bitmap *cmd,
> >> + struct vfio_iommu_type1_dirty_bitmap_get
> >> + *dbitmap)
> >> +{
> >> + g_autofree struct {
> >> + VFIOUserDirtyPages msg;
> >> + VFIOUserBitmapRange range;
> >> + } *msgp = NULL;
> >> + int msize, rsize;
> >> +
> >> + /*
> >> + * If just the command is sent, the returned bitmap isn't needed.
> >> + * The bitmap structs are different from the ioctl() versions,
> >> + * ioctl() returns the bitmap in a local VA
> >> + */
> >> + if (dbitmap != NULL) {
> >> + msize = sizeof(*msgp);
> >> + rsize = msize + dbitmap->bitmap.size;
> >> + msgp = g_malloc0(rsize);
> >> + msgp->range.iova = dbitmap->iova;
> >> + msgp->range.size = dbitmap->size;
> >> + msgp->range.bitmap.pgsize = dbitmap->bitmap.pgsize;
> >> + msgp->range.bitmap.size = dbitmap->bitmap.size;
> >> + } else {
> >> + msize = rsize = sizeof(VFIOUserDirtyPages);
> >> + msgp = g_malloc0(rsize);
> >> + }
> >> +
> >> + vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DIRTY_PAGES, msize, 0);
> >> + msgp->msg.argsz = rsize - sizeof(VFIOUserHdr);
> >> + msgp->msg.flags = cmd->flags;
> >> +
> >> + vfio_user_send_wait(proxy, &msgp->msg.hdr, NULL, rsize, false);
> >> + if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
> >> + return -msgp->msg.hdr.error_reply;
> >> + }
> >
> > We need to check argsz in the response: if the server indicates it needed more space, the client needs to retry with a larger argsz.
> >
>
> The client needs to retry if the server reply can be larger than the client
> expects, such as GET_REGION_INFO, where the client doesn’t know how many
> capabilities will be returned a priori.
>
> In this case, argsz is derived from dbitmap->bitmap.size, which was set in
> vfio_get_dirty_bitmap() to be large enough to cover the entire range:
>
> pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
> range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> BITS_PER_BYTE;
>
> so I’m not sure we’d ever see a case where the reply is larger than expected.
The client is doing the right thing, and most likely at this stage a different argsz would mean a server bug. I did actually run into this case because of a server bug; should we at least check and report an error if it's not what we expect?
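For instance, a minimal sanity check after the VFIO_USER_ERROR test in vfio_user_dirty_bitmap() could look like this (hypothetical sketch, not part of the posted patch; it assumes the server echoes the effective argsz back in the reply):

    if (msgp->msg.argsz != rsize - sizeof(VFIOUserHdr)) {
        error_report("vfio_user_dirty_bitmap: unexpected reply argsz 0x%x (expected 0x%x)",
                     msgp->msg.argsz, (uint32_t)(rsize - sizeof(VFIOUserHdr)));
        return -EINVAL;
    }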