[PATCH v3] migration/rdma: add x-rdma-chunk-size parameter

Samuel Zhang posted 1 patch 6 days, 8 hours ago
git fetch https://github.com/patchew-project/qemu tags/patchew/20260327065006.2567463-1-guoqing.zhang@amd.com
Maintainers: Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>, Li Zhijian <lizhijian@fujitsu.com>, Eric Blake <eblake@redhat.com>, Markus Armbruster <armbru@redhat.com>
There is a newer version of this series
[PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Samuel Zhang 6 days, 8 hours ago
The default 1MB RDMA chunk size causes slow live migration because
each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.

Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
faster migration.
Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`

Performance with RDMA live migration of 8GB RAM VM:

| x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
|-----------------------|----------|-------------------|
| 1M (default)          | 37.915   |  1,007            |
| 32M                   | 17.880   |  2,260            |
| 1024M                 |  4.368   | 17,529            |

Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
---
v2:
- Renamed x-rdma-chunk-shift to x-rdma-chunk-size (byte count)
- Added validation in migrate_params_check()
- Added hmp_migrate_set_parameter() support
- Added hmp_info_migrate_parameters() support
- Added migrate_mark_all_params_present()
- Use qemu_strtosz() for size suffix support
v3:
- Use visit_type_size() in HMP set parameter
- Use MiB/GiB constants
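
The validation added in migrate_params_check() rejects anything that is not a
power of two in [1MiB, 1024MiB].  A minimal standalone sketch of that check
(the helper name here is hypothetical, not QEMU's):

```c
#include <stdbool.h>
#include <stdint.h>

#define MIB (1024ULL * 1024)
#define GIB (1024ULL * MIB)

/* Sketch of the bounds check added in migrate_params_check():
 * the chunk size must be a power of two in [1MiB, 1024MiB]. */
static bool rdma_chunk_size_valid(uint64_t size)
{
    return size >= MIB && size <= GIB && (size & (size - 1)) == 0;
}
```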

 migration/migration-hmp-cmds.c | 11 +++++++++++
 migration/options.c            | 33 ++++++++++++++++++++++++++++++++-
 migration/options.h            |  1 +
 migration/rdma.c               | 30 ++++++++++++++++--------------
 qapi/migration.json            | 12 ++++++++++--
 5 files changed, 70 insertions(+), 17 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 0a193b8f54..4f6c1dbf89 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -451,6 +451,13 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
                            params->direct_io ? "on" : "off");
         }
 
+        if (params->has_x_rdma_chunk_size) {
+            monitor_printf(mon, "%s: %" PRIu64 " bytes\n",
+                           MigrationParameter_str(
+                               MIGRATION_PARAMETER_X_RDMA_CHUNK_SIZE),
+                           params->x_rdma_chunk_size);
+        }
+
         assert(params->has_cpr_exec_command);
         monitor_print_cpr_exec_command(mon, params->cpr_exec_command);
     }
@@ -734,6 +741,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
         p->has_direct_io = true;
         visit_type_bool(v, param, &p->direct_io, &err);
         break;
+    case MIGRATION_PARAMETER_X_RDMA_CHUNK_SIZE:
+        p->has_x_rdma_chunk_size = true;
+        visit_type_size(v, param, &p->x_rdma_chunk_size, &err);
+        break;
     case MIGRATION_PARAMETER_CPR_EXEC_COMMAND: {
         /*
          * NOTE: g_autofree will only auto g_free() the strv array when
diff --git a/migration/options.c b/migration/options.c
index f33b297929..bc61c8665d 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -13,6 +13,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
+#include "qemu/units.h"
 #include "exec/target_page.h"
 #include "qapi/clone-visitor.h"
 #include "qapi/error.h"
@@ -90,6 +91,7 @@ const PropertyInfo qdev_prop_StrOrNull;
 
 #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
 #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
+#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE           MiB     /* 1MB */
 
 const Property migration_properties[] = {
     DEFINE_PROP_BOOL("store-global-state", MigrationState,
@@ -183,6 +185,9 @@ const Property migration_properties[] = {
     DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
                        parameters.zero_page_detection,
                        ZERO_PAGE_DETECTION_MULTIFD),
+    DEFINE_PROP_UINT64("x-rdma-chunk-size", MigrationState,
+                      parameters.x_rdma_chunk_size,
+                      DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE),
 
     /* Migration capabilities */
     DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -993,6 +998,15 @@ ZeroPageDetection migrate_zero_page_detection(void)
     return s->parameters.zero_page_detection;
 }
 
+uint64_t migrate_rdma_chunk_size(void)
+{
+    MigrationState *s = migrate_get_current();
+    uint64_t size = s->parameters.x_rdma_chunk_size;
+
+    assert(MiB <= size && size <= GiB && is_power_of_2(size));
+    return size;
+}
+
 /* parameters helpers */
 
 AnnounceParameters *migrate_announce_params(void)
@@ -1055,7 +1069,7 @@ static void migrate_mark_all_params_present(MigrationParameters *p)
         &p->has_announce_step, &p->has_block_bitmap_mapping,
         &p->has_x_vcpu_dirty_limit_period, &p->has_vcpu_dirty_limit,
         &p->has_mode, &p->has_zero_page_detection, &p->has_direct_io,
-        &p->has_cpr_exec_command,
+        &p->has_x_rdma_chunk_size, &p->has_cpr_exec_command,
     };
 
     len = ARRAY_SIZE(has_fields);
@@ -1266,6 +1280,15 @@ bool migrate_params_check(MigrationParameters *params, Error **errp)
         return false;
     }
 
+    if (params->has_x_rdma_chunk_size &&
+        (params->x_rdma_chunk_size < MiB ||
+         params->x_rdma_chunk_size > GiB ||
+         !is_power_of_2(params->x_rdma_chunk_size))) {
+        error_setg(errp, "Option x_rdma_chunk_size expects "
+                   "a power of 2 in the range 1MiB to 1024MiB");
+        return false;
+    }
+
     return true;
 }
 
@@ -1391,6 +1414,10 @@ static void migrate_params_test_apply(MigrationParameters *params,
         dest->direct_io = params->direct_io;
     }
 
+    if (params->has_x_rdma_chunk_size) {
+        dest->x_rdma_chunk_size = params->x_rdma_chunk_size;
+    }
+
     if (params->has_cpr_exec_command) {
         dest->cpr_exec_command = params->cpr_exec_command;
     }
@@ -1517,6 +1544,10 @@ static void migrate_params_apply(MigrationParameters *params)
         s->parameters.direct_io = params->direct_io;
     }
 
+    if (params->has_x_rdma_chunk_size) {
+        s->parameters.x_rdma_chunk_size = params->x_rdma_chunk_size;
+    }
+
     if (params->has_cpr_exec_command) {
         qapi_free_strList(s->parameters.cpr_exec_command);
         s->parameters.cpr_exec_command =
diff --git a/migration/options.h b/migration/options.h
index b502871097..b46221998a 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -87,6 +87,7 @@ const char *migrate_tls_creds(void);
 const char *migrate_tls_hostname(void);
 uint64_t migrate_xbzrle_cache_size(void);
 ZeroPageDetection migrate_zero_page_detection(void);
+uint64_t migrate_rdma_chunk_size(void);
 
 /* parameters helpers */
 
diff --git a/migration/rdma.c b/migration/rdma.c
index 55ab85650a..3e37a1d440 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -45,10 +45,12 @@
 #define RDMA_RESOLVE_TIMEOUT_MS 10000
 
 /* Do not merge data if larger than this. */
-#define RDMA_MERGE_MAX (2 * 1024 * 1024)
-#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
+static inline uint64_t rdma_merge_max(void)
+{
+    return migrate_rdma_chunk_size() * 2;
+}
 
-#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
+#define RDMA_SIGNALED_SEND_MAX 512
 
 /*
  * This is only for non-live state being migrated.
@@ -527,21 +529,21 @@ static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
 static inline uint64_t ram_chunk_index(const uint8_t *start,
                                        const uint8_t *host)
 {
-    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
+    return ((uintptr_t) host - (uintptr_t) start) / migrate_rdma_chunk_size();
 }
 
 static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                        uint64_t i)
 {
     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
-                                  (i << RDMA_REG_CHUNK_SHIFT));
+                                  (i * migrate_rdma_chunk_size()));
 }
 
 static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                      uint64_t i)
 {
     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
-                                         (1UL << RDMA_REG_CHUNK_SHIFT);
+                                         migrate_rdma_chunk_size();
 
     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
@@ -1841,6 +1843,7 @@ static int qemu_rdma_write_one(RDMAContext *rdma,
     struct ibv_send_wr *bad_wr;
     int reg_result_idx, ret, count = 0;
     uint64_t chunk, chunks;
+    uint64_t chunk_size = migrate_rdma_chunk_size();
     uint8_t *chunk_start, *chunk_end;
     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
     RDMARegister reg;
@@ -1861,22 +1864,21 @@ retry:
     chunk_start = ram_chunk_start(block, chunk);
 
     if (block->is_ram_block) {
-        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
+        chunks = length / chunk_size;
 
-        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
+        if (chunks && ((length % chunk_size) == 0)) {
             chunks--;
         }
     } else {
-        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
+        chunks = block->length / chunk_size;
 
-        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
+        if (chunks && ((block->length % chunk_size) == 0)) {
             chunks--;
         }
     }
 
     trace_qemu_rdma_write_one_top(chunks + 1,
-                                  (chunks + 1) *
-                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
+                                  (chunks + 1) * chunk_size / 1024 / 1024);
 
     chunk_end = ram_chunk_end(block, chunk + chunks);
 
@@ -2176,7 +2178,7 @@ static int qemu_rdma_write(RDMAContext *rdma,
     rdma->current_length += len;
 
     /* flush it if buffer is too large */
-    if (rdma->current_length >= RDMA_MERGE_MAX) {
+    if (rdma->current_length >= rdma_merge_max()) {
         return qemu_rdma_write_flush(rdma, errp);
     }
 
@@ -3522,7 +3524,7 @@ int rdma_registration_handle(QEMUFile *f)
                 } else {
                     chunk = reg->key.chunk;
                     host_addr = block->local_host_addr +
-                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
+                        (reg->key.chunk * migrate_rdma_chunk_size());
                     /* Check for particularly bad chunk value */
                     if (host_addr < (void *)block->local_host_addr) {
                         error_report("rdma: bad chunk for block %s"
diff --git a/qapi/migration.json b/qapi/migration.json
index 7134d4ce47..292d96c95a 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -806,7 +806,7 @@
 #
 # Features:
 #
-# @unstable: Members @x-checkpoint-delay and
+# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
 #     @x-vcpu-dirty-limit-period are experimental.
 #
 # Since: 2.4
@@ -831,6 +831,7 @@
            'mode',
            'zero-page-detection',
            'direct-io',
+           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
            'cpr-exec-command'] }
 
 ##
@@ -1007,9 +1008,14 @@
 #     is @cpr-exec.  The first list element is the program's filename,
 #     the remainder its arguments.  (Since 10.2)
 #
+# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
+#     Default is 1MiB.  Must be a power of 2 in the range
+#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.
+#     (Since 11.1)
+#
 # Features:
 #
-# @unstable: Members @x-checkpoint-delay and
+# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
 #     @x-vcpu-dirty-limit-period are experimental.
 #
 # Since: 2.4
@@ -1046,6 +1052,8 @@
             '*mode': 'MigMode',
             '*zero-page-detection': 'ZeroPageDetection',
             '*direct-io': 'bool',
+            '*x-rdma-chunk-size': { 'type': 'uint64',
+                                    'features': [ 'unstable' ] },
             '*cpr-exec-command': [ 'str' ]} }
 
 ##
-- 
2.43.7
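The chunk helpers the patch touches (ram_chunk_index/start/end) reduce to plain
integer arithmetic once the fixed shift becomes a configurable size.  A
user-space sketch using byte offsets instead of host pointers (a
simplification for illustration, not the QEMU code):

```c
#include <stdint.h>

/* Which chunk a byte offset within a RAM block falls into. */
static uint64_t chunk_index(uint64_t offset, uint64_t chunk_size)
{
    return offset / chunk_size;
}

/* Byte offset where chunk i begins. */
static uint64_t chunk_start(uint64_t i, uint64_t chunk_size)
{
    return i * chunk_size;
}

/* End of chunk i, clamped to the block length for the last,
 * possibly partial, chunk (mirrors ram_chunk_end()). */
static uint64_t chunk_end(uint64_t i, uint64_t chunk_size, uint64_t block_len)
{
    uint64_t end = chunk_start(i, chunk_size) + chunk_size;
    return end > block_len ? block_len : end;
}
```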
Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Peter Xu 2 days, 23 hours ago
Hi, Samuel,

On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
> The default 1MB RDMA chunk size causes slow live migration because
> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
> 
> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
> faster migration.
> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
> 
> Performance with RDMA live migration of 8GB RAM VM:
> 
> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
> |-----------------------|----------|-------------------|
> | 1M (default)          | 37.915   |  1,007            |

This is the default.  It surprised me a bit that it can only reach 1GB/s
throughput with the current code base.  Do you know why?  I thought RDMA
throughput should be much faster than this with whatever hardware setup.

> | 32M                   | 17.880   |  2,260            |
> | 1024M                 |  4.368   | 17,529            |
> 
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>

One thing to mention is that RDMA migration is in odd-fixes stage; it doesn't
have a real maintainer, so it is kind of "orphaned".  Given that, I wouldn't
suggest adding any new knobs purely for performance reasons.

Do you have a strong reason for proposing this patch upstream?  Is it used in
production systems, and does it solve some real problems for you?

I also wonder what Zhijian would say on this.

Thanks,

-- 
Peter Xu
Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Markus Armbruster 2 days, 4 hours ago
Peter Xu <peterx@redhat.com> writes:

[...]

> One thing to mention is RDMA migration is in odd-fixes stage, actually it
> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
> I actually won't suggest we add any new knobs for performance reasons.

Good point.

> Do you have a strong reason to propose this patch to land upstream?  Is it
> used in production systems and it solves some real problems for you?

If you use it in production, finding an upstream maintainer is in your
best interest.  Give it a thought.

> I also wonder what Zhijian would say on this.
>
> Thanks,
Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Zhijian Li (Fujitsu) 2 days, 12 hours ago

On 31/03/2026 00:10, Peter Xu wrote:
> Hi, Samuel,
> 
> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>> The default 1MB RDMA chunk size causes slow live migration because
>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>
>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>> faster migration.
>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>
>> Performance with RDMA live migration of 8GB RAM VM:
>>
>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>> |-----------------------|----------|-------------------|
>> | 1M (default)          | 37.915   |  1,007            |
> 
> This is the default. It surprised me a bit knowing it can only reach 1GB/s
> throughput with the current code base.  Do you know why?  I thought RDMA
> should be much faster than this on throughput with whatever hardware setup.

  
Regarding the baseline performance, Samuel's numbers look reasonable. I checked
some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.

> 
>> | 32M                   | 17.880   |  2,260            |
>> | 1024M                 |  4.368   | 17,529            |

My guess for the dramatic performance improvement is that a larger chunk size
allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
more efficient RDMA send operation.

Are there any workloads running on the guest during the migration, or is it just an idle guest? @Samuel

Given the significant benefit and the fact that the patch itself is straightforward,
I think it's a worthwhile addition.
  
Acked-by: Li Zhijian <lizhijian@fujitsu.com>



>>
>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> 
> One thing to mention is RDMA migration is in odd-fixes stage, actually it
> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
> I actually won't suggest we add any new knobs for performance reasons.
> 
> Do you have a strong reason to propose this patch to land upstream?  Is it
> used in production systems and it solves some real problems for you?
> 
> I also wonder what Zhijian would say on this.
> 
> Thanks,
> 
Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Zhang, GuoQing (Sam) 2 days, 5 hours ago
On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
>
> On 31/03/2026 00:10, Peter Xu wrote:
>> Hi, Samuel,
>>
>> On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
>>> The default 1MB RDMA chunk size causes slow live migration because
>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>
>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>> faster migration.
>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>
>>> Performance with RDMA live migration of 8GB RAM VM:
>>>
>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>> |-----------------------|----------|-------------------|
>>> | 1M (default)          | 37.915   |  1,007            |
>> This is the default. It surprised me a bit knowing it can only reach 1GB/s
>> throughput with the current code base.  Do you know why?  I thought RDMA
>> should be much faster than this on throughput with whatever hardware setup.
>
> Regarding the baseline performance, Samuel's numbers look reasonable. I checked
> some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
> was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
>
>>> | 32M                   | 17.880   |  2,260            |
>>> | 1024M                 |  4.368   | 17,529            |
> My guess for the dramatic performance improvement is that a larger chunk size
> allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
> more efficient RDMA send operation.

The `throughput` data is collected from the `info migrate` QEMU monitor
command after live migration.

Yes, Zhijian is right. As each chunk triggers a write_flush and each 
flush involves posting an RDMA WRITE and WAITING for completion, there's 
software overhead here.

For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The 
software overhead adds up and prevents the RDMA hardware from sustaining 
high throughput.

When chunk size is 1GB, there are ~3700 flushes. Reduced flush count 
means reduced software overhead and improved overall throughput.
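
As a back-of-envelope check, one flush per chunk of RAM sent gives a lower
bound on the flush count; the measured counts above are higher, presumably
because discontiguous dirty ranges and control traffic also force flushes
(an assumption, not measured here):

```c
#include <stdint.h>

/* Minimum number of write_flush calls from chunk-boundary crossings
 * alone: one flush per chunk of RAM sent (rounded up). */
static uint64_t min_chunk_flushes(uint64_t ram_bytes, uint64_t chunk_size)
{
    return (ram_bytes + chunk_size - 1) / chunk_size;
}
```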



>
> Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel


The guest is idle when I test the migration and collect the data.


>
> Given the significant benefit and the fact that the patch itself is straightforward,
> I think it's a worthwhile addition.
>
> Acked-by: Li Zhijian <lizhijian@fujitsu.com>


Thank you for the ack, Zhijian!


>
>
>
>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>> I actually won't suggest we add any new knobs for performance reasons.
>>
>> Do you have a strong reason to propose this patch to land upstream?  Is it
>> used in production systems and it solves some real problems for you?


We have VMs with large RAM; TCP live migration is not fast enough, and we
expected RDMA migration to be faster.

But we found RDMA-mode migration is slower than TCP mode. See the
following data.


8GB RAM idle VM live-migration performance:
| transport mode       | time (s) | throughput (MB/s) |
|----------------------|----------|-------------------|
| TCP                  | 36.89    |  1,081            |
| RDMA, 1MB chunk size | 37.915   |  1,007            |
| RDMA, 1GB chunk size |  4.368   | 17,529            |

This patch allows us to use larger chunk size for faster RDMA migration.


Regards
Sam


>>
>> I also wonder what Zhijian would say on this.
>>
>> Thanks,
>>

Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Peter Xu 23 hours ago
On Tue, Mar 31, 2026 at 06:33:23PM +0800, Zhang, GuoQing (Sam) wrote:
> 
> On 2026/3/31 11:30, Zhijian Li (Fujitsu) wrote:
> > 
> > On 31/03/2026 00:10, Peter Xu wrote:
> > > Hi, Samuel,
> > > 
> > > On Fri, Mar 27, 2026 at 02:50:06PM +0800, Samuel Zhang wrote:
> > > > The default 1MB RDMA chunk size causes slow live migration because
> > > > each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> > > > 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
> > > > 
> > > > Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
> > > > faster migration.
> > > > Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
> > > > 
> > > > Performance with RDMA live migration of 8GB RAM VM:
> > > > 
> > > > | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
> > > > |-----------------------|----------|-------------------|
> > > > | 1M (default)          | 37.915   |  1,007            |
> > > This is the default. It surprised me a bit knowing it can only reach 1GB/s
> > > throughput with the current code base.  Do you know why?  I thought RDMA
> > > should be much faster than this on throughput with whatever hardware setup.
> > 
> > Regarding the baseline performance, Samuel's numbers look reasonable. I checked
> > some of my old test data on a ConnectX-4 Lx card years ago, and the throughput
> > was around 10 Gbps (~1.25 GB/s), which is consistent with the 1 GB/s he reported.
> > 
> > > > | 32M                   | 17.880   |  2,260            |
> > > > | 1024M                 |  4.368   | 17,529            |
> > My guess for the dramatic performance improvement is that a larger chunk size
> > allows qemu_rdma_write() to batch more *contiguous dirty pages* into a single,
> > more efficient RDMA send operation.
> 
> The `throughput` data is collected from `info migrate` qemu monitor command
> after live-migration.
> 
> Yes, Zhijian is right. As each chunk triggers a write_flush and each flush
> involves posting an RDMA WRITE and WAITING for completion, there's software
> overhead here.
> 
> For 8GB RAM VM migration, 1MB chunk size produces ~15000 flushes. The
> software overhead adds up and prevents the RDMA hardware from sustaining
> high throughput.
> 
> When chunk size is 1GB, there are ~3700 flushes. Reduced flush count means
> reduced software overhead and improved overall throughput.
> 

OK, thanks both.

> 
> > 
> > Is there any workloads running on the guest during the migration, or just an idle guest? @Samuel
> 
> 
> The guest is idle when I test the migration and collect the data.
> 
> 
> > 
> > Given the significant benefit and the fact that the patch itself is straightforward,
> > I think it's a worthwhile addition.
> > 
> > Acked-by: Li Zhijian <lizhijian@fujitsu.com>
> 
> 
> Thank you for the ack, Zhijian!
> 
> 
> > 
> > 
> > 
> > > > Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> > > One thing to mention is RDMA migration is in odd-fixes stage, actually it
> > > doesn't have a real maintainer so it is kind of "orphaned".  In this case,
> > > I actually won't suggest we add any new knobs for performance reasons.
> > > 
> > > Do you have a strong reason to propose this patch to land upstream?  Is it
> > > used in production systems and it solves some real problems for you?
> 
> 
> We have VMs with large RAM and find TCP live-migration is not fast enough
> and expect RDMA migration can be faster.
> 
> But we found the rdma mode migration speed is slower than tcp mode. See
> following data.
> 
> 
> 8GB RAM idle VM live-migration performance:
> | transport mode       | time (s) | throughput (MB/s) |
> |----------------------|----------|-------------------|
> | TCP                  | 36.89    |  1,081            |

What is the NIC setup?  Did you try to enable multifd to offload zero-page
detection?  Or is that not feasible for some reason?

> | RDMA, 1MB chunk size | 37.915   |  1,007            |
> | RDMA, 1GB chunk size |  4.368   | 17,529            |
> 
> This patch allows us to use larger chunk size for faster RDMA migration.

Sure, Zhijian's point is reasonable.  If he's fine, I'm OK.

Thanks,

> 
> 
> Regards
> Sam
> 
> 
> > > 
> > > I also wonder what Zhijian would say on this.
> > > 
> > > Thanks,
> > > 
> 

-- 
Peter Xu


Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Zhijian Li (Fujitsu) 2 days, 4 hours ago

On 31/03/2026 18:33, Zhang, GuoQing (Sam) wrote:
>>
>>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>>> One thing to mention is RDMA migration is in odd-fixes stage, actually it
>>> doesn't have a real maintainer so it is kind of "orphaned".  In this case,
>>> I actually won't suggest we add any new knobs for performance reasons.
>>>
>>> Do you have a strong reason to propose this patch to land upstream?  Is it
>>> used in production systems and it solves some real problems for you?
> 
> 
> We have VMs with large RAM and find TCP live-migration is not fast
> enough and expect RDMA migration can be faster.
> 
> But we found the rdma mode migration speed is slower than tcp mode. See
> following data.
> 
> 
> 8GB RAM idle VM live-migration performance:
> | transport mode       | time (s) | throughput (MB/s) |
> |----------------------|----------|-------------------|
> | TCP                  | 36.89    |  1,081            |
> | RDMA, 1MB chunk size | 37.915   |  1,007            |
> | RDMA, 1GB chunk size |  4.368   | 17,529            |
> 
> This patch allows us to use larger chunk size for faster RDMA migration.


Hi Samuel,

Thanks for sharing this comparison data.

From the fast completion time in your test (4.3s for 8GB), it looks like the VM was
mostly idle. This means after the first full memory pass, very few new dirty pages were
generated, allowing the migration to complete quickly. This scenario is perfect for highlighting
the benefit of large chunks when memory is dirtied in large, contiguous blocks (as it is
during the initial full scan).

To make the case for this new knob even stronger, it would be very helpful to also see data
from a more realistic workload scenario, especially one that generates randomly dirty pages.

In such a case, even with a large chunk size, `qemu_rdma_write()` would only be able to send
small, discontiguous blocks of dirty pages in each iteration.

I suspect the throughput gain will still be significant, but likely different from the idle case.
Having data from a workload that simulates random writes (e.g. `stress-ng` or `memhog`) would
provide a more complete picture and help us understand the benefits in a worst-case scenario.
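
The concern about random dirtying can be made concrete with a toy dirty-page
bitmap (the patterns below are hypothetical, not measured data): scattered
dirty pages yield short contiguous runs, so each merged RDMA write stays
small no matter how large the chunk size is.

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical dirty-page patterns: an idle guest dirties one large
 * contiguous region; a random-write workload scatters single pages. */
static const uint8_t idle_pattern[8]   = {1, 1, 1, 1, 1, 1, 0, 0};
static const uint8_t random_pattern[8] = {1, 0, 1, 1, 0, 0, 1, 0};

/* Average contiguous dirty-run length in a page bitmap.  Short runs
 * mean qemu_rdma_write() merges only small writes per flush. */
static double avg_dirty_run(const uint8_t *dirty, size_t pages)
{
    size_t runs = 0, total = 0;
    int in_run = 0;
    for (size_t i = 0; i < pages; i++) {
        if (dirty[i]) {
            total++;
            if (!in_run) {
                runs++;
            }
        }
        in_run = dirty[i];
    }
    return runs ? (double)total / runs : 0.0;
}
```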


Thanks,
Zhijian



Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Markus Armbruster 6 days, 5 hours ago
Samuel Zhang <guoqing.zhang@amd.com> writes:

> The default 1MB RDMA chunk size causes slow live migration because
> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>
> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
> faster migration.
> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>
> Performance with RDMA live migration of 8GB RAM VM:
>
> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
> |-----------------------|----------|-------------------|
> | 1M (default)          | 37.915   |  1,007            |
> | 32M                   | 17.880   |  2,260            |
> | 1024M                 |  4.368   | 17,529            |
>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>

[...]

> diff --git a/migration/options.c b/migration/options.c
> index f33b297929..bc61c8665d 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -13,6 +13,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
> +#include "qemu/units.h"
>  #include "exec/target_page.h"
>  #include "qapi/clone-visitor.h"
>  #include "qapi/error.h"
> @@ -90,6 +91,7 @@ const PropertyInfo qdev_prop_StrOrNull;
>  
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
>  #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
> +#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE           MiB     /* 1MB */

The comment is now superfluous.

>  
>  const Property migration_properties[] = {
>      DEFINE_PROP_BOOL("store-global-state", MigrationState,

[...]

> diff --git a/qapi/migration.json b/qapi/migration.json
> index 7134d4ce47..292d96c95a 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -806,7 +806,7 @@
>  #
>  # Features:
>  #
> -# @unstable: Members @x-checkpoint-delay and
> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>  #     @x-vcpu-dirty-limit-period are experimental.
>  #
>  # Since: 2.4
> @@ -831,6 +831,7 @@
>             'mode',
>             'zero-page-detection',
>             'direct-io',
> +           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
>             'cpr-exec-command'] }
>  
>  ##
> @@ -1007,9 +1008,14 @@
>  #     is @cpr-exec.  The first list element is the program's filename,
>  #     the remainder its arguments.  (Since 10.2)
>  #
> +# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
> +#     Default is 1MiB.  Must be a power of 2 in the range
> +#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.

I believe it applies to channels whose migration address type is is
"rdma".  In MigrationChannel syntax

    {"channel-type": ..., "addr": {"transport": "rdma", ...}}

Correct?

> +#     (Since 11.1)
> +#
>  # Features:
>  #
> -# @unstable: Members @x-checkpoint-delay and
> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>  #     @x-vcpu-dirty-limit-period are experimental.
>  #
>  # Since: 2.4
> @@ -1046,6 +1052,8 @@
>              '*mode': 'MigMode',
>              '*zero-page-detection': 'ZeroPageDetection',
>              '*direct-io': 'bool',
> +            '*x-rdma-chunk-size': { 'type': 'uint64',
> +                                    'features': [ 'unstable' ] },
>              '*cpr-exec-command': [ 'str' ]} }
>  
>  ##
Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Zhang, GuoQing (Sam) 6 days, 5 hours ago
On 2026/3/27 17:45, Markus Armbruster wrote:
> Samuel Zhang <guoqing.zhang@amd.com> writes:
>
>> The default 1MB RDMA chunk size causes slow live migration because
>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>
>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>> faster migration.
>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>
>> Performance with RDMA live migration of 8GB RAM VM:
>>
>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>> |-----------------------|----------|-------------------|
>> | 1M (default)          | 37.915   |  1,007            |
>> | 32M                   | 17.880   |  2,260            |
>> | 1024M                 |  4.368   | 17,529            |
>>
>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> [...]
>
>> diff --git a/migration/options.c b/migration/options.c
>> index f33b297929..bc61c8665d 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -13,6 +13,7 @@
>>   
>>   #include "qemu/osdep.h"
>>   #include "qemu/error-report.h"
>> +#include "qemu/units.h"
>>   #include "exec/target_page.h"
>>   #include "qapi/clone-visitor.h"
>>   #include "qapi/error.h"
>> @@ -90,6 +91,7 @@ const PropertyInfo qdev_prop_StrOrNull;
>>   
>>   #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD     1000    /* milliseconds */
>>   #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT            1       /* MB/s */
>> +#define DEFAULT_MIGRATE_X_RDMA_CHUNK_SIZE           MiB     /* 1MB */
> The comment is now superfluous.
>
>>   
>>   const Property migration_properties[] = {
>>       DEFINE_PROP_BOOL("store-global-state", MigrationState,
> [...]
>
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index 7134d4ce47..292d96c95a 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -806,7 +806,7 @@
>>   #
>>   # Features:
>>   #
>> -# @unstable: Members @x-checkpoint-delay and
>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>   #     @x-vcpu-dirty-limit-period are experimental.
>>   #
>>   # Since: 2.4
>> @@ -831,6 +831,7 @@
>>              'mode',
>>              'zero-page-detection',
>>              'direct-io',
>> +           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
>>              'cpr-exec-command'] }
>>   
>>   ##
>> @@ -1007,9 +1008,14 @@
>>   #     is @cpr-exec.  The first list element is the program's filename,
>>   #     the remainder its arguments.  (Since 10.2)
>>   #
>> +# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
>> +#     Default is 1MiB.  Must be a power of 2 in the range
>> +#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.
> I believe it applies to channels whose migration address type is
> "rdma".  In MigrationChannel syntax
>
>      {"channel-type": ..., "addr": {"transport": "rdma", ...}}
>
> Correct?


Correct! Is it OK to update the doc as follows? Thank you!

`Only takes effect when migration address transport is "rdma".`
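(For reference, a channel whose address uses the "rdma" transport would be
specified like this in the QMP migrate command's channel syntax — host and
port values are illustrative only:)

```json
{ "execute": "migrate",
  "arguments": {
    "channels": [
      { "channel-type": "main",
        "addr": { "transport": "rdma",
                  "host": "192.168.1.1",
                  "port": "4444" } } ] } }
```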


>
>> +#     (Since 11.1)
>> +#
>>   # Features:
>>   #
>> -# @unstable: Members @x-checkpoint-delay and
>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>   #     @x-vcpu-dirty-limit-period are experimental.
>>   #
>>   # Since: 2.4
>> @@ -1046,6 +1052,8 @@
>>               '*mode': 'MigMode',
>>               '*zero-page-detection': 'ZeroPageDetection',
>>               '*direct-io': 'bool',
>> +            '*x-rdma-chunk-size': { 'type': 'uint64',
>> +                                    'features': [ 'unstable' ] },
>>               '*cpr-exec-command': [ 'str' ]} }
>>   
>>   ##
Re: [PATCH v3] migration/rdma: add x-rdma-chunk-size parameter
Posted by Markus Armbruster 6 days, 4 hours ago
"Zhang, GuoQing (Sam)" <guoqzhan@amd.com> writes:

> On 2026/3/27 17:45, Markus Armbruster wrote:
>> Samuel Zhang <guoqing.zhang@amd.com> writes:
>>
>>> The default 1MB RDMA chunk size causes slow live migration because
>>> each chunk triggers a write_flush (ibv_post_send). For 8GB RAM,
>>> 1MB chunk size produces ~15000 flushes vs ~3700 with 1024MB chunk size.
>>>
>>> Add x-rdma-chunk-size parameter to configure the RDMA chunk size for
>>> faster migration.
>>> Usage: `migrate_set_parameter x-rdma-chunk-size 1024M`
>>>
>>> Performance with RDMA live migration of 8GB RAM VM:
>>>
>>> | x-rdma-chunk-size (B) | time (s) | throughput (MB/s) |
>>> |-----------------------|----------|-------------------|
>>> | 1M (default)          | 37.915   |  1,007            |
>>> | 32M                   | 17.880   |  2,260            |
>>> | 1024M                 |  4.368   | 17,529            |
>>>
>>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>

[...]

>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>> index 7134d4ce47..292d96c95a 100644
>>> --- a/qapi/migration.json
>>> +++ b/qapi/migration.json
>>> @@ -806,7 +806,7 @@
>>>  #
>>>  # Features:
>>>  #
>>> -# @unstable: Members @x-checkpoint-delay and
>>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>>  #     @x-vcpu-dirty-limit-period are experimental.
>>>  #
>>>  # Since: 2.4
>>> @@ -831,6 +831,7 @@
>>>             'mode',
>>>             'zero-page-detection',
>>>             'direct-io',
>>> +           { 'name': 'x-rdma-chunk-size', 'features': [ 'unstable' ] },
>>>             'cpr-exec-command'] }
>>>  
>>>  ##
>>> @@ -1007,9 +1008,14 @@
>>>  #     is @cpr-exec.  The first list element is the program's filename,
>>>  #     the remainder its arguments.  (Since 10.2)
>>>  #
>>> +# @x-rdma-chunk-size: RDMA memory registration chunk size in bytes.
>>> +#     Default is 1MiB.  Must be a power of 2 in the range
>>> +#     [1MiB, 1024MiB].  Only takes effect for RDMA migration.
>>
>> I believe it applies to channels whose migration address type is
>> "rdma".  In MigrationChannel syntax
>>
>>      {"channel-type": ..., "addr": {"transport": "rdma", ...}}
>>
>> Correct?
>
>
> Correct! Is it OK to update the doc to the following one? Thank you!
>
> `Only takes effect when migration address transport is "rdma".`

The phrasing in the patch feels okay as is.  Perhaps "Only applies when
migrating via RDMA" to more closely match the description of
MigrationAddressType @rdma: Migrate via RDMA.

Either way, QAPI schema
Acked-by: Markus Armbruster <armbru@redhat.com>

>>> +#     (Since 11.1)
>>> +#
>>>  # Features:
>>>  #
>>> -# @unstable: Members @x-checkpoint-delay and
>>> +# @unstable: Members @x-checkpoint-delay, @x-rdma-chunk-size, and
>>>  #     @x-vcpu-dirty-limit-period are experimental.
>>>  #
>>>  # Since: 2.4
>>> @@ -1046,6 +1052,8 @@
>>>              '*mode': 'MigMode',
>>>              '*zero-page-detection': 'ZeroPageDetection',
>>>              '*direct-io': 'bool',
>>> +            '*x-rdma-chunk-size': { 'type': 'uint64',
>>> +                                    'features': [ 'unstable' ] },
>>>              '*cpr-exec-command': [ 'str' ]} }
>>>
>>>  ##