Previously, for the fsdax mem-backend-file, it will register failed with
Operation not supported. In this case, we can try to register it with
On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
[1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
[2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
---
migration/rdma.c | 27 ++++++++++++++++++---------
migration/trace-events | 1 +
2 files changed, 19 insertions(+), 9 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..8784b5f22a6 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1123,15 +1123,21 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
RDMALocalBlocks *local = &rdma->local_ram_blocks;
for (i = 0; i < local->nb_blocks; i++) {
+ int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
+on_demand:
local->block[i].mr =
ibv_reg_mr(rdma->pd,
local->block[i].local_host_addr,
- local->block[i].length,
- IBV_ACCESS_LOCAL_WRITE |
- IBV_ACCESS_REMOTE_WRITE
+ local->block[i].length, access
);
if (!local->block[i].mr) {
- perror("Failed to register local dest ram block!");
+ if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
+ access |= IBV_ACCESS_ON_DEMAND;
+ trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+ goto on_demand;
+ }
+ perror("Failed to register local dest ram block!\n");
break;
}
rdma->total_registrations++;
@@ -1215,15 +1221,18 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
*/
if (!block->pmr[chunk]) {
uint64_t len = chunk_end - chunk_start;
+ int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE : 0;
trace_qemu_rdma_register_and_get_keys(len, chunk_start);
- block->pmr[chunk] = ibv_reg_mr(rdma->pd,
- chunk_start, len,
- (rkey ? (IBV_ACCESS_LOCAL_WRITE |
- IBV_ACCESS_REMOTE_WRITE) : 0));
-
+on_demand:
+ block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
if (!block->pmr[chunk]) {
+ if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
+ access |= IBV_ACCESS_ON_DEMAND;
+ trace_qemu_rdma_register_odp_mr(block->block_name);
+ goto on_demand;
+ }
perror("Failed to register chunk!");
fprintf(stderr, "Chunk details: block: %d chunk index %d"
" start %" PRIuPTR " end %" PRIuPTR
diff --git a/migration/trace-events b/migration/trace-events
index a1c0f034ab8..5f6aa580def 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
+qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
qemu_rdma_registration_handle_finished(void) ""
qemu_rdma_registration_handle_ram_blocks(void) ""
--
2.31.1
Hi
On Sat, Jul 31, 2021 at 5:00 PM Li Zhijian <lizhijian@cn.fujitsu.com> wrote:
>
> Previously, for the fsdax mem-backend-file, it will register failed with
> Operation not supported. In this case, we can try to register it with
> On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
>
> [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
> [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> ---
> migration/rdma.c | 27 ++++++++++++++++++---------
> migration/trace-events | 1 +
> 2 files changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 5c2d113aa94..8784b5f22a6 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -1123,15 +1123,21 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
> RDMALocalBlocks *local = &rdma->local_ram_blocks;
>
> for (i = 0; i < local->nb_blocks; i++) {
> + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
> +
> +on_demand:
> local->block[i].mr =
> ibv_reg_mr(rdma->pd,
> local->block[i].local_host_addr,
> - local->block[i].length,
> - IBV_ACCESS_LOCAL_WRITE |
> - IBV_ACCESS_REMOTE_WRITE
> + local->block[i].length, access
> );
> if (!local->block[i].mr) {
> - perror("Failed to register local dest ram block!");
> + if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
> + access |= IBV_ACCESS_ON_DEMAND;
> + trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
> + goto on_demand;
Wouldn't it be better to check first if the device supports ODP ?
Something like:
ret = ibv_exp_query_device(context, &dattr);
if (dattr.exp_device_cap_flags & IBV_EXP_DEVICE_ODP)...
Also, I am not (personally) too fond of the "on_demand" label usage here,
however I will let the maintainer/others decide.
Thanks,
Marcel
> + }
> + perror("Failed to register local dest ram block!\n");
> break;
> }
> rdma->total_registrations++;
> @@ -1215,15 +1221,18 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
> */
> if (!block->pmr[chunk]) {
> uint64_t len = chunk_end - chunk_start;
> + int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE : 0;
>
> trace_qemu_rdma_register_and_get_keys(len, chunk_start);
>
> - block->pmr[chunk] = ibv_reg_mr(rdma->pd,
> - chunk_start, len,
> - (rkey ? (IBV_ACCESS_LOCAL_WRITE |
> - IBV_ACCESS_REMOTE_WRITE) : 0));
> -
> +on_demand:
> + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> if (!block->pmr[chunk]) {
> + if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
> + access |= IBV_ACCESS_ON_DEMAND;
> + trace_qemu_rdma_register_odp_mr(block->block_name);
> + goto on_demand;
> + }
> perror("Failed to register chunk!");
> fprintf(stderr, "Chunk details: block: %d chunk index %d"
> " start %" PRIuPTR " end %" PRIuPTR
> diff --git a/migration/trace-events b/migration/trace-events
> index a1c0f034ab8..5f6aa580def 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
> qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
> qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
> qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
> +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
> qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
> qemu_rdma_registration_handle_finished(void) ""
> qemu_rdma_registration_handle_ram_blocks(void) ""
> --
> 2.31.1
>
>
>
>
On 22/08/2021 16:53, Marcel Apfelbaum wrote:
> Hi
>
> On Sat, Jul 31, 2021 at 5:00 PM Li Zhijian <lizhijian@cn.fujitsu.com> wrote:
>> Previously, for the fsdax mem-backend-file, it will register failed with
>> Operation not supported. In this case, we can try to register it with
>> On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
>>
>> [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
>> [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>> ---
>> migration/rdma.c | 27 ++++++++++++++++++---------
>> migration/trace-events | 1 +
>> 2 files changed, 19 insertions(+), 9 deletions(-)
>>
>> diff --git a/migration/rdma.c b/migration/rdma.c
>> index 5c2d113aa94..8784b5f22a6 100644
>> --- a/migration/rdma.c
>> +++ b/migration/rdma.c
>> @@ -1123,15 +1123,21 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
>> RDMALocalBlocks *local = &rdma->local_ram_blocks;
>>
>> for (i = 0; i < local->nb_blocks; i++) {
>> + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
>> +
>> +on_demand:
>> local->block[i].mr =
>> ibv_reg_mr(rdma->pd,
>> local->block[i].local_host_addr,
>> - local->block[i].length,
>> - IBV_ACCESS_LOCAL_WRITE |
>> - IBV_ACCESS_REMOTE_WRITE
>> + local->block[i].length, access
>> );
>> if (!local->block[i].mr) {
>> - perror("Failed to register local dest ram block!");
>> + if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
>> + access |= IBV_ACCESS_ON_DEMAND;
>> + trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
>> + goto on_demand;
> Wouldn't it be better to check first if the device supports ODP ?
> Something like:
> ret = ibv_exp_query_device(context, &dattr);
> if (dattr.exp_device_cap_flags & IBV_EXP_DEVICE_ODP)...
Good idea !
>
> Also, I am not (personally) too fond of the "on_demand" label usage here,
> however I will let the maintainer/others decide.
Indeed, how just repeating the ibv_reg_mr() instead of a 'go to'
Thanks
Zhijian
>
> Thanks,
> Marcel
>
>> + }
>> + perror("Failed to register local dest ram block!\n");
>> break;
>> }
>> rdma->total_registrations++;
>> @@ -1215,15 +1221,18 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
>> */
>> if (!block->pmr[chunk]) {
>> uint64_t len = chunk_end - chunk_start;
>> + int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE : 0;
>>
>> trace_qemu_rdma_register_and_get_keys(len, chunk_start);
>>
>> - block->pmr[chunk] = ibv_reg_mr(rdma->pd,
>> - chunk_start, len,
>> - (rkey ? (IBV_ACCESS_LOCAL_WRITE |
>> - IBV_ACCESS_REMOTE_WRITE) : 0));
>> -
>> +on_demand:
>> + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
>> if (!block->pmr[chunk]) {
>> + if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
>> + access |= IBV_ACCESS_ON_DEMAND;
>> + trace_qemu_rdma_register_odp_mr(block->block_name);
>> + goto on_demand;
>> + }
>> perror("Failed to register chunk!");
>> fprintf(stderr, "Chunk details: block: %d chunk index %d"
>> " start %" PRIuPTR " end %" PRIuPTR
>> diff --git a/migration/trace-events b/migration/trace-events
>> index a1c0f034ab8..5f6aa580def 100644
>> --- a/migration/trace-events
>> +++ b/migration/trace-events
>> @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
>> qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
>> qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
>> qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
>> +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
>> qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
>> qemu_rdma_registration_handle_finished(void) ""
>> qemu_rdma_registration_handle_ram_blocks(void) ""
>> --
>> 2.31.1
>>
>>
>>
>>
>
Hi Zhijian,
On Mon, Aug 23, 2021 at 4:41 AM lizhijian@fujitsu.com
<lizhijian@fujitsu.com> wrote:
>
>
>
> On 22/08/2021 16:53, Marcel Apfelbaum wrote:
> > Hi
> >
> > On Sat, Jul 31, 2021 at 5:00 PM Li Zhijian <lizhijian@cn.fujitsu.com> wrote:
> >> Previously, for the fsdax mem-backend-file, it will register failed with
> >> Operation not supported. In this case, we can try to register it with
> >> On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
> >>
> >> [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
> >> [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
> >> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> >> ---
> >> migration/rdma.c | 27 ++++++++++++++++++---------
> >> migration/trace-events | 1 +
> >> 2 files changed, 19 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/migration/rdma.c b/migration/rdma.c
> >> index 5c2d113aa94..8784b5f22a6 100644
> >> --- a/migration/rdma.c
> >> +++ b/migration/rdma.c
> >> @@ -1123,15 +1123,21 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
> >> RDMALocalBlocks *local = &rdma->local_ram_blocks;
> >>
> >> for (i = 0; i < local->nb_blocks; i++) {
> >> + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
> >> +
> >> +on_demand:
> >> local->block[i].mr =
> >> ibv_reg_mr(rdma->pd,
> >> local->block[i].local_host_addr,
> >> - local->block[i].length,
> >> - IBV_ACCESS_LOCAL_WRITE |
> >> - IBV_ACCESS_REMOTE_WRITE
> >> + local->block[i].length, access
> >> );
> >> if (!local->block[i].mr) {
> >> - perror("Failed to register local dest ram block!");
> >> + if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
> >> + access |= IBV_ACCESS_ON_DEMAND;
> >> + trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
> >> + goto on_demand;
> > Wouldn't it be better to check first if the device supports ODP ?
> > Something like:
> > ret = ibv_exp_query_device(context, &dattr);
> > if (dattr.exp_device_cap_flags & IBV_EXP_DEVICE_ODP)...
>
> Good idea !
>
>
>
> >
> > Also, I am not (personally) too fond of the "on_demand" label usage here,
> > however I will let the maintainer/others decide.
> Indeed, how just repeating the ibv_reg_mr() instead of a 'go to'
>
Any "boring"/standard approach would do.
Thanks,
Marcel
> Thanks
> Zhijian
>
>
>
> >
> > Thanks,
> > Marcel
> >
> >> + }
> >> + perror("Failed to register local dest ram block!\n");
> >> break;
> >> }
> >> rdma->total_registrations++;
> >> @@ -1215,15 +1221,18 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
> >> */
> >> if (!block->pmr[chunk]) {
> >> uint64_t len = chunk_end - chunk_start;
> >> + int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE : 0;
> >>
> >> trace_qemu_rdma_register_and_get_keys(len, chunk_start);
> >>
> >> - block->pmr[chunk] = ibv_reg_mr(rdma->pd,
> >> - chunk_start, len,
> >> - (rkey ? (IBV_ACCESS_LOCAL_WRITE |
> >> - IBV_ACCESS_REMOTE_WRITE) : 0));
> >> -
> >> +on_demand:
> >> + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> >> if (!block->pmr[chunk]) {
> >> + if (!(access & IBV_ACCESS_ON_DEMAND) && errno == ENOTSUP) {
> >> + access |= IBV_ACCESS_ON_DEMAND;
> >> + trace_qemu_rdma_register_odp_mr(block->block_name);
> >> + goto on_demand;
> >> + }
> >> perror("Failed to register chunk!");
> >> fprintf(stderr, "Chunk details: block: %d chunk index %d"
> >> " start %" PRIuPTR " end %" PRIuPTR
> >> diff --git a/migration/trace-events b/migration/trace-events
> >> index a1c0f034ab8..5f6aa580def 100644
> >> --- a/migration/trace-events
> >> +++ b/migration/trace-events
> >> @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
> >> qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
> >> qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
> >> qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
> >> +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
> >> qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
> >> qemu_rdma_registration_handle_finished(void) ""
> >> qemu_rdma_registration_handle_ram_blocks(void) ""
> >> --
> >> 2.31.1
> >>
> >>
> >>
> >>
> >
© 2016 - 2026 Red Hat, Inc.