Previously, registering an fsdax mem-backend-file would fail with
"Operation not supported". In this case, we can try to register it with
On-Demand Paging[1], as rpma_mr_reg() does in rpma[2].
[1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
[2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
---
V2: add ODP sanity check and remove goto
---
migration/rdma.c | 73 ++++++++++++++++++++++++++++++------------
migration/trace-events | 1 +
2 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..eb80431aae2 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
return 0;
}
+/* Check whether On-Demand Paging is supported by RDMA device */
+static bool rdma_support_odp(struct ibv_context *dev)
+{
+ struct ibv_device_attr_ex attr = {0};
+ int ret = ibv_query_device_ex(dev, NULL, &attr);
+ if (ret) {
+ return false;
+ }
+
+ if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
+ return true;
+ }
+
+ return false;
+}
+
static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
int i;
RDMALocalBlocks *local = &rdma->local_ram_blocks;
for (i = 0; i < local->nb_blocks; i++) {
+ int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
local->block[i].mr =
ibv_reg_mr(rdma->pd,
local->block[i].local_host_addr,
- local->block[i].length,
- IBV_ACCESS_LOCAL_WRITE |
- IBV_ACCESS_REMOTE_WRITE
+ local->block[i].length, access
);
+
+ if (!local->block[i].mr &&
+ errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+ access |= IBV_ACCESS_ON_DEMAND;
+ /* register ODP mr */
+ local->block[i].mr =
+ ibv_reg_mr(rdma->pd,
+ local->block[i].local_host_addr,
+ local->block[i].length, access);
+ trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+ }
+
if (!local->block[i].mr) {
perror("Failed to register local dest ram block!");
break;
@@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
*/
if (!block->pmr[chunk]) {
uint64_t len = chunk_end - chunk_start;
+ int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
+ 0;
trace_qemu_rdma_register_and_get_keys(len, chunk_start);
- block->pmr[chunk] = ibv_reg_mr(rdma->pd,
- chunk_start, len,
- (rkey ? (IBV_ACCESS_LOCAL_WRITE |
- IBV_ACCESS_REMOTE_WRITE) : 0));
-
- if (!block->pmr[chunk]) {
- perror("Failed to register chunk!");
- fprintf(stderr, "Chunk details: block: %d chunk index %d"
- " start %" PRIuPTR " end %" PRIuPTR
- " host %" PRIuPTR
- " local %" PRIuPTR " registrations: %d\n",
- block->index, chunk, (uintptr_t)chunk_start,
- (uintptr_t)chunk_end, host_addr,
- (uintptr_t)block->local_host_addr,
- rdma->total_registrations);
- return -1;
+ block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+ if (!block->pmr[chunk] &&
+ errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+ access |= IBV_ACCESS_ON_DEMAND;
+ /* register ODP mr */
+ block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+ trace_qemu_rdma_register_odp_mr(block->block_name);
}
- rdma->total_registrations++;
}
+ if (!block->pmr[chunk]) {
+ perror("Failed to register chunk!");
+ fprintf(stderr, "Chunk details: block: %d chunk index %d"
+ " start %" PRIuPTR " end %" PRIuPTR
+ " host %" PRIuPTR
+ " local %" PRIuPTR " registrations: %d\n",
+ block->index, chunk, (uintptr_t)chunk_start,
+ (uintptr_t)chunk_end, host_addr,
+ (uintptr_t)block->local_host_addr,
+ rdma->total_registrations);
+ return -1;
+ }
+ rdma->total_registrations++;
if (lkey) {
*lkey = block->pmr[chunk]->lkey;
diff --git a/migration/trace-events b/migration/trace-events
index a1c0f034ab8..5f6aa580def 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
+qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
qemu_rdma_registration_handle_finished(void) ""
qemu_rdma_registration_handle_ram_blocks(void) ""
--
2.31.1
CCing Marcel
On 23/08/2021 11:33, Li Zhijian wrote:
> Previously, for the fsdax mem-backend-file, it will register failed with
> Operation not supported. In this case, we can try to register it with
> On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
>
> [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
> [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
>
> CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>
> ---
> V2: add ODP sanity check and remove goto
> ---
> migration/rdma.c | 73 ++++++++++++++++++++++++++++++------------
> migration/trace-events | 1 +
> 2 files changed, 54 insertions(+), 20 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 5c2d113aa94..eb80431aae2 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
> return 0;
> }
>
> +/* Check whether On-Demand Paging is supported by RDAM device */
> +static bool rdma_support_odp(struct ibv_context *dev)
> +{
> + struct ibv_device_attr_ex attr = {0};
> + int ret = ibv_query_device_ex(dev, NULL, &attr);
> + if (ret) {
> + return false;
> + }
> +
> + if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
> + return true;
> + }
> +
> + return false;
> +}
> +
> static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
> {
> int i;
> RDMALocalBlocks *local = &rdma->local_ram_blocks;
>
> for (i = 0; i < local->nb_blocks; i++) {
> + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
> +
> local->block[i].mr =
> ibv_reg_mr(rdma->pd,
> local->block[i].local_host_addr,
> - local->block[i].length,
> - IBV_ACCESS_LOCAL_WRITE |
> - IBV_ACCESS_REMOTE_WRITE
> + local->block[i].length, access
> );
> +
> + if (!local->block[i].mr &&
> + errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> + access |= IBV_ACCESS_ON_DEMAND;
> + /* register ODP mr */
> + local->block[i].mr =
> + ibv_reg_mr(rdma->pd,
> + local->block[i].local_host_addr,
> + local->block[i].length, access);
> + trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
> + }
> +
> if (!local->block[i].mr) {
> perror("Failed to register local dest ram block!");
> break;
> @@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
> */
> if (!block->pmr[chunk]) {
> uint64_t len = chunk_end - chunk_start;
> + int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
> + 0;
>
> trace_qemu_rdma_register_and_get_keys(len, chunk_start);
>
> - block->pmr[chunk] = ibv_reg_mr(rdma->pd,
> - chunk_start, len,
> - (rkey ? (IBV_ACCESS_LOCAL_WRITE |
> - IBV_ACCESS_REMOTE_WRITE) : 0));
> -
> - if (!block->pmr[chunk]) {
> - perror("Failed to register chunk!");
> - fprintf(stderr, "Chunk details: block: %d chunk index %d"
> - " start %" PRIuPTR " end %" PRIuPTR
> - " host %" PRIuPTR
> - " local %" PRIuPTR " registrations: %d\n",
> - block->index, chunk, (uintptr_t)chunk_start,
> - (uintptr_t)chunk_end, host_addr,
> - (uintptr_t)block->local_host_addr,
> - rdma->total_registrations);
> - return -1;
> + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> + if (!block->pmr[chunk] &&
> + errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> + access |= IBV_ACCESS_ON_DEMAND;
> + /* register ODP mr */
> + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> + trace_qemu_rdma_register_odp_mr(block->block_name);
> }
> - rdma->total_registrations++;
> }
> + if (!block->pmr[chunk]) {
> + perror("Failed to register chunk!");
> + fprintf(stderr, "Chunk details: block: %d chunk index %d"
> + " start %" PRIuPTR " end %" PRIuPTR
> + " host %" PRIuPTR
> + " local %" PRIuPTR " registrations: %d\n",
> + block->index, chunk, (uintptr_t)chunk_start,
> + (uintptr_t)chunk_end, host_addr,
> + (uintptr_t)block->local_host_addr,
> + rdma->total_registrations);
> + return -1;
> + }
> + rdma->total_registrations++;
>
> if (lkey) {
> *lkey = block->pmr[chunk]->lkey;
> diff --git a/migration/trace-events b/migration/trace-events
> index a1c0f034ab8..5f6aa580def 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
> qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
> qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
> qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
> +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
> qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
> qemu_rdma_registration_handle_finished(void) ""
> qemu_rdma_registration_handle_ram_blocks(void) ""
Hi Zhijian,
On Mon, Aug 23, 2021 at 11:42 AM lizhijian@fujitsu.com
<lizhijian@fujitsu.com> wrote:
>
> CCing Marcel
>
>
> On 23/08/2021 11:33, Li Zhijian wrote:
> > Previously, for the fsdax mem-backend-file, it will register failed with
> > Operation not supported. In this case, we can try to register it with
> > On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
> >
> > [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
> > [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
> >
> > CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> >
> > ---
> > V2: add ODP sanity check and remove goto
> > ---
> > migration/rdma.c | 73 ++++++++++++++++++++++++++++++------------
> > migration/trace-events | 1 +
> > 2 files changed, 54 insertions(+), 20 deletions(-)
> >
> > diff --git a/migration/rdma.c b/migration/rdma.c
> > index 5c2d113aa94..eb80431aae2 100644
> > --- a/migration/rdma.c
> > +++ b/migration/rdma.c
> > @@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
> > return 0;
> > }
> >
> > +/* Check whether On-Demand Paging is supported by RDAM device */
> > +static bool rdma_support_odp(struct ibv_context *dev)
> > +{
> > + struct ibv_device_attr_ex attr = {0};
> > + int ret = ibv_query_device_ex(dev, NULL, &attr);
> > + if (ret) {
> > + return false;
> > + }
> > +
> > + if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> > static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
> > {
> > int i;
> > RDMALocalBlocks *local = &rdma->local_ram_blocks;
> >
> > for (i = 0; i < local->nb_blocks; i++) {
> > + int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
> > +
> > local->block[i].mr =
> > ibv_reg_mr(rdma->pd,
> > local->block[i].local_host_addr,
> > - local->block[i].length,
> > - IBV_ACCESS_LOCAL_WRITE |
> > - IBV_ACCESS_REMOTE_WRITE
> > + local->block[i].length, access
> > );
> > +
> > + if (!local->block[i].mr &&
> > + errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> > + access |= IBV_ACCESS_ON_DEMAND;
> > + /* register ODP mr */
> > + local->block[i].mr =
> > + ibv_reg_mr(rdma->pd,
> > + local->block[i].local_host_addr,
> > + local->block[i].length, access);
> > + trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
> > + }
> > +
> > if (!local->block[i].mr) {
> > perror("Failed to register local dest ram block!");
> > break;
> > @@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
> > */
> > if (!block->pmr[chunk]) {
> > uint64_t len = chunk_end - chunk_start;
> > + int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
> > + 0;
> >
> > trace_qemu_rdma_register_and_get_keys(len, chunk_start);
> >
> > - block->pmr[chunk] = ibv_reg_mr(rdma->pd,
> > - chunk_start, len,
> > - (rkey ? (IBV_ACCESS_LOCAL_WRITE |
> > - IBV_ACCESS_REMOTE_WRITE) : 0));
> > -
> > - if (!block->pmr[chunk]) {
> > - perror("Failed to register chunk!");
> > - fprintf(stderr, "Chunk details: block: %d chunk index %d"
> > - " start %" PRIuPTR " end %" PRIuPTR
> > - " host %" PRIuPTR
> > - " local %" PRIuPTR " registrations: %d\n",
> > - block->index, chunk, (uintptr_t)chunk_start,
> > - (uintptr_t)chunk_end, host_addr,
> > - (uintptr_t)block->local_host_addr,
> > - rdma->total_registrations);
> > - return -1;
> > + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> > + if (!block->pmr[chunk] &&
> > + errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> > + access |= IBV_ACCESS_ON_DEMAND;
> > + /* register ODP mr */
> > + block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> > + trace_qemu_rdma_register_odp_mr(block->block_name);
> > }
> > - rdma->total_registrations++;
> > }
> > + if (!block->pmr[chunk]) {
> > + perror("Failed to register chunk!");
> > + fprintf(stderr, "Chunk details: block: %d chunk index %d"
> > + " start %" PRIuPTR " end %" PRIuPTR
> > + " host %" PRIuPTR
> > + " local %" PRIuPTR " registrations: %d\n",
> > + block->index, chunk, (uintptr_t)chunk_start,
> > + (uintptr_t)chunk_end, host_addr,
> > + (uintptr_t)block->local_host_addr,
> > + rdma->total_registrations);
> > + return -1;
> > + }
> > + rdma->total_registrations++;
> >
> > if (lkey) {
> > *lkey = block->pmr[chunk]->lkey;
> > diff --git a/migration/trace-events b/migration/trace-events
> > index a1c0f034ab8..5f6aa580def 100644
> > --- a/migration/trace-events
> > +++ b/migration/trace-events
> > @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
> > qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
> > qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
> > qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
> > +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
> > qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
> > qemu_rdma_registration_handle_finished(void) ""
> > qemu_rdma_registration_handle_ram_blocks(void) ""
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Thanks,
Marcel
© 2016 - 2026 Red Hat, Inc.