[PATCH RFC 9/9] migration/rdma: Remove rdma_cm_poll_handler

Peter Xu posted 9 patches 1 month ago
Maintainers: Alex Williamson <alex.williamson@redhat.com>, "Cédric Le Goater" <clg@redhat.com>, Hailiang Zhang <zhanghailiang@xfusion.com>, Peter Xu <peterx@redhat.com>, Fabiano Rosas <farosas@suse.de>, Li Zhijian <lizhijian@fujitsu.com>
[PATCH RFC 9/9] migration/rdma: Remove rdma_cm_poll_handler
Posted by Peter Xu 1 month ago
This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.

It was needed because the RDMA iochannel on dest QEMU used to only yield
without monitoring the fd.  Now it should be monitored by the same poll()
similarly on the src QEMU in qemu_rdma_wait_comp_channel().  So even
without the fd handler, dest QEMU should be able to receive the events.

I tested this by initiating an RDMA migration, then do two things:

  - Either does migrate_cancel on src, or,
  - Directly kill destination QEMU

In both cases, the other side of QEMU will be able to receive the
disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
fail the migration.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/rdma.c | 29 +----------------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 7751262460..da7fd48bf3 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
 
 static void rdma_accept_incoming_migration(void *opaque);
 
-static void rdma_cm_poll_handler(void *opaque)
-{
-    RDMAContext *rdma = opaque;
-    struct rdma_cm_event *cm_event;
-
-    if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
-        error_report("get_cm_event failed %d", errno);
-        return;
-    }
-
-    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
-        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
-        if (!rdma->errored &&
-            migration_incoming_get_current()->state !=
-              MIGRATION_STATUS_COMPLETED) {
-            error_report("receive cm event, cm event is %d", cm_event->event);
-            rdma->errored = true;
-            if (rdma->return_path) {
-                rdma->return_path->errored = true;
-            }
-        }
-        rdma_ack_cm_event(cm_event);
-    }
-    rdma_ack_cm_event(cm_event);
-}
-
 static int qemu_rdma_accept(RDMAContext *rdma)
 {
     Error *err = NULL;
@@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
                             NULL,
                             (void *)(intptr_t)rdma->return_path);
     } else {
-        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
-                            NULL, rdma);
+        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
     }
 
     ret = rdma_accept(rdma->cm_id, &conn_param);
-- 
2.50.1
Re: [PATCH RFC 9/9] migration/rdma: Remove rdma_cm_poll_handler
Posted by Zhijian Li (Fujitsu) 2 days, 11 hours ago

On 28/08/2025 04:59, Peter Xu wrote:
> This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.
> 
> It was needed because the RDMA iochannel on dest QEMU used to only yield
> without monitoring the fd.  Now it should be monitored by the same poll()
> similarly on the src QEMU in qemu_rdma_wait_comp_channel().  So even
> without the fd handler, dest QEMU should be able to receive the events.

Agree

Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>


> 
> I tested this by initiating an RDMA migration, then do two things:
> 
>    - Either does migrate_cancel on src, or,
>    - Directly kill destination QEMU
> 
> In both cases, the other side of QEMU will be able to receive the
> disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
> fail the migration.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   migration/rdma.c | 29 +----------------------------
>   1 file changed, 1 insertion(+), 28 deletions(-)
> 
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 7751262460..da7fd48bf3 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
>   
>   static void rdma_accept_incoming_migration(void *opaque);
>   
> -static void rdma_cm_poll_handler(void *opaque)
> -{
> -    RDMAContext *rdma = opaque;
> -    struct rdma_cm_event *cm_event;
> -
> -    if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
> -        error_report("get_cm_event failed %d", errno);
> -        return;
> -    }
> -
> -    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
> -        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
> -        if (!rdma->errored &&
> -            migration_incoming_get_current()->state !=
> -              MIGRATION_STATUS_COMPLETED) {
> -            error_report("receive cm event, cm event is %d", cm_event->event);
> -            rdma->errored = true;
> -            if (rdma->return_path) {
> -                rdma->return_path->errored = true;
> -            }
> -        }
> -        rdma_ack_cm_event(cm_event);
> -    }
> -    rdma_ack_cm_event(cm_event);
> -}
> -
>   static int qemu_rdma_accept(RDMAContext *rdma)
>   {
>       Error *err = NULL;
> @@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
>                               NULL,
>                               (void *)(intptr_t)rdma->return_path);
>       } else {
> -        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
> -                            NULL, rdma);
> +        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
>       }
>   
>       ret = rdma_accept(rdma->cm_id, &conn_param);
Re: [PATCH RFC 9/9] migration/rdma: Remove rdma_cm_poll_handler
Posted by Fabiano Rosas 1 week, 3 days ago
Peter Xu <peterx@redhat.com> writes:

> This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.
>
> It was needed because the RDMA iochannel on dest QEMU used to only yield
> without monitoring the fd.  Now it should be monitored by the same poll()
> similarly on the src QEMU in qemu_rdma_wait_comp_channel().  So even
> without the fd handler, dest QEMU should be able to receive the events.
>
> I tested this by initiating an RDMA migration, then do two things:
>
>   - Either does migrate_cancel on src, or,
>   - Directly kill destination QEMU
>
> In both cases, the other side of QEMU will be able to receive the
> disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
> fail the migration.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/rdma.c | 29 +----------------------------
>  1 file changed, 1 insertion(+), 28 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 7751262460..da7fd48bf3 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
>  
>  static void rdma_accept_incoming_migration(void *opaque);
>  
> -static void rdma_cm_poll_handler(void *opaque)
> -{
> -    RDMAContext *rdma = opaque;
> -    struct rdma_cm_event *cm_event;
> -
> -    if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
> -        error_report("get_cm_event failed %d", errno);
> -        return;
> -    }
> -
> -    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
> -        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
> -        if (!rdma->errored &&
> -            migration_incoming_get_current()->state !=
> -              MIGRATION_STATUS_COMPLETED) {
> -            error_report("receive cm event, cm event is %d", cm_event->event);
> -            rdma->errored = true;
> -            if (rdma->return_path) {
> -                rdma->return_path->errored = true;
> -            }
> -        }
> -        rdma_ack_cm_event(cm_event);
> -    }
> -    rdma_ack_cm_event(cm_event);
> -}
> -
>  static int qemu_rdma_accept(RDMAContext *rdma)
>  {
>      Error *err = NULL;
> @@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
>                              NULL,
>                              (void *)(intptr_t)rdma->return_path);
>      } else {
> -        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
> -                            NULL, rdma);
> +        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);

I'm not familiar with this code, but is this left here to remove the
handler? Can't we remove this line altogether?

>      }
>  
>      ret = rdma_accept(rdma->cm_id, &conn_param);