This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.
It was needed because the RDMA iochannel on dest QEMU used to only yield
without monitoring the fd. Now it should be monitored by the same poll()
similarly on the src QEMU in qemu_rdma_wait_comp_channel(). So even
without the fd handler, dest QEMU should be able to receive the events.
I tested this by initiating an RDMA migration, then do two things:
- Either does migrate_cancel on src, or,
- Directly kill destination QEMU
In both cases, the other side of QEMU will be able to receive the
disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
fail the migration.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
migration/rdma.c | 29 +----------------------------
1 file changed, 1 insertion(+), 28 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index 7751262460..da7fd48bf3 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
static void rdma_accept_incoming_migration(void *opaque);
-static void rdma_cm_poll_handler(void *opaque)
-{
- RDMAContext *rdma = opaque;
- struct rdma_cm_event *cm_event;
-
- if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
- error_report("get_cm_event failed %d", errno);
- return;
- }
-
- if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
- cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
- if (!rdma->errored &&
- migration_incoming_get_current()->state !=
- MIGRATION_STATUS_COMPLETED) {
- error_report("receive cm event, cm event is %d", cm_event->event);
- rdma->errored = true;
- if (rdma->return_path) {
- rdma->return_path->errored = true;
- }
- }
- rdma_ack_cm_event(cm_event);
- }
- rdma_ack_cm_event(cm_event);
-}
-
static int qemu_rdma_accept(RDMAContext *rdma)
{
Error *err = NULL;
@@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
NULL,
(void *)(intptr_t)rdma->return_path);
} else {
- qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
- NULL, rdma);
+ qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
}
ret = rdma_accept(rdma->cm_id, &conn_param);
--
2.50.1
On 28/08/2025 04:59, Peter Xu wrote:
> This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.
>
> It was needed because the RDMA iochannel on dest QEMU used to only yield
> without monitoring the fd. Now it should be monitored by the same poll()
> similarly on the src QEMU in qemu_rdma_wait_comp_channel(). So even
> without the fd handler, dest QEMU should be able to receive the events.
Agree
Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>
>
> I tested this by initiating an RDMA migration, then do two things:
>
> - Either does migrate_cancel on src, or,
> - Directly kill destination QEMU
>
> In both cases, the other side of QEMU will be able to receive the
> disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
> fail the migration.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> migration/rdma.c | 29 +----------------------------
> 1 file changed, 1 insertion(+), 28 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 7751262460..da7fd48bf3 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
>
> static void rdma_accept_incoming_migration(void *opaque);
>
> -static void rdma_cm_poll_handler(void *opaque)
> -{
> - RDMAContext *rdma = opaque;
> - struct rdma_cm_event *cm_event;
> -
> - if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
> - error_report("get_cm_event failed %d", errno);
> - return;
> - }
> -
> - if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
> - cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
> - if (!rdma->errored &&
> - migration_incoming_get_current()->state !=
> - MIGRATION_STATUS_COMPLETED) {
> - error_report("receive cm event, cm event is %d", cm_event->event);
> - rdma->errored = true;
> - if (rdma->return_path) {
> - rdma->return_path->errored = true;
> - }
> - }
> - rdma_ack_cm_event(cm_event);
> - }
> - rdma_ack_cm_event(cm_event);
> -}
> -
> static int qemu_rdma_accept(RDMAContext *rdma)
> {
> Error *err = NULL;
> @@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
> NULL,
> (void *)(intptr_t)rdma->return_path);
> } else {
> - qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
> - NULL, rdma);
> + qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
> }
>
> ret = rdma_accept(rdma->cm_id, &conn_param);
Peter Xu <peterx@redhat.com> writes:
> This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.
>
> It was needed because the RDMA iochannel on dest QEMU used to only yield
> without monitoring the fd. Now it should be monitored by the same poll()
> similarly on the src QEMU in qemu_rdma_wait_comp_channel(). So even
> without the fd handler, dest QEMU should be able to receive the events.
>
> I tested this by initiating an RDMA migration, then do two things:
>
> - Either does migrate_cancel on src, or,
> - Directly kill destination QEMU
>
> In both cases, the other side of QEMU will be able to receive the
> disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
> fail the migration.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> migration/rdma.c | 29 +----------------------------
> 1 file changed, 1 insertion(+), 28 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 7751262460..da7fd48bf3 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
>
> static void rdma_accept_incoming_migration(void *opaque);
>
> -static void rdma_cm_poll_handler(void *opaque)
> -{
> - RDMAContext *rdma = opaque;
> - struct rdma_cm_event *cm_event;
> -
> - if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
> - error_report("get_cm_event failed %d", errno);
> - return;
> - }
> -
> - if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
> - cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
> - if (!rdma->errored &&
> - migration_incoming_get_current()->state !=
> - MIGRATION_STATUS_COMPLETED) {
> - error_report("receive cm event, cm event is %d", cm_event->event);
> - rdma->errored = true;
> - if (rdma->return_path) {
> - rdma->return_path->errored = true;
> - }
> - }
> - rdma_ack_cm_event(cm_event);
> - }
> - rdma_ack_cm_event(cm_event);
> -}
> -
> static int qemu_rdma_accept(RDMAContext *rdma)
> {
> Error *err = NULL;
> @@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
> NULL,
> (void *)(intptr_t)rdma->return_path);
> } else {
> - qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
> - NULL, rdma);
> + qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
I'm not familiar with this code, but is this left here to remove the
handler? Can't we remove this line altogether?
> }
>
> ret = rdma_accept(rdma->cm_id, &conn_param);
On Wed, Sep 17, 2025 at 03:38:35PM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
>
> > This almost reverts commit 923709896b1b01fb982c93492ad01b233e6b6023.
> >
> > It was needed because the RDMA iochannel on dest QEMU used to only yield
> > without monitoring the fd. Now it should be monitored by the same poll()
> > similarly on the src QEMU in qemu_rdma_wait_comp_channel(). So even
> > without the fd handler, dest QEMU should be able to receive the events.
> >
> > I tested this by initiating an RDMA migration, then do two things:
> >
> > - Either does migrate_cancel on src, or,
> > - Directly kill destination QEMU
> >
> > In both cases, the other side of QEMU will be able to receive the
> > disconnect event in qemu_rdma_wait_comp_channel() and properly cancel or
> > fail the migration.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> > migration/rdma.c | 29 +----------------------------
> > 1 file changed, 1 insertion(+), 28 deletions(-)
> >
> > diff --git a/migration/rdma.c b/migration/rdma.c
> > index 7751262460..da7fd48bf3 100644
> > --- a/migration/rdma.c
> > +++ b/migration/rdma.c
> > @@ -3045,32 +3045,6 @@ int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
> >
> > static void rdma_accept_incoming_migration(void *opaque);
> >
> > -static void rdma_cm_poll_handler(void *opaque)
> > -{
> > - RDMAContext *rdma = opaque;
> > - struct rdma_cm_event *cm_event;
> > -
> > - if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
> > - error_report("get_cm_event failed %d", errno);
> > - return;
> > - }
> > -
> > - if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
> > - cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
> > - if (!rdma->errored &&
> > - migration_incoming_get_current()->state !=
> > - MIGRATION_STATUS_COMPLETED) {
> > - error_report("receive cm event, cm event is %d", cm_event->event);
> > - rdma->errored = true;
> > - if (rdma->return_path) {
> > - rdma->return_path->errored = true;
> > - }
> > - }
> > - rdma_ack_cm_event(cm_event);
> > - }
> > - rdma_ack_cm_event(cm_event);
> > -}
> > -
> > static int qemu_rdma_accept(RDMAContext *rdma)
> > {
> > Error *err = NULL;
> > @@ -3188,8 +3162,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
> > NULL,
> > (void *)(intptr_t)rdma->return_path);
> > } else {
> > - qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
> > - NULL, rdma);
> > + qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
>
> I'm not familiar with this code, but is this left here to remove the
> handler? Can't we remove this line altogether?
Fair question. I was just lazy because I know it's safe to call it like
that no matter what, unregistering anything if we registered some,
otherwise this qemu_set_fd_handler() should be a no-op.
I am just not confident on RDMA code that we can remove it. IOW, before
923709896b1 we did that, so I kept it as-is.
--
Peter Xu
© 2016 - 2026 Red Hat, Inc.