migration/rdma.c | 157 ----------------------------------------------- 1 file changed, 157 deletions(-)
I hit the following error while testing migration in a pure RoCE environment:
"-incoming rdma:[::]:8089: RDMA ERROR: You only have RoCE / iWARP devices in your
systems and your management software has specified '[::]', but IPv6 over RoCE /
iWARP is not supported in Linux.#012'."
In our setup, the target host binds RDMA on IPv6 while the source connects
with IPv4. After removing qemu_rdma_broken_ipv6_kernel(), migration just works
fine.
Checking the git history, the function was added when RDMA migration was
first introduced, more than 10 years ago. linux-rdma has improved its
support for IPv6 over RoCE/iWARP over the past years. A few fixes
from 2016 seem related to the issue, e.g.:
aeb76df46d11 ("IB/core: Set routable RoCE gid type for ipv4/ipv6 networks")
Other related fixes from 2018, e.g.:
052eac6eeb56 RDMA/cma: Update RoCE multicast routines to use net namespace
8d20a1f0ecd5 RDMA/cma: Fix rdma_cm raw IB path setting for RoCE
9327c7afdce3 RDMA/cma: Provide a function to set RoCE path record L2 parameters
5c181bda77f4 RDMA/cma: Set default GID type as RoCE when resolving RoCE route
3c7f67d1880d IB/cma: Fix default RoCE type setting
be1d325a3358 IB/core: Set RoCEv2 MGID according to spec
63a5f483af0e IB/cma: Set default gid type to RoCEv2
So remove the outdated function and its usage.
Cc: Peter Xu <peterx@redhat.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Yu Zhang <yu.zhang@ionos.com>
Cc: qemu-devel@nongnu.org
Cc: linux-rdma@vger.kernel.org
Cc: michael@flatgalaxy.com
Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
---
migration/rdma.c | 157 -----------------------------------------------
1 file changed, 157 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index 76fb0349238a..5ce628ddeef0 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -767,149 +767,6 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
trace_qemu_rdma_dump_gid(who, sgid, dgid);
}
-/*
- * As of now, IPv6 over RoCE / iWARP is not supported by linux.
- * We will try the next addrinfo struct, and fail if there are
- * no other valid addresses to bind against.
- *
- * If user is listening on '[::]', then we will not have a opened a device
- * yet and have no way of verifying if the device is RoCE or not.
- *
- * In this case, the source VM will throw an error for ALL types of
- * connections (both IPv4 and IPv6) if the destination machine does not have
- * a regular infiniband network available for use.
- *
- * The only way to guarantee that an error is thrown for broken kernels is
- * for the management software to choose a *specific* interface at bind time
- * and validate what time of hardware it is.
- *
- * Unfortunately, this puts the user in a fix:
- *
- * If the source VM connects with an IPv4 address without knowing that the
- * destination has bound to '[::]' the migration will unconditionally fail
- * unless the management software is explicitly listening on the IPv4
- * address while using a RoCE-based device.
- *
- * If the source VM connects with an IPv6 address, then we're OK because we can
- * throw an error on the source (and similarly on the destination).
- *
- * But in mixed environments, this will be broken for a while until it is fixed
- * inside linux.
- *
- * We do provide a *tiny* bit of help in this function: We can list all of the
- * devices in the system and check to see if all the devices are RoCE or
- * Infiniband.
- *
- * If we detect that we have a *pure* RoCE environment, then we can safely
- * thrown an error even if the management software has specified '[::]' as the
- * bind address.
- *
- * However, if there is are multiple hetergeneous devices, then we cannot make
- * this assumption and the user just has to be sure they know what they are
- * doing.
- *
- * Patches are being reviewed on linux-rdma.
- */
-static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
-{
- /* This bug only exists in linux, to our knowledge. */
-#ifdef CONFIG_LINUX
- struct ibv_port_attr port_attr;
-
- /*
- * Verbs are only NULL if management has bound to '[::]'.
- *
- * Let's iterate through all the devices and see if there any pure IB
- * devices (non-ethernet).
- *
- * If not, then we can safely proceed with the migration.
- * Otherwise, there are no guarantees until the bug is fixed in linux.
- */
- if (!verbs) {
- int num_devices;
- struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
- bool roce_found = false;
- bool ib_found = false;
-
- for (int x = 0; x < num_devices; x++) {
- verbs = ibv_open_device(dev_list[x]);
- /*
- * ibv_open_device() is not documented to set errno. If
- * it does, it's somebody else's doc bug. If it doesn't,
- * the use of errno below is wrong.
- * TODO Find out whether ibv_open_device() sets errno.
- */
- if (!verbs) {
- if (errno == EPERM) {
- continue;
- } else {
- error_setg_errno(errp, errno,
- "could not open RDMA device context");
- return -1;
- }
- }
-
- if (ibv_query_port(verbs, 1, &port_attr)) {
- ibv_close_device(verbs);
- error_setg(errp,
- "RDMA ERROR: Could not query initial IB port");
- return -1;
- }
-
- if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
- ib_found = true;
- } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- roce_found = true;
- }
-
- ibv_close_device(verbs);
-
- }
-
- if (roce_found) {
- if (ib_found) {
- warn_report("migrations may fail:"
- " IPv6 over RoCE / iWARP in linux"
- " is broken. But since you appear to have a"
- " mixed RoCE / IB environment, be sure to only"
- " migrate over the IB fabric until the kernel "
- " fixes the bug.");
- } else {
- error_setg(errp, "RDMA ERROR: "
- "You only have RoCE / iWARP devices in your systems"
- " and your management software has specified '[::]'"
- ", but IPv6 over RoCE / iWARP is not supported in Linux.");
- return -1;
- }
- }
-
- return 0;
- }
-
- /*
- * If we have a verbs context, that means that some other than '[::]' was
- * used by the management software for binding. In which case we can
- * actually warn the user about a potentially broken kernel.
- */
-
- /* IB ports start with 1, not 0 */
- if (ibv_query_port(verbs, 1, &port_attr)) {
- error_setg(errp, "RDMA ERROR: Could not query initial IB port");
- return -1;
- }
-
- if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- error_setg(errp, "RDMA ERROR: "
- "Linux kernel's RoCE / iWARP does not support IPv6 "
- "(but patches on linux-rdma in progress)");
- return -1;
- }
-
-#endif
-
- return 0;
-}
-
/*
* Figure out which RDMA device corresponds to the requested IP hostname
* Also create the initial connection manager identifiers for opening
@@ -964,13 +821,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
RDMA_RESOLVE_TIMEOUT_MS);
if (ret >= 0) {
- if (e->ai_family == AF_INET6) {
- ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
- local_errp);
- if (ret < 0) {
- continue;
- }
- }
error_free(err);
goto route;
}
@@ -2672,13 +2522,6 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
if (ret < 0) {
continue;
}
- if (e->ai_family == AF_INET6) {
- ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
- local_errp);
- if (ret < 0) {
- continue;
- }
- }
error_free(err);
break;
}
--
2.43.0
Please fix this compiling error. cc -m64 -mcx16 -Ilibcommon.a.p -Isubprojects/libvduse -I../subprojects/libvduse -I/usr/include/p11-kit-1 -I/usr/include/pixman-1 -I/usr/include/libpng16 -I/usr/include/spice-server -I/usr/include/spice-1 -I/usr/include/libusb-1.0 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include -I/usr/include/libmount -I/usr/include/blkid -I/usr/include/gio-unix-2.0 -I/usr/include/gtk-3.0 -I/usr/include/pango-1.0 -I/usr/include/harfbuzz -I/usr/include/freetype2 -I/usr/include/fribidi -I/usr/include/uuid -I/usr/include/cairo -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/x86_64-linux-gnu -I/usr/include/atk-1.0 -I/usr/include/at-spi2-atk/2.0 -I/usr/include/dbus-1.0 -I/usr/lib/x86_64-linux-gnu/dbus-1.0/include -I/usr/include/at-spi-2.0 -I/usr/include/cacard -I/usr/include/nss -I/usr/include/nspr -I/usr/include/PCSC -fdiagnostics-color=auto -Wall -Winvalid-pch -Werror -std=gnu11 -O2 -g -fstack-protector-strong -gsplit-dwarf -Wempty-body -Wendif-labels -Wexpansion-to-defined -Wformat-security -Wformat-y2k -Wignored-qualifiers -Wimplicit-fallthrough=2 -Winit-self -Wmissing-format-attribute -Wmissing-prototypes -Wnested-externs -Wold-style-declaration -Wold-style-definition -Wredundant-decls -Wshadow=local -Wstrict-prototypes -Wtype-limits -Wundef -Wvla -Wwrite-strings -Wno-missing-include-dirs -Wno-psabi -Wno-shift-negative-value -isystem /home/lizj/workspace/qemu/qemu/linux-headers -isystem linux-headers -iquote . 
-iquote /home/lizj/workspace/qemu/qemu -iquote /home/lizj/workspace/qemu/qemu/include -iquote /home/lizj/workspace/qemu/qemu/host/include/x86_64 -iquote /home/lizj/workspace/qemu/qemu/host/include/generic -iquote /home/lizj/workspace/qemu/qemu/tcg/i386 -pthread -mcx16 -msse2 -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -fno-strict-aliasing -fno-common -fwrapv -fzero-call-used-regs=used-gpr -fPIE -D_DEFAULT_SOURCE -D_XOPEN_SOURCE=600 -DNCURSES_WIDECHAR=1 -DSTRUCT_IOVEC_DEFINED -MD -MQ libcommon.a.p/migration_rdma.c.o -MF libcommon.a.p/migration_rdma.c.o.d -o libcommon.a.p/migration_rdma.c.o -c ../migration/rdma.c ../migration/rdma.c: In function ‘qemu_rdma_resolve_host’: ../migration/rdma.c:815:17: error: unused variable ‘local_errp’ [-Werror=unused-variable] 815 | Error **local_errp = err ? NULL : &err; | ^~~~~~~~~~ ../migration/rdma.c: In function ‘qemu_rdma_dest_init’: ../migration/rdma.c:2504:17: error: unused variable ‘local_errp’ [-Werror=unused-variable] 2504 | Error **local_errp = err ? NULL : &err; | ^~~~~~~~~~ cc1: all warnings being treated as errors [17/19] Compiling C object qemu-img.p/qemu-img.c.o After this fixing, feel free to add Tested-by: Li zhijian <lizhijian@fujitsu.com> On 26/03/2025 17:52, Jack Wang wrote: > I hit following error which testing migration in pure RoCE env: > "-incoming rdma:[::]:8089: RDMA ERROR: You only have RoCE / iWARP devices in your > systems and your management software has specified '[::]', but IPv6 over RoCE / > iWARP is not supported in Linux.#012'." > > In our setup, we use rdma bind on ipv6 on target host, while connect from source > with ipv4, remove the qemu_rdma_broken_ipv6_kernel, migration just work > fine. > > Checking the git history, the function was added since introducing of > rdma migration, which is more than 10 years ago. linux-rdma has > improved support on RoCE/iWARP for ipv6 over past years. 
There are a few fixes > back in 2016 seems related to the issue, eg: > aeb76df46d11 ("IB/core: Set routable RoCE gid type for ipv4/ipv6 networks") > > other fixes back in 2018, eg: > 052eac6eeb56 RDMA/cma: Update RoCE multicast routines to use net namespace > 8d20a1f0ecd5 RDMA/cma: Fix rdma_cm raw IB path setting for RoCE > 9327c7afdce3 RDMA/cma: Provide a function to set RoCE path record L2 parameters > 5c181bda77f4 RDMA/cma: Set default GID type as RoCE when resolving RoCE route > 3c7f67d1880d IB/cma: Fix default RoCE type setting > be1d325a3358 IB/core: Set RoCEv2 MGID according to spec > 63a5f483af0e IB/cma: Set default gid type to RoCEv2 > > So remove the outdated function and it's usage. > > Cc: Peter Xu <peterx@redhat.com> > Cc: Li Zhijian <lizhijian@fujitsu.com> > Cc: Yu Zhang <yu.zhang@ionos.com> > Cc: qemu-devel@nongnu.org > Cc: linux-rdma@vger.kernel.org > Cc: michael@flatgalaxy.com > Signed-off-by: Jack Wang <jinpu.wang@ionos.com> > --- > migration/rdma.c | 157 ----------------------------------------------- > 1 file changed, 157 deletions(-) > > diff --git a/migration/rdma.c b/migration/rdma.c > index 76fb0349238a..5ce628ddeef0 100644 > --- a/migration/rdma.c > +++ b/migration/rdma.c > @@ -767,149 +767,6 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) > trace_qemu_rdma_dump_gid(who, sgid, dgid); > } > > -/* > - * As of now, IPv6 over RoCE / iWARP is not supported by linux. > - * We will try the next addrinfo struct, and fail if there are > - * no other valid addresses to bind against. > - * > - * If user is listening on '[::]', then we will not have a opened a device > - * yet and have no way of verifying if the device is RoCE or not. > - * > - * In this case, the source VM will throw an error for ALL types of > - * connections (both IPv4 and IPv6) if the destination machine does not have > - * a regular infiniband network available for use. 
> - * > - * The only way to guarantee that an error is thrown for broken kernels is > - * for the management software to choose a *specific* interface at bind time > - * and validate what time of hardware it is. > - * > - * Unfortunately, this puts the user in a fix: > - * > - * If the source VM connects with an IPv4 address without knowing that the > - * destination has bound to '[::]' the migration will unconditionally fail > - * unless the management software is explicitly listening on the IPv4 > - * address while using a RoCE-based device. > - * > - * If the source VM connects with an IPv6 address, then we're OK because we can > - * throw an error on the source (and similarly on the destination). > - * > - * But in mixed environments, this will be broken for a while until it is fixed > - * inside linux. > - * > - * We do provide a *tiny* bit of help in this function: We can list all of the > - * devices in the system and check to see if all the devices are RoCE or > - * Infiniband. > - * > - * If we detect that we have a *pure* RoCE environment, then we can safely > - * thrown an error even if the management software has specified '[::]' as the > - * bind address. > - * > - * However, if there is are multiple hetergeneous devices, then we cannot make > - * this assumption and the user just has to be sure they know what they are > - * doing. > - * > - * Patches are being reviewed on linux-rdma. > - */ > -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) > -{ > - /* This bug only exists in linux, to our knowledge. */ > -#ifdef CONFIG_LINUX > - struct ibv_port_attr port_attr; > - > - /* > - * Verbs are only NULL if management has bound to '[::]'. > - * > - * Let's iterate through all the devices and see if there any pure IB > - * devices (non-ethernet). > - * > - * If not, then we can safely proceed with the migration. > - * Otherwise, there are no guarantees until the bug is fixed in linux. 
> - */ > - if (!verbs) { > - int num_devices; > - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); > - bool roce_found = false; > - bool ib_found = false; > - > - for (int x = 0; x < num_devices; x++) { > - verbs = ibv_open_device(dev_list[x]); > - /* > - * ibv_open_device() is not documented to set errno. If > - * it does, it's somebody else's doc bug. If it doesn't, > - * the use of errno below is wrong. > - * TODO Find out whether ibv_open_device() sets errno. > - */ > - if (!verbs) { > - if (errno == EPERM) { > - continue; > - } else { > - error_setg_errno(errp, errno, > - "could not open RDMA device context"); > - return -1; > - } > - } > - > - if (ibv_query_port(verbs, 1, &port_attr)) { > - ibv_close_device(verbs); > - error_setg(errp, > - "RDMA ERROR: Could not query initial IB port"); > - return -1; > - } > - > - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { > - ib_found = true; > - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > - roce_found = true; > - } > - > - ibv_close_device(verbs); > - > - } > - > - if (roce_found) { > - if (ib_found) { > - warn_report("migrations may fail:" > - " IPv6 over RoCE / iWARP in linux" > - " is broken. But since you appear to have a" > - " mixed RoCE / IB environment, be sure to only" > - " migrate over the IB fabric until the kernel " > - " fixes the bug."); > - } else { > - error_setg(errp, "RDMA ERROR: " > - "You only have RoCE / iWARP devices in your systems" > - " and your management software has specified '[::]'" > - ", but IPv6 over RoCE / iWARP is not supported in Linux."); > - return -1; > - } > - } > - > - return 0; > - } > - > - /* > - * If we have a verbs context, that means that some other than '[::]' was > - * used by the management software for binding. In which case we can > - * actually warn the user about a potentially broken kernel. 
> - */ > - > - /* IB ports start with 1, not 0 */ > - if (ibv_query_port(verbs, 1, &port_attr)) { > - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); > - return -1; > - } > - > - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > - error_setg(errp, "RDMA ERROR: " > - "Linux kernel's RoCE / iWARP does not support IPv6 " > - "(but patches on linux-rdma in progress)"); > - return -1; > - } > - > -#endif > - > - return 0; > -} > - > /* > * Figure out which RDMA device corresponds to the requested IP hostname > * Also create the initial connection manager identifiers for opening > @@ -964,13 +821,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) > ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, > RDMA_RESOLVE_TIMEOUT_MS); > if (ret >= 0) { > - if (e->ai_family == AF_INET6) { > - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, > - local_errp); > - if (ret < 0) { > - continue; > - } > - } > error_free(err); > goto route; > } > @@ -2672,13 +2522,6 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) > if (ret < 0) { > continue; > } > - if (e->ai_family == AF_INET6) { > - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, > - local_errp); > - if (ret < 0) { > - continue; > - } > - } > error_free(err); > break; > }
Hi Zhijian, On Thu, Mar 27, 2025 at 2:18 AM Zhijian Li (Fujitsu) <lizhijian@fujitsu.com> wrote: > > > Please fix this compiling error. > > > cc -m64 -mcx16 -Ilibcommon.a.p -Isubprojects/libvduse -I../subprojects/libvduse -I/usr/include/p11-kit-1 -I/usr/include/pixman-1 -I/usr/include/libpng16 -I/usr/include/spice-server -I/usr/include/spice-1 -I/usr/include/libusb-1.0 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include -I/usr/include/libmount -I/usr/include/blkid -I/usr/include/gio-unix-2.0 -I/usr/include/gtk-3.0 -I/usr/include/pango-1.0 -I/usr/include/harfbuzz -I/usr/include/freetype2 -I/usr/include/fribidi -I/usr/include/uuid -I/usr/include/cairo -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/x86_64-linux-gnu -I/usr/include/atk-1.0 -I/usr/include/at-spi2-atk/2.0 -I/usr/include/dbus-1.0 -I/usr/lib/x86_64-linux-gnu/dbus-1.0/include -I/usr/include/at-spi-2.0 -I/usr/include/cacard -I/usr/include/nss -I/usr/include/nspr -I/usr/include/PCSC -fdiagnostics-color=auto -Wall -Winvalid-pch -Werror -std=gnu11 -O2 -g -fstack-protector-strong -gsplit-dwarf -Wempty-body -Wendif-labels -Wexpansion-to-defined -Wformat-security -Wformat-y2k -Wignored-qualifiers -Wimplicit-fallthrough=2 -Winit-self -Wmissing-format-attribute -Wmissing-prototypes -Wnested-externs -Wold-style-declaration -Wold-style-definition -Wredundant-decls -Wshadow=local -Wstrict-prototypes -Wtype-limits -Wundef -Wvla -Wwrite-strings -Wno-missing-include-dirs -Wno-psabi -Wno-shift-negative-value -isystem /home/lizj/workspace/qemu/qemu/linux-headers -isystem linux-headers -iquote . 
-iquote /home/lizj/workspace/qemu/qemu -iquote /home/lizj/workspace/qemu/qemu/include -iquote /home/lizj/workspace/qemu/qemu/host/include/x86_64 -iquote /home/lizj/workspace/qemu/qemu/host/include/generic -iquote /home/lizj/workspace/qemu/qemu/tcg/i386 -pthread -mcx16 -msse2 -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -fno-strict-aliasing -fno-common -fwrapv -fzero-call-used-regs=used-gpr -fPIE -D_DEFAULT_SOURCE -D_XOPEN_SOURCE=600 -DNCURSES_WIDECHAR=1 -DSTRUCT_IOVEC_DEFINED -MD -MQ libcommon.a.p/migration_rdma.c.o -MF libcommon.a.p/migration_rdma.c.o.d -o libcommon.a.p/migration_rdma.c.o -c ../migration/rdma.c > ../migration/rdma.c: In function ‘qemu_rdma_resolve_host’: > ../migration/rdma.c:815:17: error: unused variable ‘local_errp’ [-Werror=unused-variable] > 815 | Error **local_errp = err ? NULL : &err; > | ^~~~~~~~~~ > ../migration/rdma.c: In function ‘qemu_rdma_dest_init’: > ../migration/rdma.c:2504:17: error: unused variable ‘local_errp’ [-Werror=unused-variable] > 2504 | Error **local_errp = err ? NULL : &err; > | ^~~~~~~~~~ > cc1: all warnings being treated as errors > [17/19] Compiling C object qemu-img.p/qemu-img.c.o > > > After this fixing, feel free to add > > Tested-by: Li zhijian <lizhijian@fujitsu.com> Thx for checking, will fix them. > > > On 26/03/2025 17:52, Jack Wang wrote: > > I hit following error which testing migration in pure RoCE env: > > "-incoming rdma:[::]:8089: RDMA ERROR: You only have RoCE / iWARP devices in your > > systems and your management software has specified '[::]', but IPv6 over RoCE / > > iWARP is not supported in Linux.#012'." > > > > In our setup, we use rdma bind on ipv6 on target host, while connect from source > > with ipv4, remove the qemu_rdma_broken_ipv6_kernel, migration just work > > fine. > > > > Checking the git history, the function was added since introducing of > > rdma migration, which is more than 10 years ago. linux-rdma has > > improved support on RoCE/iWARP for ipv6 over past years. 
There are a few fixes > > back in 2016 seems related to the issue, eg: > > aeb76df46d11 ("IB/core: Set routable RoCE gid type for ipv4/ipv6 networks") > > > > other fixes back in 2018, eg: > > 052eac6eeb56 RDMA/cma: Update RoCE multicast routines to use net namespace > > 8d20a1f0ecd5 RDMA/cma: Fix rdma_cm raw IB path setting for RoCE > > 9327c7afdce3 RDMA/cma: Provide a function to set RoCE path record L2 parameters > > 5c181bda77f4 RDMA/cma: Set default GID type as RoCE when resolving RoCE route > > 3c7f67d1880d IB/cma: Fix default RoCE type setting > > be1d325a3358 IB/core: Set RoCEv2 MGID according to spec > > 63a5f483af0e IB/cma: Set default gid type to RoCEv2 > > > > So remove the outdated function and it's usage. > > > > Cc: Peter Xu <peterx@redhat.com> > > Cc: Li Zhijian <lizhijian@fujitsu.com> > > Cc: Yu Zhang <yu.zhang@ionos.com> > > Cc: qemu-devel@nongnu.org > > Cc: linux-rdma@vger.kernel.org > > Cc: michael@flatgalaxy.com > > Signed-off-by: Jack Wang <jinpu.wang@ionos.com> > > --- > > migration/rdma.c | 157 ----------------------------------------------- > > 1 file changed, 157 deletions(-) > > > > diff --git a/migration/rdma.c b/migration/rdma.c > > index 76fb0349238a..5ce628ddeef0 100644 > > --- a/migration/rdma.c > > +++ b/migration/rdma.c > > @@ -767,149 +767,6 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) > > trace_qemu_rdma_dump_gid(who, sgid, dgid); > > } > > > > -/* > > - * As of now, IPv6 over RoCE / iWARP is not supported by linux. > > - * We will try the next addrinfo struct, and fail if there are > > - * no other valid addresses to bind against. > > - * > > - * If user is listening on '[::]', then we will not have a opened a device > > - * yet and have no way of verifying if the device is RoCE or not. 
> > - * > > - * In this case, the source VM will throw an error for ALL types of > > - * connections (both IPv4 and IPv6) if the destination machine does not have > > - * a regular infiniband network available for use. > > - * > > - * The only way to guarantee that an error is thrown for broken kernels is > > - * for the management software to choose a *specific* interface at bind time > > - * and validate what time of hardware it is. > > - * > > - * Unfortunately, this puts the user in a fix: > > - * > > - * If the source VM connects with an IPv4 address without knowing that the > > - * destination has bound to '[::]' the migration will unconditionally fail > > - * unless the management software is explicitly listening on the IPv4 > > - * address while using a RoCE-based device. > > - * > > - * If the source VM connects with an IPv6 address, then we're OK because we can > > - * throw an error on the source (and similarly on the destination). > > - * > > - * But in mixed environments, this will be broken for a while until it is fixed > > - * inside linux. > > - * > > - * We do provide a *tiny* bit of help in this function: We can list all of the > > - * devices in the system and check to see if all the devices are RoCE or > > - * Infiniband. > > - * > > - * If we detect that we have a *pure* RoCE environment, then we can safely > > - * thrown an error even if the management software has specified '[::]' as the > > - * bind address. > > - * > > - * However, if there is are multiple hetergeneous devices, then we cannot make > > - * this assumption and the user just has to be sure they know what they are > > - * doing. > > - * > > - * Patches are being reviewed on linux-rdma. > > - */ > > -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) > > -{ > > - /* This bug only exists in linux, to our knowledge. 
*/ > > -#ifdef CONFIG_LINUX > > - struct ibv_port_attr port_attr; > > - > > - /* > > - * Verbs are only NULL if management has bound to '[::]'. > > - * > > - * Let's iterate through all the devices and see if there any pure IB > > - * devices (non-ethernet). > > - * > > - * If not, then we can safely proceed with the migration. > > - * Otherwise, there are no guarantees until the bug is fixed in linux. > > - */ > > - if (!verbs) { > > - int num_devices; > > - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); > > - bool roce_found = false; > > - bool ib_found = false; > > - > > - for (int x = 0; x < num_devices; x++) { > > - verbs = ibv_open_device(dev_list[x]); > > - /* > > - * ibv_open_device() is not documented to set errno. If > > - * it does, it's somebody else's doc bug. If it doesn't, > > - * the use of errno below is wrong. > > - * TODO Find out whether ibv_open_device() sets errno. > > - */ > > - if (!verbs) { > > - if (errno == EPERM) { > > - continue; > > - } else { > > - error_setg_errno(errp, errno, > > - "could not open RDMA device context"); > > - return -1; > > - } > > - } > > - > > - if (ibv_query_port(verbs, 1, &port_attr)) { > > - ibv_close_device(verbs); > > - error_setg(errp, > > - "RDMA ERROR: Could not query initial IB port"); > > - return -1; > > - } > > - > > - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { > > - ib_found = true; > > - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > > - roce_found = true; > > - } > > - > > - ibv_close_device(verbs); > > - > > - } > > - > > - if (roce_found) { > > - if (ib_found) { > > - warn_report("migrations may fail:" > > - " IPv6 over RoCE / iWARP in linux" > > - " is broken. 
But since you appear to have a" > > - " mixed RoCE / IB environment, be sure to only" > > - " migrate over the IB fabric until the kernel " > > - " fixes the bug."); > > - } else { > > - error_setg(errp, "RDMA ERROR: " > > - "You only have RoCE / iWARP devices in your systems" > > - " and your management software has specified '[::]'" > > - ", but IPv6 over RoCE / iWARP is not supported in Linux."); > > - return -1; > > - } > > - } > > - > > - return 0; > > - } > > - > > - /* > > - * If we have a verbs context, that means that some other than '[::]' was > > - * used by the management software for binding. In which case we can > > - * actually warn the user about a potentially broken kernel. > > - */ > > - > > - /* IB ports start with 1, not 0 */ > > - if (ibv_query_port(verbs, 1, &port_attr)) { > > - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); > > - return -1; > > - } > > - > > - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > > - error_setg(errp, "RDMA ERROR: " > > - "Linux kernel's RoCE / iWARP does not support IPv6 " > > - "(but patches on linux-rdma in progress)"); > > - return -1; > > - } > > - > > -#endif > > - > > - return 0; > > -} > > - > > /* > > * Figure out which RDMA device corresponds to the requested IP hostname > > * Also create the initial connection manager identifiers for opening > > @@ -964,13 +821,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) > > ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, > > RDMA_RESOLVE_TIMEOUT_MS); > > if (ret >= 0) { > > - if (e->ai_family == AF_INET6) { > > - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, > > - local_errp); > > - if (ret < 0) { > > - continue; > > - } > > - } > > error_free(err); > > goto route; > > } > > @@ -2672,13 +2522,6 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) > > if (ret < 0) { > > continue; > > } > > - if (e->ai_family == AF_INET6) { > > - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, > > 
- local_errp); > > - if (ret < 0) { > > - continue; > > - } > > - } > > error_free(err); > > break; > > }
Excellent find. Thank you very much for checking on the history. Hopefully my comments were not too hard to read. =) FYI: I've since left Akamai last year and now work at Nvidia. Reviewed-by: Michael Galaxy <mrgalaxy@nvidia.com> On 3/26/25 04:52, Jack Wang wrote: > I hit following error which testing migration in pure RoCE env: > "-incoming rdma:[::]:8089: RDMA ERROR: You only have RoCE / iWARP devices in your > systems and your management software has specified '[::]', but IPv6 over RoCE / > iWARP is not supported in Linux.#012'." > > In our setup, we use rdma bind on ipv6 on target host, while connect from source > with ipv4, remove the qemu_rdma_broken_ipv6_kernel, migration just work > fine. > > Checking the git history, the function was added since introducing of > rdma migration, which is more than 10 years ago. linux-rdma has > improved support on RoCE/iWARP for ipv6 over past years. There are a few fixes > back in 2016 seems related to the issue, eg: > aeb76df46d11 ("IB/core: Set routable RoCE gid type for ipv4/ipv6 networks") > > other fixes back in 2018, eg: > 052eac6eeb56 RDMA/cma: Update RoCE multicast routines to use net namespace > 8d20a1f0ecd5 RDMA/cma: Fix rdma_cm raw IB path setting for RoCE > 9327c7afdce3 RDMA/cma: Provide a function to set RoCE path record L2 parameters > 5c181bda77f4 RDMA/cma: Set default GID type as RoCE when resolving RoCE route > 3c7f67d1880d IB/cma: Fix default RoCE type setting > be1d325a3358 IB/core: Set RoCEv2 MGID according to spec > 63a5f483af0e IB/cma: Set default gid type to RoCEv2 > > So remove the outdated function and it's usage. 
> > Cc: Peter Xu<peterx@redhat.com> > Cc: Li Zhijian<lizhijian@fujitsu.com> > Cc: Yu Zhang<yu.zhang@ionos.com> > Cc:qemu-devel@nongnu.org > Cc:linux-rdma@vger.kernel.org > Cc:michael@flatgalaxy.com > Signed-off-by: Jack Wang<jinpu.wang@ionos.com> > --- > migration/rdma.c | 157 ----------------------------------------------- > 1 file changed, 157 deletions(-) > > diff --git a/migration/rdma.c b/migration/rdma.c > index 76fb0349238a..5ce628ddeef0 100644 > --- a/migration/rdma.c > +++ b/migration/rdma.c > @@ -767,149 +767,6 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) > trace_qemu_rdma_dump_gid(who, sgid, dgid); > } > > -/* > - * As of now, IPv6 over RoCE / iWARP is not supported by linux. > - * We will try the next addrinfo struct, and fail if there are > - * no other valid addresses to bind against. > - * > - * If user is listening on '[::]', then we will not have a opened a device > - * yet and have no way of verifying if the device is RoCE or not. > - * > - * In this case, the source VM will throw an error for ALL types of > - * connections (both IPv4 and IPv6) if the destination machine does not have > - * a regular infiniband network available for use. > - * > - * The only way to guarantee that an error is thrown for broken kernels is > - * for the management software to choose a *specific* interface at bind time > - * and validate what time of hardware it is. > - * > - * Unfortunately, this puts the user in a fix: > - * > - * If the source VM connects with an IPv4 address without knowing that the > - * destination has bound to '[::]' the migration will unconditionally fail > - * unless the management software is explicitly listening on the IPv4 > - * address while using a RoCE-based device. > - * > - * If the source VM connects with an IPv6 address, then we're OK because we can > - * throw an error on the source (and similarly on the destination). 
> - * > - * But in mixed environments, this will be broken for a while until it is fixed > - * inside linux. > - * > - * We do provide a *tiny* bit of help in this function: We can list all of the > - * devices in the system and check to see if all the devices are RoCE or > - * Infiniband. > - * > - * If we detect that we have a *pure* RoCE environment, then we can safely > - * thrown an error even if the management software has specified '[::]' as the > - * bind address. > - * > - * However, if there is are multiple hetergeneous devices, then we cannot make > - * this assumption and the user just has to be sure they know what they are > - * doing. > - * > - * Patches are being reviewed on linux-rdma. > - */ > -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) > -{ > - /* This bug only exists in linux, to our knowledge. */ > -#ifdef CONFIG_LINUX > - struct ibv_port_attr port_attr; > - > - /* > - * Verbs are only NULL if management has bound to '[::]'. > - * > - * Let's iterate through all the devices and see if there any pure IB > - * devices (non-ethernet). > - * > - * If not, then we can safely proceed with the migration. > - * Otherwise, there are no guarantees until the bug is fixed in linux. > - */ > - if (!verbs) { > - int num_devices; > - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); > - bool roce_found = false; > - bool ib_found = false; > - > - for (int x = 0; x < num_devices; x++) { > - verbs = ibv_open_device(dev_list[x]); > - /* > - * ibv_open_device() is not documented to set errno. If > - * it does, it's somebody else's doc bug. If it doesn't, > - * the use of errno below is wrong. > - * TODO Find out whether ibv_open_device() sets errno. 
> - */ > - if (!verbs) { > - if (errno == EPERM) { > - continue; > - } else { > - error_setg_errno(errp, errno, > - "could not open RDMA device context"); > - return -1; > - } > - } > - > - if (ibv_query_port(verbs, 1, &port_attr)) { > - ibv_close_device(verbs); > - error_setg(errp, > - "RDMA ERROR: Could not query initial IB port"); > - return -1; > - } > - > - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { > - ib_found = true; > - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > - roce_found = true; > - } > - > - ibv_close_device(verbs); > - > - } > - > - if (roce_found) { > - if (ib_found) { > - warn_report("migrations may fail:" > - " IPv6 over RoCE / iWARP in linux" > - " is broken. But since you appear to have a" > - " mixed RoCE / IB environment, be sure to only" > - " migrate over the IB fabric until the kernel " > - " fixes the bug."); > - } else { > - error_setg(errp, "RDMA ERROR: " > - "You only have RoCE / iWARP devices in your systems" > - " and your management software has specified '[::]'" > - ", but IPv6 over RoCE / iWARP is not supported in Linux."); > - return -1; > - } > - } > - > - return 0; > - } > - > - /* > - * If we have a verbs context, that means that some other than '[::]' was > - * used by the management software for binding. In which case we can > - * actually warn the user about a potentially broken kernel. 
> - */ > - > - /* IB ports start with 1, not 0 */ > - if (ibv_query_port(verbs, 1, &port_attr)) { > - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); > - return -1; > - } > - > - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > - error_setg(errp, "RDMA ERROR: " > - "Linux kernel's RoCE / iWARP does not support IPv6 " > - "(but patches on linux-rdma in progress)"); > - return -1; > - } > - > -#endif > - > - return 0; > -} > - > /* > * Figure out which RDMA device corresponds to the requested IP hostname > * Also create the initial connection manager identifiers for opening > @@ -964,13 +821,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) > ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, > RDMA_RESOLVE_TIMEOUT_MS); > if (ret >= 0) { > - if (e->ai_family == AF_INET6) { > - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, > - local_errp); > - if (ret < 0) { > - continue; > - } > - } > error_free(err); > goto route; > } > @@ -2672,13 +2522,6 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) > if (ret < 0) { > continue; > } > - if (e->ai_family == AF_INET6) { > - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, > - local_errp); > - if (ret < 0) { > - continue; > - } > - } > error_free(err); > break; > }
Excellent find. Thank you very much for checking on the history. Hopefully my comments were not too hard to read. =) FYI: I left Akamai last year and now work at Nvidia. Reviewed-by: Michael Galaxy <mrgalaxy@nvidia.com> > On 3/26/25 04:52, Jack Wang wrote: >> I hit the following error while testing migration in a pure RoCE env: >> "-incoming rdma:[::]:8089: RDMA ERROR: You only have RoCE / iWARP devices in your >> systems and your management software has specified '[::]', but IPv6 over RoCE / >> iWARP is not supported in Linux.#012'." >> >> In our setup, we use rdma bind on ipv6 on the target host, while connecting from the source >> with ipv4; after removing qemu_rdma_broken_ipv6_kernel, migration just works >> fine. >> >> Checking the git history, the function was added when rdma migration was >> introduced, which is more than 10 years ago. linux-rdma has >> improved support on RoCE/iWARP for ipv6 over the past years. There are a few fixes >> from back in 2016 that seem related to the issue, e.g.: >> aeb76df46d11 ("IB/core: Set routable RoCE gid type for ipv4/ipv6 networks") >> >> and other fixes from back in 2018, e.g.: >> 052eac6eeb56 RDMA/cma: Update RoCE multicast routines to use net namespace >> 8d20a1f0ecd5 RDMA/cma: Fix rdma_cm raw IB path setting for RoCE >> 9327c7afdce3 RDMA/cma: Provide a function to set RoCE path record L2 parameters >> 5c181bda77f4 RDMA/cma: Set default GID type as RoCE when resolving RoCE route >> 3c7f67d1880d IB/cma: Fix default RoCE type setting >> be1d325a3358 IB/core: Set RoCEv2 MGID according to spec >> 63a5f483af0e IB/cma: Set default gid type to RoCEv2 >> >> So remove the outdated function and its usage. 
>> >> Cc: Peter Xu<peterx@redhat.com> >> Cc: Li Zhijian<lizhijian@fujitsu.com> >> Cc: Yu Zhang<yu.zhang@ionos.com> >> Cc:qemu-devel@nongnu.org >> Cc:linux-rdma@vger.kernel.org >> Cc:michael@flatgalaxy.com >> Signed-off-by: Jack Wang<jinpu.wang@ionos.com> >> --- >> migration/rdma.c | 157 ----------------------------------------------- >> 1 file changed, 157 deletions(-) >> >> diff --git a/migration/rdma.c b/migration/rdma.c >> index 76fb0349238a..5ce628ddeef0 100644 >> --- a/migration/rdma.c >> +++ b/migration/rdma.c >> @@ -767,149 +767,6 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) >> trace_qemu_rdma_dump_gid(who, sgid, dgid); >> } >> >> -/* >> - * As of now, IPv6 over RoCE / iWARP is not supported by linux. >> - * We will try the next addrinfo struct, and fail if there are >> - * no other valid addresses to bind against. >> - * >> - * If user is listening on '[::]', then we will not have a opened a device >> - * yet and have no way of verifying if the device is RoCE or not. >> - * >> - * In this case, the source VM will throw an error for ALL types of >> - * connections (both IPv4 and IPv6) if the destination machine does not have >> - * a regular infiniband network available for use. >> - * >> - * The only way to guarantee that an error is thrown for broken kernels is >> - * for the management software to choose a *specific* interface at bind time >> - * and validate what time of hardware it is. >> - * >> - * Unfortunately, this puts the user in a fix: >> - * >> - * If the source VM connects with an IPv4 address without knowing that the >> - * destination has bound to '[::]' the migration will unconditionally fail >> - * unless the management software is explicitly listening on the IPv4 >> - * address while using a RoCE-based device. >> - * >> - * If the source VM connects with an IPv6 address, then we're OK because we can >> - * throw an error on the source (and similarly on the destination). 
>> - * >> - * But in mixed environments, this will be broken for a while until it is fixed >> - * inside linux. >> - * >> - * We do provide a *tiny* bit of help in this function: We can list all of the >> - * devices in the system and check to see if all the devices are RoCE or >> - * Infiniband. >> - * >> - * If we detect that we have a *pure* RoCE environment, then we can safely >> - * thrown an error even if the management software has specified '[::]' as the >> - * bind address. >> - * >> - * However, if there is are multiple hetergeneous devices, then we cannot make >> - * this assumption and the user just has to be sure they know what they are >> - * doing. >> - * >> - * Patches are being reviewed on linux-rdma. >> - */ >> -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) >> -{ >> - /* This bug only exists in linux, to our knowledge. */ >> -#ifdef CONFIG_LINUX >> - struct ibv_port_attr port_attr; >> - >> - /* >> - * Verbs are only NULL if management has bound to '[::]'. >> - * >> - * Let's iterate through all the devices and see if there any pure IB >> - * devices (non-ethernet). >> - * >> - * If not, then we can safely proceed with the migration. >> - * Otherwise, there are no guarantees until the bug is fixed in linux. >> - */ >> - if (!verbs) { >> - int num_devices; >> - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); >> - bool roce_found = false; >> - bool ib_found = false; >> - >> - for (int x = 0; x < num_devices; x++) { >> - verbs = ibv_open_device(dev_list[x]); >> - /* >> - * ibv_open_device() is not documented to set errno. If >> - * it does, it's somebody else's doc bug. If it doesn't, >> - * the use of errno below is wrong. >> - * TODO Find out whether ibv_open_device() sets errno. 
>> - */ >> - if (!verbs) { >> - if (errno == EPERM) { >> - continue; >> - } else { >> - error_setg_errno(errp, errno, >> - "could not open RDMA device context"); >> - return -1; >> - } >> - } >> - >> - if (ibv_query_port(verbs, 1, &port_attr)) { >> - ibv_close_device(verbs); >> - error_setg(errp, >> - "RDMA ERROR: Could not query initial IB port"); >> - return -1; >> - } >> - >> - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { >> - ib_found = true; >> - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { >> - roce_found = true; >> - } >> - >> - ibv_close_device(verbs); >> - >> - } >> - >> - if (roce_found) { >> - if (ib_found) { >> - warn_report("migrations may fail:" >> - " IPv6 over RoCE / iWARP in linux" >> - " is broken. But since you appear to have a" >> - " mixed RoCE / IB environment, be sure to only" >> - " migrate over the IB fabric until the kernel " >> - " fixes the bug."); >> - } else { >> - error_setg(errp, "RDMA ERROR: " >> - "You only have RoCE / iWARP devices in your systems" >> - " and your management software has specified '[::]'" >> - ", but IPv6 over RoCE / iWARP is not supported in Linux."); >> - return -1; >> - } >> - } >> - >> - return 0; >> - } >> - >> - /* >> - * If we have a verbs context, that means that some other than '[::]' was >> - * used by the management software for binding. In which case we can >> - * actually warn the user about a potentially broken kernel. 
>> - */ >> - >> - /* IB ports start with 1, not 0 */ >> - if (ibv_query_port(verbs, 1, &port_attr)) { >> - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); >> - return -1; >> - } >> - >> - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { >> - error_setg(errp, "RDMA ERROR: " >> - "Linux kernel's RoCE / iWARP does not support IPv6 " >> - "(but patches on linux-rdma in progress)"); >> - return -1; >> - } >> - >> -#endif >> - >> - return 0; >> -} >> - >> /* >> * Figure out which RDMA device corresponds to the requested IP hostname >> * Also create the initial connection manager identifiers for opening >> @@ -964,13 +821,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) >> ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, >> RDMA_RESOLVE_TIMEOUT_MS); >> if (ret >= 0) { >> - if (e->ai_family == AF_INET6) { >> - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, >> - local_errp); >> - if (ret < 0) { >> - continue; >> - } >> - } >> error_free(err); >> goto route; >> } >> @@ -2672,13 +2522,6 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) >> if (ret < 0) { >> continue; >> } >> - if (e->ai_family == AF_INET6) { >> - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, >> - local_errp); >> - if (ret < 0) { >> - continue; >> - } >> - } >> error_free(err); >> break; >> }
On Wed, Mar 26, 2025 at 2:47 PM Michael Galaxy <michael@flatgalaxy.com> wrote: > > Excellent find. Thank you very much for checking on the history. Hopefully my comments were not too hard to read. =) Yeah, it's pretty clear. > > FYI: I left Akamai last year and now work at Nvidia. > > Reviewed-by: Michael Galaxy <mrgalaxy@nvidia.com> Cool, thanks for the review. All the best at your new job. > > On 3/26/25 04:52, Jack Wang wrote: > > I hit the following error while testing migration in a pure RoCE env: > "-incoming rdma:[::]:8089: RDMA ERROR: You only have RoCE / iWARP devices in your > systems and your management software has specified '[::]', but IPv6 over RoCE / > iWARP is not supported in Linux.#012'." > > In our setup, we use rdma bind on ipv6 on the target host, while connecting from the source > with ipv4; after removing qemu_rdma_broken_ipv6_kernel, migration just works > fine. > > Checking the git history, the function was added when rdma migration was > introduced, which is more than 10 years ago. linux-rdma has > improved support on RoCE/iWARP for ipv6 over the past years. There are a few fixes > from back in 2016 that seem related to the issue, e.g.: > aeb76df46d11 ("IB/core: Set routable RoCE gid type for ipv4/ipv6 networks") > > and other fixes from back in 2018, e.g.: > 052eac6eeb56 RDMA/cma: Update RoCE multicast routines to use net namespace > 8d20a1f0ecd5 RDMA/cma: Fix rdma_cm raw IB path setting for RoCE > 9327c7afdce3 RDMA/cma: Provide a function to set RoCE path record L2 parameters > 5c181bda77f4 RDMA/cma: Set default GID type as RoCE when resolving RoCE route > 3c7f67d1880d IB/cma: Fix default RoCE type setting > be1d325a3358 IB/core: Set RoCEv2 MGID according to spec > 63a5f483af0e IB/cma: Set default gid type to RoCEv2 > > So remove the outdated function and its usage. 
> > Cc: Peter Xu <peterx@redhat.com> > Cc: Li Zhijian <lizhijian@fujitsu.com> > Cc: Yu Zhang <yu.zhang@ionos.com> > Cc: qemu-devel@nongnu.org > Cc: linux-rdma@vger.kernel.org > Cc: michael@flatgalaxy.com > Signed-off-by: Jack Wang <jinpu.wang@ionos.com> > --- > migration/rdma.c | 157 ----------------------------------------------- > 1 file changed, 157 deletions(-) > > diff --git a/migration/rdma.c b/migration/rdma.c > index 76fb0349238a..5ce628ddeef0 100644 > --- a/migration/rdma.c > +++ b/migration/rdma.c > @@ -767,149 +767,6 @@ static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id) > trace_qemu_rdma_dump_gid(who, sgid, dgid); > } > > -/* > - * As of now, IPv6 over RoCE / iWARP is not supported by linux. > - * We will try the next addrinfo struct, and fail if there are > - * no other valid addresses to bind against. > - * > - * If user is listening on '[::]', then we will not have a opened a device > - * yet and have no way of verifying if the device is RoCE or not. > - * > - * In this case, the source VM will throw an error for ALL types of > - * connections (both IPv4 and IPv6) if the destination machine does not have > - * a regular infiniband network available for use. > - * > - * The only way to guarantee that an error is thrown for broken kernels is > - * for the management software to choose a *specific* interface at bind time > - * and validate what time of hardware it is. > - * > - * Unfortunately, this puts the user in a fix: > - * > - * If the source VM connects with an IPv4 address without knowing that the > - * destination has bound to '[::]' the migration will unconditionally fail > - * unless the management software is explicitly listening on the IPv4 > - * address while using a RoCE-based device. > - * > - * If the source VM connects with an IPv6 address, then we're OK because we can > - * throw an error on the source (and similarly on the destination). 
> - * > - * But in mixed environments, this will be broken for a while until it is fixed > - * inside linux. > - * > - * We do provide a *tiny* bit of help in this function: We can list all of the > - * devices in the system and check to see if all the devices are RoCE or > - * Infiniband. > - * > - * If we detect that we have a *pure* RoCE environment, then we can safely > - * thrown an error even if the management software has specified '[::]' as the > - * bind address. > - * > - * However, if there is are multiple hetergeneous devices, then we cannot make > - * this assumption and the user just has to be sure they know what they are > - * doing. > - * > - * Patches are being reviewed on linux-rdma. > - */ > -static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp) > -{ > - /* This bug only exists in linux, to our knowledge. */ > -#ifdef CONFIG_LINUX > - struct ibv_port_attr port_attr; > - > - /* > - * Verbs are only NULL if management has bound to '[::]'. > - * > - * Let's iterate through all the devices and see if there any pure IB > - * devices (non-ethernet). > - * > - * If not, then we can safely proceed with the migration. > - * Otherwise, there are no guarantees until the bug is fixed in linux. > - */ > - if (!verbs) { > - int num_devices; > - struct ibv_device **dev_list = ibv_get_device_list(&num_devices); > - bool roce_found = false; > - bool ib_found = false; > - > - for (int x = 0; x < num_devices; x++) { > - verbs = ibv_open_device(dev_list[x]); > - /* > - * ibv_open_device() is not documented to set errno. If > - * it does, it's somebody else's doc bug. If it doesn't, > - * the use of errno below is wrong. > - * TODO Find out whether ibv_open_device() sets errno. 
> - */ > - if (!verbs) { > - if (errno == EPERM) { > - continue; > - } else { > - error_setg_errno(errp, errno, > - "could not open RDMA device context"); > - return -1; > - } > - } > - > - if (ibv_query_port(verbs, 1, &port_attr)) { > - ibv_close_device(verbs); > - error_setg(errp, > - "RDMA ERROR: Could not query initial IB port"); > - return -1; > - } > - > - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { > - ib_found = true; > - } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > - roce_found = true; > - } > - > - ibv_close_device(verbs); > - > - } > - > - if (roce_found) { > - if (ib_found) { > - warn_report("migrations may fail:" > - " IPv6 over RoCE / iWARP in linux" > - " is broken. But since you appear to have a" > - " mixed RoCE / IB environment, be sure to only" > - " migrate over the IB fabric until the kernel " > - " fixes the bug."); > - } else { > - error_setg(errp, "RDMA ERROR: " > - "You only have RoCE / iWARP devices in your systems" > - " and your management software has specified '[::]'" > - ", but IPv6 over RoCE / iWARP is not supported in Linux."); > - return -1; > - } > - } > - > - return 0; > - } > - > - /* > - * If we have a verbs context, that means that some other than '[::]' was > - * used by the management software for binding. In which case we can > - * actually warn the user about a potentially broken kernel. 
> - */ > - > - /* IB ports start with 1, not 0 */ > - if (ibv_query_port(verbs, 1, &port_attr)) { > - error_setg(errp, "RDMA ERROR: Could not query initial IB port"); > - return -1; > - } > - > - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { > - error_setg(errp, "RDMA ERROR: " > - "Linux kernel's RoCE / iWARP does not support IPv6 " > - "(but patches on linux-rdma in progress)"); > - return -1; > - } > - > -#endif > - > - return 0; > -} > - > /* > * Figure out which RDMA device corresponds to the requested IP hostname > * Also create the initial connection manager identifiers for opening > @@ -964,13 +821,6 @@ static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) > ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, > RDMA_RESOLVE_TIMEOUT_MS); > if (ret >= 0) { > - if (e->ai_family == AF_INET6) { > - ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, > - local_errp); > - if (ret < 0) { > - continue; > - } > - } > error_free(err); > goto route; > } > @@ -2672,13 +2522,6 @@ static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) > if (ret < 0) { > continue; > } > - if (e->ai_family == AF_INET6) { > - ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, > - local_errp); > - if (ret < 0) { > - continue; > - } > - } > error_free(err); > break; > }
© 2016 - 2025 Red Hat, Inc.