migration/cpr-transfer.c | 3 +-
tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
2 files changed, 72 insertions(+), 3 deletions(-)
When the source VM attempts to connect to the destination VM's Unix
domain socket (cpr.sock) during a cpr-transfer test, race conditions can
occur if the socket file isn't ready. This can lead to connection
failures when running tests.
This patch creates and listens on the socket in advance, and passes the
pre-listened FD directly. This avoids timing issues and improves the
reliability of CPR tests.
Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
---
Changes since v1:
- In v1, the patch added a wait loop to poll the existence of the socket
file (cpr_validate_socket_path()).
- This version instead creates the socket beforehand and passes its FD
to the destination QEMU, eliminating the race condition entirely.
- Commit title and message changed accordingly.
---
migration/cpr-transfer.c | 3 +-
tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
2 files changed, 72 insertions(+), 3 deletions(-)
diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
index e1f140359c..7c9de70bad 100644
--- a/migration/cpr-transfer.c
+++ b/migration/cpr-transfer.c
@@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
MigrationAddress *addr = channel->addr;
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
- addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
+ (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
g_autoptr(QIOChannelSocket) sioc = NULL;
SocketAddress *saddr = &addr->u.socket;
diff --git a/tests/qtest/migration/cpr-tests.c b/tests/qtest/migration/cpr-tests.c
index 5536e14610..6f90160e21 100644
--- a/tests/qtest/migration/cpr-tests.c
+++ b/tests/qtest/migration/cpr-tests.c
@@ -50,6 +50,51 @@ static void *test_mode_transfer_start(QTestState *from, QTestState *to)
return NULL;
}
+/*
+ * Create a pre-listened UNIX domain socket at the specified path.
+ *
+ * This is used to eliminate a race condition that can occur
+ * intermittently in qtest during CPR tests. By pre-creating and
+ * listening on the socket, we avoid timing-related issues.
+ */
+static int setup_socket_listener(const char *path)
+{
+ struct sockaddr_un un;
+ size_t pathlen;
+ int sock_fd;
+
+ sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ g_test_message("Failed to create Unix socket");
+ return -1;
+ }
+
+ pathlen = strlen(path);
+ if (pathlen >= sizeof(un.sun_path)) {
+ g_test_message("UNIX socket path '%s' is too long", path);
+ close(sock_fd);
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ strncpy(un.sun_path, path, sizeof(un.sun_path) - 1);
+
+ if (bind(sock_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ g_test_message("Failed to bind socket to %s", path);
+ close(sock_fd);
+ return -1;
+ }
+
+ if (listen(sock_fd, 1) < 0) {
+ g_test_message("Failed to listen on socket %s", path);
+ close(sock_fd);
+ return -1;
+ }
+
+ return sock_fd;
+}
+
/*
* cpr-transfer mode cannot use the target monitor prior to starting the
* migration, and cannot connect synchronously to the monitor, so defer
@@ -60,13 +105,13 @@ static void test_mode_transfer_common(bool incoming_defer)
g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
g_autofree char *mig_path = g_strdup_printf("%s/migsocket", tmpfs);
g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
+ g_autofree char *addr_type, *addr_key, *addr_value;
+ g_autofree char *opts_target;
const char *opts = "-machine aux-ram-share=on -nodefaults";
g_autofree const char *cpr_channel = g_strdup_printf(
"cpr,addr.transport=socket,addr.type=unix,addr.path=%s",
cpr_path);
- g_autofree char *opts_target = g_strdup_printf("-incoming %s %s",
- cpr_channel, opts);
g_autofree char *connect_channels = g_strdup_printf(
"[ { 'channel-type': 'main',"
@@ -75,6 +120,29 @@ static void test_mode_transfer_common(bool incoming_defer)
" 'path': '%s' } } ]",
mig_path);
+ /*
+ * Determine socket address type and value.
+ * If socket creation fails, provide the socket path to the target,
+ * so it can create the Unix domain socket itself.
+ * Otherwise, use the pre-listened socket file descriptor directly.
+ */
+ int cpr_sockfd = setup_socket_listener(cpr_path);
+
+ if (cpr_sockfd < 0) {
+ addr_type = g_strdup("unix");
+ addr_key = g_strdup("path");
+ addr_value = g_strdup(cpr_path);
+ } else {
+ addr_type = g_strdup("fd");
+ addr_key = g_strdup("str");
+ addr_value = g_strdup_printf("%d", cpr_sockfd);
+ }
+
+ opts_target = g_strdup_printf("-incoming cpr,addr.transport=socket,"
+ "addr.type=%s,addr.%s=%s %s",
+ addr_type, addr_key, addr_value, opts);
+
+
MigrateCommon args = {
.start.opts_source = opts,
.start.opts_target = opts_target,
--
2.49.0
On Tue, Jun 10, 2025 at 10:08:49AM -0500, Jaehoon Kim wrote:
> When the source VM attempts to connect to the destination VM's Unix
> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
> occur if the socket file isn't ready. This can lead to connection
> failures when running tests.
>
> This patch creates and listens on the socket in advance, and passes the
> pre-listened FD directly. This avoids timing issues and improves the
> reliability of CPR tests.
>
> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
>
> ---
> Changes since v1:
> - In v1, the patch added a wait loop to poll the existence of the socket
> file (cpr_validate_socket_path()).
>
> - This version instead creates the socket beforehand and passes its FD
> to the destination QEMU, eliminating the race condition entirely.
>
> - Commit title and message changed accordingly.
> ---
> migration/cpr-transfer.c | 3 +-
> tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
> 2 files changed, 72 insertions(+), 3 deletions(-)
> diff --git a/tests/qtest/migration/cpr-tests.c b/tests/qtest/migration/cpr-tests.c
> index 5536e14610..6f90160e21 100644
> --- a/tests/qtest/migration/cpr-tests.c
> +++ b/tests/qtest/migration/cpr-tests.c
> @@ -50,6 +50,51 @@ static void *test_mode_transfer_start(QTestState *from, QTestState *to)
> return NULL;
> }
>
> +/*
> + * Create a pre-listened UNIX domain socket at the specified path.
> + *
> + * This is used to eliminate a race condition that can occur
> + * intermittently in qtest during CPR tests. By pre-creating and
> + * listening on the socket, we avoid timing-related issues.
> + */
> +static int setup_socket_listener(const char *path)
> +{
> + struct sockaddr_un un;
> + size_t pathlen;
> + int sock_fd;
> +
> + sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
> + if (sock_fd < 0) {
> + g_test_message("Failed to create Unix socket");
> + return -1;
> + }
> +
> + pathlen = strlen(path);
> + if (pathlen >= sizeof(un.sun_path)) {
> + g_test_message("UNIX socket path '%s' is too long", path);
> + close(sock_fd);
> + return -1;
> + }
> +
> + memset(&un, 0, sizeof(un));
> + un.sun_family = AF_UNIX;
> + strncpy(un.sun_path, path, sizeof(un.sun_path) - 1);
> +
> + if (bind(sock_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
> + g_test_message("Failed to bind socket to %s", path);
> + close(sock_fd);
> + return -1;
> + }
> +
> + if (listen(sock_fd, 1) < 0) {
> + g_test_message("Failed to listen on socket %s", path);
> + close(sock_fd);
> + return -1;
> + }
> +
> + return sock_fd;
> +}
This is effectively re-implementing 'unix_listen', so just use
that function.
> @@ -75,6 +120,29 @@ static void test_mode_transfer_common(bool incoming_defer)
> " 'path': '%s' } } ]",
> mig_path);
>
> + /*
> + * Determine socket address type and value.
> + * If socket creation fails, provide the socket path to the target,
> + * so it can create the Unix domain socket itself.
> + * Otherwise, use the pre-listened socket file descriptor directly.
> + */
> + int cpr_sockfd = setup_socket_listener(cpr_path);
> +
> + if (cpr_sockfd < 0) {
A failure of this function (or in future 'unix_listen') shouldn't
trigger any fallback logic - we should report it and fail the
test.
> + addr_type = g_strdup("unix");
> + addr_key = g_strdup("path");
> + addr_value = g_strdup(cpr_path);
> + } else {
> + addr_type = g_strdup("fd");
> + addr_key = g_strdup("str");
> + addr_value = g_strdup_printf("%d", cpr_sockfd);
> + }
> +
> + opts_target = g_strdup_printf("-incoming cpr,addr.transport=socket,"
> + "addr.type=%s,addr.%s=%s %s",
> + addr_type, addr_key, addr_value, opts);
> +
> +
> MigrateCommon args = {
> .start.opts_source = opts,
> .start.opts_target = opts_target,
> --
> 2.49.0
>
>
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On 6/10/2025 12:09 PM, Daniel P. Berrangé wrote:
> On Tue, Jun 10, 2025 at 10:08:49AM -0500, Jaehoon Kim wrote:
>> When the source VM attempts to connect to the destination VM's Unix
>> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
>> occur if the socket file isn't ready. This can lead to connection
>> failures when running tests.
>>
>> This patch creates and listens on the socket in advance, and passes the
>> pre-listened FD directly. This avoids timing issues and improves the
>> reliability of CPR tests.
>>
>> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
>> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
>>
>> ---
>> Changes since v1:
>> - In v1, the patch added a wait loop to poll the existence of the socket
>> file (cpr_validate_socket_path()).
>>
>> - This version instead creates the socket beforehand and passes its FD
>> to the destination QEMU, eliminating the race condition entirely.
>>
>> - Commit title and message changed accordingly.
>> ---
>> migration/cpr-transfer.c | 3 +-
>> tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
>> 2 files changed, 72 insertions(+), 3 deletions(-)
>> diff --git a/tests/qtest/migration/cpr-tests.c b/tests/qtest/migration/cpr-tests.c
>> index 5536e14610..6f90160e21 100644
>> --- a/tests/qtest/migration/cpr-tests.c
>> +++ b/tests/qtest/migration/cpr-tests.c
>> @@ -50,6 +50,51 @@ static void *test_mode_transfer_start(QTestState *from, QTestState *to)
>> return NULL;
>> }
>>
>> +/*
>> + * Create a pre-listened UNIX domain socket at the specified path.
>> + *
>> + * This is used to eliminate a race condition that can occur
>> + * intermittently in qtest during CPR tests. By pre-creating and
>> + * listening on the socket, we avoid timing-related issues.
>> + */
>> +static int setup_socket_listener(const char *path)
>> +{
>> + struct sockaddr_un un;
>> + size_t pathlen;
>> + int sock_fd;
>> +
>> + sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
>> + if (sock_fd < 0) {
>> + g_test_message("Failed to create Unix socket");
>> + return -1;
>> + }
>> +
>> + pathlen = strlen(path);
>> + if (pathlen >= sizeof(un.sun_path)) {
>> + g_test_message("UNIX socket path '%s' is too long", path);
>> + close(sock_fd);
>> + return -1;
>> + }
>> +
>> + memset(&un, 0, sizeof(un));
>> + un.sun_family = AF_UNIX;
>> + strncpy(un.sun_path, path, sizeof(un.sun_path) - 1);
>> +
>> + if (bind(sock_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
>> + g_test_message("Failed to bind socket to %s", path);
>> + close(sock_fd);
>> + return -1;
>> + }
>> +
>> + if (listen(sock_fd, 1) < 0) {
>> + g_test_message("Failed to listen on socket %s", path);
>> + close(sock_fd);
>> + return -1;
>> + }
>> +
>> + return sock_fd;
>> +}
> This is effectively re-implementing 'unix_listen', so just use
> that function.
I'll revise the patch to use the already defined function instead of
re-implementing it.
Thanks for pointing it out.
>> @@ -75,6 +120,29 @@ static void test_mode_transfer_common(bool incoming_defer)
>> " 'path': '%s' } } ]",
>> mig_path);
>>
>> + /*
>> + * Determine socket address type and value.
>> + * If socket creation fails, provide the socket path to the target,
>> + * so it can create the Unix domain socket itself.
>> + * Otherwise, use the pre-listened socket file descriptor directly.
>> + */
>> + int cpr_sockfd = setup_socket_listener(cpr_path);
>> +
>> + if (cpr_sockfd < 0) {
> A failure of this function (or in future 'unix_listen') shouldn't
> trigger any fallback logic - we should report it and fail the
> test.
I was considering both options, but I agree with you.
It's better to fail the test and report the error rather than trigger
fallback logic.
I'll update the patch accordingly.
- Jaehoon Kim.
>> + addr_type = g_strdup("unix");
>> + addr_key = g_strdup("path");
>> + addr_value = g_strdup(cpr_path);
>> + } else {
>> + addr_type = g_strdup("fd");
>> + addr_key = g_strdup("str");
>> + addr_value = g_strdup_printf("%d", cpr_sockfd);
>> + }
>> +
>> + opts_target = g_strdup_printf("-incoming cpr,addr.transport=socket,"
>> + "addr.type=%s,addr.%s=%s %s",
>> + addr_type, addr_key, addr_value, opts);
>> +
>> +
>> MigrateCommon args = {
>> .start.opts_source = opts,
>> .start.opts_target = opts_target,
>> --
>> 2.49.0
>>
>>
> With regards,
> Daniel
On Tue, Jun 10, 2025 at 10:08:49AM -0500, Jaehoon Kim wrote:
> When the source VM attempts to connect to the destination VM's Unix
> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
> occur if the socket file isn't ready. This can lead to connection
> failures when running tests.
>
> This patch creates and listens on the socket in advance, and passes the
> pre-listened FD directly. This avoids timing issues and improves the
> reliability of CPR tests.
>
> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
>
> ---
> Changes since v1:
> - In v1, the patch added a wait loop to poll the existence of the socket
> file (cpr_validate_socket_path()).
>
> - This version instead creates the socket beforehand and passes its FD
> to the destination QEMU, eliminating the race condition entirely.
>
> - Commit title and message changed accordingly.
> ---
> migration/cpr-transfer.c | 3 +-
> tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
> 2 files changed, 72 insertions(+), 3 deletions(-)
>
> diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
> index e1f140359c..7c9de70bad 100644
> --- a/migration/cpr-transfer.c
> +++ b/migration/cpr-transfer.c
> @@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
> MigrationAddress *addr = channel->addr;
>
> if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
> - addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
> + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
> + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
>
> g_autoptr(QIOChannelSocket) sioc = NULL;
> SocketAddress *saddr = &addr->u.socket;
This will likely cause a crash in tracing code that is below this diff:
trace_cpr_transfer_input(addr->u.socket.u.q_unix.path);
q_unix.path is NOT valid to access with TYPE_FD.
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On 6/10/2025 12:05 PM, Daniel P. Berrangé wrote:
> On Tue, Jun 10, 2025 at 10:08:49AM -0500, Jaehoon Kim wrote:
>> When the source VM attempts to connect to the destination VM's Unix
>> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
>> occur if the socket file isn't ready. This can lead to connection
>> failures when running tests.
>>
>> This patch creates and listens on the socket in advance, and passes the
>> pre-listened FD directly. This avoids timing issues and improves the
>> reliability of CPR tests.
>>
>> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
>> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
>>
>> ---
>> Changes since v1:
>> - In v1, the patch added a wait loop to poll the existence of the socket
>> file (cpr_validate_socket_path()).
>>
>> - This version instead creates the socket beforehand and passes its FD
>> to the destination QEMU, eliminating the race condition entirely.
>>
>> - Commit title and message changed accordingly.
>> ---
>> migration/cpr-transfer.c | 3 +-
>> tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
>> 2 files changed, 72 insertions(+), 3 deletions(-)
>>
>> diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
>> index e1f140359c..7c9de70bad 100644
>> --- a/migration/cpr-transfer.c
>> +++ b/migration/cpr-transfer.c
>> @@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
>> MigrationAddress *addr = channel->addr;
>>
>> if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
>> - addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
>> + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
>> + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
>>
>> g_autoptr(QIOChannelSocket) sioc = NULL;
>> SocketAddress *saddr = &addr->u.socket;
> This will likely cause a crash in tracing code that is below this diff:
>
> trace_cpr_transfer_input(addr->u.socket.u.q_unix.path);
>
> q_unix.path is NOT valid to access with TYPE_FD.
>
>
> With regards,
> Daniel
You're right, I should split the trace_cpr_transfer_input() call based on addr->u.socket.type,
since q_unix.path is not valid when the type is TYPE_FD.
- Jaehoon Kim.
On 6/10/2025 11:08 AM, Jaehoon Kim wrote:
> When the source VM attempts to connect to the destination VM's Unix
> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
> occur if the socket file isn't ready. This can lead to connection
> failures when running tests.
>
> This patch creates and listens on the socket in advance, and passes the
> pre-listened FD directly. This avoids timing issues and improves the
> reliability of CPR tests.
>
> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
>
> ---
> Changes since v1:
> - In v1, the patch added a wait loop to poll the existence of the socket
> file (cpr_validate_socket_path()).
>
> - This version instead creates the socket beforehand and passes its FD
> to the destination QEMU, eliminating the race condition entirely.
>
> - Commit title and message changed accordingly.
> ---
> migration/cpr-transfer.c | 3 +-
> tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
> 2 files changed, 72 insertions(+), 3 deletions(-)
>
> diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
> index e1f140359c..7c9de70bad 100644
> --- a/migration/cpr-transfer.c
> +++ b/migration/cpr-transfer.c
> @@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
> MigrationAddress *addr = channel->addr;
>
> if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
> - addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
> + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
> + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
Nice, I did not realize this would be so simple!
> g_autoptr(QIOChannelSocket) sioc = NULL;
> SocketAddress *saddr = &addr->u.socket;
> diff --git a/tests/qtest/migration/cpr-tests.c b/tests/qtest/migration/cpr-tests.c
> index 5536e14610..6f90160e21 100644
> --- a/tests/qtest/migration/cpr-tests.c
> +++ b/tests/qtest/migration/cpr-tests.c
> @@ -50,6 +50,51 @@ static void *test_mode_transfer_start(QTestState *from, QTestState *to)
> return NULL;
> }
>
> +/*
> + * Create a pre-listened UNIX domain socket at the specified path.
> + *
> + * This is used to eliminate a race condition that can occur
> + * intermittently in qtest during CPR tests. By pre-creating and
> + * listening on the socket, we avoid timing-related issues.
> + */
> +static int setup_socket_listener(const char *path)
> +{
> + struct sockaddr_un un;
> + size_t pathlen;
> + int sock_fd;
> +
> + sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
> + if (sock_fd < 0) {
> + g_test_message("Failed to create Unix socket");
> + return -1;
> + }
> +
> + pathlen = strlen(path);
> + if (pathlen >= sizeof(un.sun_path)) {
> + g_test_message("UNIX socket path '%s' is too long", path);
> + close(sock_fd);
> + return -1;
> + }
> +
> + memset(&un, 0, sizeof(un));
> + un.sun_family = AF_UNIX;
> + strncpy(un.sun_path, path, sizeof(un.sun_path) - 1);
> +
> + if (bind(sock_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
> + g_test_message("Failed to bind socket to %s", path);
> + close(sock_fd);
> + return -1;
> + }
> +
> + if (listen(sock_fd, 1) < 0) {
> + g_test_message("Failed to listen on socket %s", path);
> + close(sock_fd);
> + return -1;
> + }
> +
> + return sock_fd;
> +}
> +
Very nice code, but you can just use libqtest.c:qtest_socket_server().
That does not check for errors as nicely as you do, but none of its
actions should ever fail, unless the system is very starved for resources.
> /*
> * cpr-transfer mode cannot use the target monitor prior to starting the
> * migration, and cannot connect synchronously to the monitor, so defer
> @@ -60,13 +105,13 @@ static void test_mode_transfer_common(bool incoming_defer)
> g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
> g_autofree char *mig_path = g_strdup_printf("%s/migsocket", tmpfs);
> g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
> + g_autofree char *addr_type, *addr_key, *addr_value;
> + g_autofree char *opts_target;
>
> const char *opts = "-machine aux-ram-share=on -nodefaults";
> g_autofree const char *cpr_channel = g_strdup_printf(
> "cpr,addr.transport=socket,addr.type=unix,addr.path=%s",
> cpr_path);
> - g_autofree char *opts_target = g_strdup_printf("-incoming %s %s",
> - cpr_channel, opts);
>
> g_autofree char *connect_channels = g_strdup_printf(
> "[ { 'channel-type': 'main',"
> @@ -75,6 +120,29 @@ static void test_mode_transfer_common(bool incoming_defer)
> " 'path': '%s' } } ]",
> mig_path);
>
> + /*
> + * Determine socket address type and value.
> + * If socket creation fails, provide the socket path to the target,
> + * so it can create the Unix domain socket itself.
> + * Otherwise, use the pre-listened socket file descriptor directly.
> + */
> + int cpr_sockfd = setup_socket_listener(cpr_path);
unlink(cpr_path) after the listen socket is created.
g_assert(cpr_sockfd >= 0), then you can simplify the next block of code.
qtest_socket_server() will only fail if something is very wrong, in which
case trying to proceed with more tests is probably doomed.
- Steve
> +
> + if (cpr_sockfd < 0) {
> + addr_type = g_strdup("unix");
> + addr_key = g_strdup("path");
> + addr_value = g_strdup(cpr_path);
> + } else {
> + addr_type = g_strdup("fd");
> + addr_key = g_strdup("str");
> + addr_value = g_strdup_printf("%d", cpr_sockfd);
> + }
> +
> + opts_target = g_strdup_printf("-incoming cpr,addr.transport=socket,"
> + "addr.type=%s,addr.%s=%s %s",
> + addr_type, addr_key, addr_value, opts);
> +
> +
> MigrateCommon args = {
> .start.opts_source = opts,
> .start.opts_target = opts_target,
On 6/10/2025 11:57 AM, Steven Sistare wrote:
> On 6/10/2025 11:08 AM, Jaehoon Kim wrote:
>> When the source VM attempts to connect to the destination VM's Unix
>> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
>> occur if the socket file isn't ready. This can lead to connection
>> failures when running tests.
>>
>> This patch creates and listens on the socket in advance, and passes the
>> pre-listened FD directly. This avoids timing issues and improves the
>> reliability of CPR tests.
>>
>> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
>> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
>>
>> ---
>> Changes since v1:
>> - In v1, the patch added a wait loop to poll the existence of the socket
>> file (cpr_validate_socket_path()).
>>
>> - This version instead creates the socket beforehand and passes its FD
>> to the destination QEMU, eliminating the race condition entirely.
>>
>> - Commit title and message changed accordingly.
>> ---
>> migration/cpr-transfer.c | 3 +-
>> tests/qtest/migration/cpr-tests.c | 72 ++++++++++++++++++++++++++++++-
>> 2 files changed, 72 insertions(+), 3 deletions(-)
>>
>> diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
>> index e1f140359c..7c9de70bad 100644
>> --- a/migration/cpr-transfer.c
>> +++ b/migration/cpr-transfer.c
>> @@ -46,7 +46,8 @@ QEMUFile *cpr_transfer_input(MigrationChannel
>> *channel, Error **errp)
>> MigrationAddress *addr = channel->addr;
>> if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
>> - addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
>> + (addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX ||
>> + addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD)) {
>
> Nice, I did not realize this would be so simple!
>
>> g_autoptr(QIOChannelSocket) sioc = NULL;
>> SocketAddress *saddr = &addr->u.socket;
>> diff --git a/tests/qtest/migration/cpr-tests.c
>> b/tests/qtest/migration/cpr-tests.c
>> index 5536e14610..6f90160e21 100644
>> --- a/tests/qtest/migration/cpr-tests.c
>> +++ b/tests/qtest/migration/cpr-tests.c
>> @@ -50,6 +50,51 @@ static void *test_mode_transfer_start(QTestState
>> *from, QTestState *to)
>> return NULL;
>> }
>> +/*
>> + * Create a pre-listened UNIX domain socket at the specified path.
>> + *
>> + * This is used to eliminate a race condition that can occur
>> + * intermittently in qtest during CPR tests. By pre-creating and
>> + * listening on the socket, we avoid timing-related issues.
>> + */
>> +static int setup_socket_listener(const char *path)
>> +{
>> + struct sockaddr_un un;
>> + size_t pathlen;
>> + int sock_fd;
>> +
>> + sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
>> + if (sock_fd < 0) {
>> + g_test_message("Failed to create Unix socket");
>> + return -1;
>> + }
>> +
>> + pathlen = strlen(path);
>> + if (pathlen >= sizeof(un.sun_path)) {
>> + g_test_message("UNIX socket path '%s' is too long", path);
>> + close(sock_fd);
>> + return -1;
>> + }
>> +
>> + memset(&un, 0, sizeof(un));
>> + un.sun_family = AF_UNIX;
>> + strncpy(un.sun_path, path, sizeof(un.sun_path) - 1);
>> +
>> + if (bind(sock_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
>> + g_test_message("Failed to bind socket to %s", path);
>> + close(sock_fd);
>> + return -1;
>> + }
>> +
>> + if (listen(sock_fd, 1) < 0) {
>> + g_test_message("Failed to listen on socket %s", path);
>> + close(sock_fd);
>> + return -1;
>> + }
>> +
>> + return sock_fd;
>> +}
>> +
>
> Very nice code, but you can just use libqtest.c:qtest_socket_server().
> That does not check for errors as nicely as you do, but none of its
> actions should ever fail, unless the system is very starved for
> resources.
>
Yes, as you suggested, using qtest_socket_server() makes the code simpler.
Thank you very much!
>> /*
>> * cpr-transfer mode cannot use the target monitor prior to
>> starting the
>> * migration, and cannot connect synchronously to the monitor, so
>> defer
>> @@ -60,13 +105,13 @@ static void test_mode_transfer_common(bool
>> incoming_defer)
>> g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
>> g_autofree char *mig_path = g_strdup_printf("%s/migsocket",
>> tmpfs);
>> g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
>> + g_autofree char *addr_type, *addr_key, *addr_value;
>> + g_autofree char *opts_target;
>> const char *opts = "-machine aux-ram-share=on -nodefaults";
>> g_autofree const char *cpr_channel = g_strdup_printf(
>> "cpr,addr.transport=socket,addr.type=unix,addr.path=%s",
>> cpr_path);
>> - g_autofree char *opts_target = g_strdup_printf("-incoming %s %s",
>> - cpr_channel, opts);
>> g_autofree char *connect_channels = g_strdup_printf(
>> "[ { 'channel-type': 'main',"
>> @@ -75,6 +120,29 @@ static void test_mode_transfer_common(bool
>> incoming_defer)
>> " 'path': '%s' } } ]",
>> mig_path);
>> + /*
>> + * Determine socket address type and value.
>> + * If socket creation fails, provide the socket path to the target,
>> + * so it can create the Unix domain socket itself.
>> + * Otherwise, use the pre-listened socket file descriptor directly.
>> + */
>> + int cpr_sockfd = setup_socket_listener(cpr_path);
>
> unlink(cpr_path) after the listen socket is created.
>
> g_assert(cpr_sockfd >= 0), then you can simplify the next block of code.
> qtest_socket_server() will only fail if something is very wrong, in which
> case trying to proceed with more tests is probably doomed.
>
> - Steve
>
Actually, I had considered exiting the process if socket creation failed, but ended up implementing it this way instead.
I'll revise the patch to follow your suggestion and make it terminate on failure.
- Jaehoon Kim
>> +
>> + if (cpr_sockfd < 0) {
>> + addr_type = g_strdup("unix");
>> + addr_key = g_strdup("path");
>> + addr_value = g_strdup(cpr_path);
>> + } else {
>> + addr_type = g_strdup("fd");
>> + addr_key = g_strdup("str");
>> + addr_value = g_strdup_printf("%d", cpr_sockfd);
>> + }
>> +
>> + opts_target = g_strdup_printf("-incoming
>> cpr,addr.transport=socket,"
>> + "addr.type=%s,addr.%s=%s %s",
>> + addr_type, addr_key, addr_value,
>> opts);
>> +
>> +
>> MigrateCommon args = {
>> .start.opts_source = opts,
>> .start.opts_target = opts_target,
>
>
On Tue, Jun 10, 2025 at 10:08:49AM -0500, Jaehoon Kim wrote:
> When the source VM attempts to connect to the destination VM's Unix
> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
> occur if the socket file isn't ready. This can lead to connection
> failures when running tests.
>
> This patch creates and listens on the socket in advance, and passes the
> pre-listened FD directly. This avoids timing issues and improves the
> reliability of CPR tests.
>
> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
One quick comment while we can wait for others to look at the details: when
it involves both qemu and tests changes, please consider splitting that
into two patches. The test patch can be prefixed with "tests/migration:".
Thanks,
--
Peter Xu
On 6/10/2025 11:02 AM, Peter Xu wrote:
> On Tue, Jun 10, 2025 at 10:08:49AM -0500, Jaehoon Kim wrote:
>> When the source VM attempts to connect to the destination VM's Unix
>> domain socket (cpr.sock) during a cpr-transfer test, race conditions can
>> occur if the socket file isn't ready. This can lead to connection
>> failures when running tests.
>>
>> This patch creates and listens on the socket in advance, and passes the
>> pre-listened FD directly. This avoids timing issues and improves the
>> reliability of CPR tests.
>>
>> Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
>> Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
> One quick comment while we can wait for others to look at the details: when
> it involves both qemu and tests changes, please consider splitting that
> into two patches. The test patch can be prefixed with "tests/migration:".
>
> Thanks,
Thank you for your suggestion.
I'll split the patch into two separate patches and submit an updated v3 version.
- Jaehoon Kim
© 2016 - 2025 Red Hat, Inc.