Add a new test to ensure that when the transport changes a null pointer
dereference does not occur[1].
Note that this test does not fail, but it may hang on the client side if
it triggers a kernel oops.
This works by creating a socket, trying to connect to a server, and then
executing a second connect operation on the same socket but to a
different CID (0). This triggers a transport change. If the connect
operation is interrupted by a signal, this could cause a null-ptr-deref.
Since this bug is non-deterministic, we need to try several times. It
is safe to assume that the bug will show up within the timeout period.
If there is a G2H transport loaded in the system, the bug is not
triggered and this test will always pass.
[1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
Suggested-by: Michal Luczaj <mhal@rbox.co>
Signed-off-by: Luigi Leonardi <leonardi@redhat.com>
---
tools/testing/vsock/Makefile | 1 +
tools/testing/vsock/vsock_test.c | 80 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 81 insertions(+)
diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 6e0b4e95e230500f99bb9c74350701a037ecd198..88211fd132d23ecdfd56ab0815580a237889e7f2 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -5,6 +5,7 @@ vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o msg_ze
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o msg_zerocopy_common.o
+vsock_test: LDLIBS = -lpthread
vsock_uring_test: LDLIBS = -luring
vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o msg_zerocopy_common.o
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index d0f6d253ac72d08a957cb81a3c38fcc72bec5a53..1e00cb6e117859d5c18fb3e52a574444b5489173 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -23,6 +23,7 @@
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <linux/time64.h>
+#include <pthread.h>
#include "vsock_test_zerocopy.h"
#include "timeout.h"
@@ -1788,6 +1789,80 @@ static void test_stream_connect_retry_server(const struct test_opts *opts)
close(fd);
}
+static void *test_transport_change_thread(void *vargp)
+{
+ pid_t *t = (pid_t *)vargp;
+
+ //We want this thread to terminate as soon as possible
+ pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+
+ while (true)
+ kill(*t, SIGUSR1);
+ return NULL;
+}
+
+static void test_transport_change_signal_handler(int signal)
+{
+}
+
+static void test_transport_change_client(const struct test_opts *opts)
+{
+ __sighandler_t old_handler;
+ pid_t pid = getpid();
+ pthread_t thread_id;
+
+ old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
+
+ pthread_create(&thread_id, NULL, test_transport_change_thread, &pid);
+
+ timeout_begin(TIMEOUT);
+
+ while (true) {
+ struct sockaddr_vm sa = {
+ .svm_family = AF_VSOCK,
+ .svm_cid = opts->peer_cid,
+ .svm_port = opts->peer_port,
+ };
+
+ int s = socket(AF_VSOCK, SOCK_STREAM, 0);
+
+ connect(s, (struct sockaddr *)&sa, sizeof(sa));
+
+ sa.svm_cid = 0;
+ connect(s, (struct sockaddr *)&sa, sizeof(sa));
+
+ close(s);
+
+ if (timeout_check_expired())
+ break;
+ }
+
+ timeout_end();
+
+ pthread_cancel(thread_id);
+ //Wait for the thread to terminate
+ pthread_join(thread_id, NULL);
+ //Restore the old handler
+ signal(SIGUSR1, old_handler);
+}
+
+static void test_transport_change_server(const struct test_opts *opts)
+{
+ timeout_begin(TIMEOUT);
+
+ while (true) {
+ int s;
+
+ s = vsock_stream_listen(opts->peer_cid, opts->peer_port);
+ close(s);
+
+ if (timeout_check_expired())
+ break;
+ }
+
+ timeout_end();
+}
+
static void test_stream_linger_client(const struct test_opts *opts)
{
struct linger optval = {
@@ -1984,6 +2059,11 @@ static struct test_case test_cases[] = {
.run_client = test_stream_linger_client,
.run_server = test_stream_linger_server,
},
+ {
+ .name = "SOCK_STREAM transport change null-ptr-deref",
+ .run_client = test_transport_change_client,
+ .run_server = test_transport_change_server,
+ },
{},
};
--
2.48.1
On 3/6/25 17:09, Luigi Leonardi wrote:
> Add a new test to ensure that when the transport changes a null pointer
> dereference does not occur[1].
>
> Note that this test does not fail, but it may hang on the client side if
> it triggers a kernel oops.
>
> This works by creating a socket, trying to connect to a server, and then
> executing a second connect operation on the same socket but to a
> different CID (0). This triggers a transport change. If the connect
> operation is interrupted by a signal, this could cause a null-ptr-deref.
>
> Since this bug is non-deterministic, we need to try several times. It
> is safe to assume that the bug will show up within the timeout period.
>
> If there is a G2H transport loaded in the system, the bug is not
> triggered and this test will always pass.
>
> [1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
>
> Suggested-by: Michal Luczaj <mhal@rbox.co>
> Signed-off-by: Luigi Leonardi <leonardi@redhat.com>
> ---

I think the credit should be given to Hyunwoo Kim, not me.

Thanks though,
Michal
On Thu, Mar 06, 2025 at 05:09:33PM +0100, Luigi Leonardi wrote:
>Add a new test to ensure that when the transport changes a null pointer
>dereference does not occur[1].
>
>Note that this test does not fail, but it may hang on the client side if
>it triggers a kernel oops.
>
>This works by creating a socket, trying to connect to a server, and then
>executing a second connect operation on the same socket but to a
>different CID (0). This triggers a transport change. If the connect
>operation is interrupted by a signal, this could cause a null-ptr-deref.
>
>Since this bug is non-deterministic, we need to try several times. It
>is safe to assume that the bug will show up within the timeout period.
>
>If there is a G2H transport loaded in the system, the bug is not
>triggered and this test will always pass.
>
>[1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
>
>Suggested-by: Michal Luczaj <mhal@rbox.co>
>Signed-off-by: Luigi Leonardi <leonardi@redhat.com>
>---
> tools/testing/vsock/Makefile | 1 +
> tools/testing/vsock/vsock_test.c | 80 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 81 insertions(+)
>
>diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
>index 6e0b4e95e230500f99bb9c74350701a037ecd198..88211fd132d23ecdfd56ab0815580a237889e7f2 100644
>--- a/tools/testing/vsock/Makefile
>+++ b/tools/testing/vsock/Makefile
>@@ -5,6 +5,7 @@ vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o msg_ze
> vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
> vsock_perf: vsock_perf.o msg_zerocopy_common.o
>
>+vsock_test: LDLIBS = -lpthread
> vsock_uring_test: LDLIBS = -luring
> vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o msg_zerocopy_common.o
>
>diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
>index d0f6d253ac72d08a957cb81a3c38fcc72bec5a53..1e00cb6e117859d5c18fb3e52a574444b5489173 100644
>--- a/tools/testing/vsock/vsock_test.c
>+++ b/tools/testing/vsock/vsock_test.c
>@@ -23,6 +23,7 @@
> #include <sys/ioctl.h>
> #include <linux/sockios.h>
> #include <linux/time64.h>
>+#include <pthread.h>
>
> #include "vsock_test_zerocopy.h"
> #include "timeout.h"
>@@ -1788,6 +1789,80 @@ static void test_stream_connect_retry_server(const struct test_opts *opts)
> close(fd);
> }
>
>+static void *test_transport_change_thread(void *vargp)
>+{
>+ pid_t *t = (pid_t *)vargp;
>+
>+ //We want this thread to terminate as soon as possible
Please follow the style in this file; we use /* Something ... */ for
comments.
>+ pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
>+
>+ while (true)
>+ kill(*t, SIGUSR1);
Should we check the return value of kill()?
>+ return NULL;
>+}
>+
>+static void test_transport_change_signal_handler(int signal)
>+{
Can you put a comment here to explain why it's empty?
>+}
>+
>+static void test_transport_change_client(const struct test_opts *opts)
>+{
>+ __sighandler_t old_handler;
>+ pid_t pid = getpid();
>+ pthread_t thread_id;
>+
>+ old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
>+
>+ pthread_create(&thread_id, NULL, test_transport_change_thread, &pid);
>+
>+ timeout_begin(TIMEOUT);
>+
>+ while (true) {
What about `while (!timeout_check_expired()) {` here?
>+ struct sockaddr_vm sa = {
>+ .svm_family = AF_VSOCK,
>+ .svm_cid = opts->peer_cid,
>+ .svm_port = opts->peer_port,
>+ };
>+
>+ int s = socket(AF_VSOCK, SOCK_STREAM, 0);
Please check that `s` is a valid file descriptor.
>+
>+ connect(s, (struct sockaddr *)&sa, sizeof(sa));
>+
Maybe I'd add a comment here to explain why we are setting
`svm_cid = 0`.
>+ sa.svm_cid = 0;
>+ connect(s, (struct sockaddr *)&sa, sizeof(sa));
>+
>+ close(s);
>+
>+ if (timeout_check_expired())
>+ break;
>+ }
>+
>+ timeout_end();
>+
>+ pthread_cancel(thread_id);
>+ //Wait for the thread to terminate
>+ pthread_join(thread_id, NULL);
Please check return values and fix the comment style.
>+ //Restore the old handler
>+ signal(SIGUSR1, old_handler);
>+}
>+
>+static void test_transport_change_server(const struct test_opts *opts)
>+{
>+ timeout_begin(TIMEOUT);
Instead of using timeout_begin(), etc. on both sides, can we do
something similar to what we did in test_stream_leak_acceptq_client()
and test_stream_leak_acceptq_server()?
>+
>+ while (true) {
>+ int s;
>+
>+ s = vsock_stream_listen(opts->peer_cid, opts->peer_port);
>+ close(s);
>+
>+ if (timeout_check_expired())
>+ break;
>+ }
>+
>+ timeout_end();
>+}
>+
> static void test_stream_linger_client(const struct test_opts *opts)
> {
> struct linger optval = {
>@@ -1984,6 +2059,11 @@ static struct test_case test_cases[] = {
> .run_client = test_stream_linger_client,
> .run_server = test_stream_linger_server,
> },
>+ {
>+ .name = "SOCK_STREAM transport change null-ptr-deref",
>+ .run_client = test_transport_change_client,
>+ .run_server = test_transport_change_server,
Following the other tests, I'd name these `test_stream_transport_change...`.
Thanks,
Stefano
>+ },
> {},
> };
>
>
>--
>2.48.1
>
© 2016 - 2026 Red Hat, Inc.