[PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes

Luigi Leonardi posted 1 patch 9 months, 1 week ago
There is a newer version of this series
tools/testing/vsock/Makefile     |   1 +
tools/testing/vsock/vsock_test.c | 101 +++++++++++++++++++++++++++++++++++++++
2 files changed, 102 insertions(+)
[PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Luigi Leonardi 9 months, 1 week ago
Add a new test to ensure that when the transport changes a null pointer
dereference does not occur[1].

Note that this test does not fail, but it may hang on the client side if
it triggers a kernel oops.

This works by creating a socket, trying to connect to a server, and then
executing a second connect operation on the same socket but to a
different CID (0). This triggers a transport change. If the connect
operation is interrupted by a signal, this could cause a null-ptr-deref.

Since this bug is non-deterministic, we need to try several times. It
is safe to assume that the bug will show up within the timeout period.

If there is a G2H transport loaded in the system, the bug is not
triggered and this test will always pass.

[1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/

Suggested-by: Hyunwoo Kim <v4bel@theori.io>
Suggested-by: Michal Luczaj <mhal@rbox.co>
Signed-off-by: Luigi Leonardi <leonardi@redhat.com>
---
This series introduces a new test that checks for a null pointer 
dereference that may happen when there is a transport change[1]. This 
bug was fixed in [2].

Note that this test *cannot* fail, it hangs if it triggers a kernel
oops. The intended use-case is to run it and then check if there is any 
oops in the dmesg.

This test is based on Hyunwoo Kim's[3] and Michal's python 
reproducers[4].

[1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
[2]https://lore.kernel.org/netdev/20250110083511.30419-1-sgarzare@redhat.com/
[3]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/#t
[4]https://lore.kernel.org/netdev/2b3062e3-bdaa-4c94-a3c0-2930595b9670@rbox.co/
---
Changes in v2:
- Addressed Stefano's comments:
    - Timeout is now using current_nsec()
    - Check for return values
    - Style issues
- Added Hyunwoo Kim to Suggested-by
- Link to v1: https://lore.kernel.org/r/20250306-test_vsock-v1-0-0320b5accf92@redhat.com
---
 tools/testing/vsock/Makefile     |   1 +
 tools/testing/vsock/vsock_test.c | 101 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 6e0b4e95e230500f99bb9c74350701a037ecd198..88211fd132d23ecdfd56ab0815580a237889e7f2 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -5,6 +5,7 @@ vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o msg_ze
 vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
 vsock_perf: vsock_perf.o msg_zerocopy_common.o
 
+vsock_test: LDLIBS = -lpthread
 vsock_uring_test: LDLIBS = -luring
 vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o msg_zerocopy_common.o
 
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index d0f6d253ac72d08a957cb81a3c38fcc72bec5a53..d2820a67403c95bc4a7e7a16113ae2f6137b4c73 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -23,6 +23,7 @@
 #include <sys/ioctl.h>
 #include <linux/sockios.h>
 #include <linux/time64.h>
+#include <pthread.h>
 
 #include "vsock_test_zerocopy.h"
 #include "timeout.h"
@@ -1788,6 +1789,101 @@ static void test_stream_connect_retry_server(const struct test_opts *opts)
 	close(fd);
 }
 
+static void *test_stream_transport_change_thread(void *vargp)
+{
+	pid_t *pid = (pid_t *)vargp;
+
+	/* We want this thread to terminate as soon as possible */
+	if (pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL)) {
+		perror("pthread_setcanceltype");
+		exit(EXIT_FAILURE);
+	}
+
+	while (true) {
+		if (kill(*pid, SIGUSR1) < 0) {
+			perror("kill");
+			exit(EXIT_FAILURE);
+		}
+	}
+	return NULL;
+}
+
+static void test_transport_change_signal_handler(int signal)
+{
+	/* We need a custom handler for SIGUSR1 as the default one terminates the process. */
+}
+
+static void test_stream_transport_change_client(const struct test_opts *opts)
+{
+	__sighandler_t old_handler;
+	pid_t pid = getpid();
+	pthread_t thread_id;
+	time_t tout;
+
+	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
+	if (old_handler == SIG_ERR) {
+		perror("signal");
+		exit(EXIT_FAILURE);
+	}
+
+	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
+		perror("pthread_create");
+		exit(EXIT_FAILURE);
+	}
+
+	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
+	do {
+		struct sockaddr_vm sa = {
+			.svm_family = AF_VSOCK,
+			.svm_cid = opts->peer_cid,
+			.svm_port = opts->peer_port,
+		};
+		int s;
+
+		s = socket(AF_VSOCK, SOCK_STREAM, 0);
+		if (s < 0) {
+			perror("socket");
+			exit(EXIT_FAILURE);
+		}
+
+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		/* Set CID to 0 cause a transport change. */
+		sa.svm_cid = 0;
+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		close(s);
+	} while (current_nsec() < tout);
+
+	if (pthread_cancel(thread_id)) {
+		perror("pthread_cancel");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Wait for the thread to terminate */
+	if (pthread_join(thread_id, NULL)) {
+		perror("pthread_join");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Restore the old handler */
+	if (signal(SIGUSR1, old_handler) == SIG_ERR) {
+		perror("signal");
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void test_stream_transport_change_server(const struct test_opts *opts)
+{
+	time_t tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
+
+	do {
+		int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
+
+		close(s);
+	} while (current_nsec() < tout);
+}
+
 static void test_stream_linger_client(const struct test_opts *opts)
 {
 	struct linger optval = {
@@ -1984,6 +2080,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_linger_client,
 		.run_server = test_stream_linger_server,
 	},
+	{
+		.name = "SOCK_STREAM transport change null-ptr-deref",
+		.run_client = test_stream_transport_change_client,
+		.run_server = test_stream_transport_change_server,
+	},
 	{},
 };
 

---
base-commit: 4d872d51bc9d7b899c1f61534e3dbde72613f627
change-id: 20250306-test_vsock-3e77a9c7a245

Best regards,
-- 
Luigi Leonardi <leonardi@redhat.com>
Re: [PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Michal Luczaj 9 months ago
On 3/14/25 10:27, Luigi Leonardi wrote:
> Add a new test to ensure that when the transport changes a null pointer
> dereference does not occur[1].
> 
> Note that this test does not fail, but it may hang on the client side if
> it triggers a kernel oops.
> 
> This works by creating a socket, trying to connect to a server, and then
> executing a second connect operation on the same socket but to a
> different CID (0). This triggers a transport change. If the connect
> operation is interrupted by a signal, this could cause a null-ptr-deref.

Just to be clear: that's the splat, right?

Oops: general protection fault, probably for non-canonical address 0xdffffc000000000c: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000060-0x0000000000000067]
CPU: 2 UID: 0 PID: 463 Comm: kworker/2:3 Not tainted
Workqueue: vsock-loopback vsock_loopback_work
RIP: 0010:vsock_stream_has_data+0x44/0x70
Call Trace:
 virtio_transport_do_close+0x68/0x1a0
 virtio_transport_recv_pkt+0x1045/0x2ae4
 vsock_loopback_work+0x27d/0x3f0
 process_one_work+0x846/0x1420
 worker_thread+0x5b3/0xf80
 kthread+0x35a/0x700
 ret_from_fork+0x2d/0x70
 ret_from_fork_asm+0x1a/0x30

> ...
> +static void test_stream_transport_change_client(const struct test_opts *opts)
> +{
> +	__sighandler_t old_handler;
> +	pid_t pid = getpid();
> +	pthread_t thread_id;
> +	time_t tout;
> +
> +	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
> +	if (old_handler == SIG_ERR) {
> +		perror("signal");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
> +		perror("pthread_create");

Does pthread_create() set errno on failure?

> +		exit(EXIT_FAILURE);
> +	}
> +
> +	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;

Isn't 10 seconds a bit excessive? I see the oops pretty much immediately.

> +	do {
> +		struct sockaddr_vm sa = {
> +			.svm_family = AF_VSOCK,
> +			.svm_cid = opts->peer_cid,
> +			.svm_port = opts->peer_port,
> +		};
> +		int s;
> +
> +		s = socket(AF_VSOCK, SOCK_STREAM, 0);
> +		if (s < 0) {
> +			perror("socket");
> +			exit(EXIT_FAILURE);
> +		}
> +
> +		connect(s, (struct sockaddr *)&sa, sizeof(sa));
> +
> +		/* Set CID to 0 cause a transport change. */
> +		sa.svm_cid = 0;
> +		connect(s, (struct sockaddr *)&sa, sizeof(sa));
> +
> +		close(s);
> +	} while (current_nsec() < tout);
> +
> +	if (pthread_cancel(thread_id)) {
> +		perror("pthread_cancel");

And errno here.

> +		exit(EXIT_FAILURE);
> +	}
> +
> +	/* Wait for the thread to terminate */
> +	if (pthread_join(thread_id, NULL)) {
> +		perror("pthread_join");

And here.
Aaand I've realized I've made exactly the same mistake elsewhere :)

> ...
> +static void test_stream_transport_change_server(const struct test_opts *opts)
> +{
> +	time_t tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
> +
> +	do {
> +		int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
> +
> +		close(s);
> +	} while (current_nsec() < tout);
> +}

I'm not certain you need to re-create the listener or measure the time
here. What about something like

	int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
	control_expectln("DONE");
	close(s);

Thanks,
Michal
Re: [PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Luigi Leonardi 8 months, 3 weeks ago
Hi Michal,

On Wed, Mar 19, 2025 at 01:27:35AM +0100, Michal Luczaj wrote:
>On 3/14/25 10:27, Luigi Leonardi wrote:
>> Add a new test to ensure that when the transport changes a null pointer
>> dereference does not occur[1].
>>
>> Note that this test does not fail, but it may hang on the client side if
>> it triggers a kernel oops.
>>
>> This works by creating a socket, trying to connect to a server, and then
>> executing a second connect operation on the same socket but to a
>> different CID (0). This triggers a transport change. If the connect
>> operation is interrupted by a signal, this could cause a null-ptr-deref.
>
>Just to be clear: that's the splat, right?
>
>Oops: general protection fault, probably for non-canonical address 0xdffffc000000000c: 0000 [#1] PREEMPT SMP KASAN NOPTI
>KASAN: null-ptr-deref in range [0x0000000000000060-0x0000000000000067]
>CPU: 2 UID: 0 PID: 463 Comm: kworker/2:3 Not tainted
>Workqueue: vsock-loopback vsock_loopback_work
>RIP: 0010:vsock_stream_has_data+0x44/0x70
>Call Trace:
> virtio_transport_do_close+0x68/0x1a0
> virtio_transport_recv_pkt+0x1045/0x2ae4
> vsock_loopback_work+0x27d/0x3f0
> process_one_work+0x846/0x1420
> worker_thread+0x5b3/0xf80
> kthread+0x35a/0x700
> ret_from_fork+0x2d/0x70
> ret_from_fork_asm+0x1a/0x30
>

Yep! I'll add it to the commit message in v3.
>> ...
>> +static void test_stream_transport_change_client(const struct test_opts *opts)
>> +{
>> +	__sighandler_t old_handler;
>> +	pid_t pid = getpid();
>> +	pthread_t thread_id;
>> +	time_t tout;
>> +
>> +	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
>> +	if (old_handler == SIG_ERR) {
>> +		perror("signal");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
>> +		perror("pthread_create");
>
>Does pthread_create() set errno on failure?
It does not, very good catch!
>
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>
>Isn't 10 seconds a bit excessive? I see the oops pretty much immediately.
Yeah it's probably excessive. I used because it's the default timeout 
value.
>
>> +	do {
>> +		struct sockaddr_vm sa = {
>> +			.svm_family = AF_VSOCK,
>> +			.svm_cid = opts->peer_cid,
>> +			.svm_port = opts->peer_port,
>> +		};
>> +		int s;
>> +
>> +		s = socket(AF_VSOCK, SOCK_STREAM, 0);
>> +		if (s < 0) {
>> +			perror("socket");
>> +			exit(EXIT_FAILURE);
>> +		}
>> +
>> +		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>> +
>> +		/* Set CID to 0 cause a transport change. */
>> +		sa.svm_cid = 0;
>> +		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>> +
>> +		close(s);
>> +	} while (current_nsec() < tout);
>> +
>> +	if (pthread_cancel(thread_id)) {
>> +		perror("pthread_cancel");
>
>And errno here.
>
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	/* Wait for the thread to terminate */
>> +	if (pthread_join(thread_id, NULL)) {
>> +		perror("pthread_join");
>
>And here.
>Aaand I've realized I've made exactly the same mistake elsewhere :)
>
>> ...
>> +static void test_stream_transport_change_server(const struct test_opts *opts)
>> +{
>> +	time_t tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>> +
>> +	do {
>> +		int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>> +
>> +		close(s);
>> +	} while (current_nsec() < tout);
>> +}
>
>I'm not certain you need to re-create the listener or measure the time
>here. What about something like
>
>	int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>	control_expectln("DONE");
>	close(s);
>
Just tried and it triggers the oops :)

>Thanks,
>Michal
>

Thanks for the review!

Luigi
Re: [PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Stefano Garzarella 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 04:14:20PM +0100, Luigi Leonardi wrote:
>Hi Michal,
>
>On Wed, Mar 19, 2025 at 01:27:35AM +0100, Michal Luczaj wrote:
>>On 3/14/25 10:27, Luigi Leonardi wrote:
>>>Add a new test to ensure that when the transport changes a null pointer
>>>dereference does not occur[1].
>>>
>>>Note that this test does not fail, but it may hang on the client side if
>>>it triggers a kernel oops.
>>>
>>>This works by creating a socket, trying to connect to a server, and then
>>>executing a second connect operation on the same socket but to a
>>>different CID (0). This triggers a transport change. If the connect
>>>operation is interrupted by a signal, this could cause a null-ptr-deref.
>>
>>Just to be clear: that's the splat, right?
>>
>>Oops: general protection fault, probably for non-canonical address 0xdffffc000000000c: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>KASAN: null-ptr-deref in range [0x0000000000000060-0x0000000000000067]
>>CPU: 2 UID: 0 PID: 463 Comm: kworker/2:3 Not tainted
>>Workqueue: vsock-loopback vsock_loopback_work
>>RIP: 0010:vsock_stream_has_data+0x44/0x70
>>Call Trace:
>>virtio_transport_do_close+0x68/0x1a0
>>virtio_transport_recv_pkt+0x1045/0x2ae4
>>vsock_loopback_work+0x27d/0x3f0
>>process_one_work+0x846/0x1420
>>worker_thread+0x5b3/0xf80
>>kthread+0x35a/0x700
>>ret_from_fork+0x2d/0x70
>>ret_from_fork_asm+0x1a/0x30
>>
>
>Yep! I'll add it to the commit message in v3.
>>>...
>>>+static void test_stream_transport_change_client(const struct test_opts *opts)
>>>+{
>>>+	__sighandler_t old_handler;
>>>+	pid_t pid = getpid();
>>>+	pthread_t thread_id;
>>>+	time_t tout;
>>>+
>>>+	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
>>>+	if (old_handler == SIG_ERR) {
>>>+		perror("signal");
>>>+		exit(EXIT_FAILURE);
>>>+	}
>>>+
>>>+	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
>>>+		perror("pthread_create");
>>
>>Does pthread_create() set errno on failure?
>It does not, very good catch!
>>
>>>+		exit(EXIT_FAILURE);
>>>+	}
>>>+
>>>+	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>>
>>Isn't 10 seconds a bit excessive? I see the oops pretty much immediately.
>Yeah it's probably excessive. I used because it's the default timeout 
>value.
>>
>>>+	do {
>>>+		struct sockaddr_vm sa = {
>>>+			.svm_family = AF_VSOCK,
>>>+			.svm_cid = opts->peer_cid,
>>>+			.svm_port = opts->peer_port,
>>>+		};
>>>+		int s;
>>>+
>>>+		s = socket(AF_VSOCK, SOCK_STREAM, 0);
>>>+		if (s < 0) {
>>>+			perror("socket");
>>>+			exit(EXIT_FAILURE);
>>>+		}
>>>+
>>>+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>>>+
>>>+		/* Set CID to 0 cause a transport change. */
>>>+		sa.svm_cid = 0;
>>>+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>>>+
>>>+		close(s);
>>>+	} while (current_nsec() < tout);
>>>+
>>>+	if (pthread_cancel(thread_id)) {
>>>+		perror("pthread_cancel");
>>
>>And errno here.
>>
>>>+		exit(EXIT_FAILURE);
>>>+	}
>>>+
>>>+	/* Wait for the thread to terminate */
>>>+	if (pthread_join(thread_id, NULL)) {
>>>+		perror("pthread_join");
>>
>>And here.
>>Aaand I've realized I've made exactly the same mistake elsewhere :)
>>
>>>...
>>>+static void test_stream_transport_change_server(const struct test_opts *opts)
>>>+{
>>>+	time_t tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>>>+
>>>+	do {
>>>+		int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>>>+
>>>+		close(s);
>>>+	} while (current_nsec() < tout);
>>>+}
>>
>>I'm not certain you need to re-create the listener or measure the time
>>here. What about something like
>>
>>	int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>>	control_expectln("DONE");
>>	close(s);
>>
>Just tried and it triggers the oops :)

If this works (as I also initially thought), we should check the result 
of the first connect() in the client code. It can succeed or fail with 
-EINTR, in other cases we should report an error because it is not 
expected.

And we should check also the second connect(), it should always fail, 
right?

For this I think you need another sync point to be sure the server is 
listening before try to connect the first time:

client:
     // pthread_create, etc.

     control_expectln("LISTENING");

     do {
         ...
     } while();

     control_writeln("DONE");

server:
     int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
     control_writeln("LISTENING");
     control_expectln("DONE");
     close(s);

Thanks,
Stefano
Re: [PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Stefano Garzarella 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 05:21:03PM +0100, Stefano Garzarella wrote:
>On Wed, Mar 26, 2025 at 04:14:20PM +0100, Luigi Leonardi wrote:
>>Hi Michal,
>>
>>On Wed, Mar 19, 2025 at 01:27:35AM +0100, Michal Luczaj wrote:
>>>On 3/14/25 10:27, Luigi Leonardi wrote:
>>>>Add a new test to ensure that when the transport changes a null pointer
>>>>dereference does not occur[1].
>>>>
>>>>Note that this test does not fail, but it may hang on the client side if
>>>>it triggers a kernel oops.
>>>>
>>>>This works by creating a socket, trying to connect to a server, and then
>>>>executing a second connect operation on the same socket but to a
>>>>different CID (0). This triggers a transport change. If the connect
>>>>operation is interrupted by a signal, this could cause a null-ptr-deref.
>>>
>>>Just to be clear: that's the splat, right?
>>>
>>>Oops: general protection fault, probably for non-canonical address 0xdffffc000000000c: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>>KASAN: null-ptr-deref in range [0x0000000000000060-0x0000000000000067]
>>>CPU: 2 UID: 0 PID: 463 Comm: kworker/2:3 Not tainted
>>>Workqueue: vsock-loopback vsock_loopback_work
>>>RIP: 0010:vsock_stream_has_data+0x44/0x70
>>>Call Trace:
>>>virtio_transport_do_close+0x68/0x1a0
>>>virtio_transport_recv_pkt+0x1045/0x2ae4
>>>vsock_loopback_work+0x27d/0x3f0
>>>process_one_work+0x846/0x1420
>>>worker_thread+0x5b3/0xf80
>>>kthread+0x35a/0x700
>>>ret_from_fork+0x2d/0x70
>>>ret_from_fork_asm+0x1a/0x30
>>>
>>
>>Yep! I'll add it to the commit message in v3.
>>>>...
>>>>+static void test_stream_transport_change_client(const struct test_opts *opts)
>>>>+{
>>>>+	__sighandler_t old_handler;
>>>>+	pid_t pid = getpid();
>>>>+	pthread_t thread_id;
>>>>+	time_t tout;
>>>>+
>>>>+	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
>>>>+	if (old_handler == SIG_ERR) {
>>>>+		perror("signal");
>>>>+		exit(EXIT_FAILURE);
>>>>+	}
>>>>+
>>>>+	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
>>>>+		perror("pthread_create");
>>>
>>>Does pthread_create() set errno on failure?
>>It does not, very good catch!
>>>
>>>>+		exit(EXIT_FAILURE);
>>>>+	}
>>>>+
>>>>+	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>>>
>>>Isn't 10 seconds a bit excessive? I see the oops pretty much immediately.
>>Yeah it's probably excessive. I used because it's the default 
>>timeout value.
>>>
>>>>+	do {
>>>>+		struct sockaddr_vm sa = {
>>>>+			.svm_family = AF_VSOCK,
>>>>+			.svm_cid = opts->peer_cid,
>>>>+			.svm_port = opts->peer_port,
>>>>+		};
>>>>+		int s;
>>>>+
>>>>+		s = socket(AF_VSOCK, SOCK_STREAM, 0);
>>>>+		if (s < 0) {
>>>>+			perror("socket");
>>>>+			exit(EXIT_FAILURE);
>>>>+		}
>>>>+
>>>>+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>>>>+
>>>>+		/* Set CID to 0 cause a transport change. */
>>>>+		sa.svm_cid = 0;
>>>>+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>>>>+
>>>>+		close(s);
>>>>+	} while (current_nsec() < tout);
>>>>+
>>>>+	if (pthread_cancel(thread_id)) {
>>>>+		perror("pthread_cancel");
>>>
>>>And errno here.
>>>
>>>>+		exit(EXIT_FAILURE);
>>>>+	}
>>>>+
>>>>+	/* Wait for the thread to terminate */
>>>>+	if (pthread_join(thread_id, NULL)) {
>>>>+		perror("pthread_join");
>>>
>>>And here.
>>>Aaand I've realized I've made exactly the same mistake elsewhere :)
>>>
>>>>...
>>>>+static void test_stream_transport_change_server(const struct test_opts *opts)
>>>>+{
>>>>+	time_t tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>>>>+
>>>>+	do {
>>>>+		int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>>>>+
>>>>+		close(s);
>>>>+	} while (current_nsec() < tout);
>>>>+}
>>>
>>>I'm not certain you need to re-create the listener or measure the time
>>>here. What about something like
>>>
>>>	int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>>>	control_expectln("DONE");
>>>	close(s);
>>>
>>Just tried and it triggers the oops :)
>
>If this works (as I also initially thought), we should check the 
>result of the first connect() in the client code. It can succeed or 
>fail with -EINTR, in other cases we should report an error because it 
>is not expected.
>
>And we should check also the second connect(), it should always fail, 
>right?
>
>For this I think you need another sync point to be sure the server is 
>listening before try to connect the first time:
>
>client:
>    // pthread_create, etc.
>
>    control_expectln("LISTENING");
>
>    do {
>        ...
>    } while();
>
>    control_writeln("DONE");
>
>server:
>    int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>    control_writeln("LISTENING");

We found that this needed to be extended by adding an accept() loop to 
avoid filling up the backlog of the listening socket.
But by doing accept() and close() back to back, we found a problem in 
AF_VSOCK, where connect() in some cases would get stuck until the 
timeout (default: 2 seconds) returning -ETIMEDOUT.

Fix is coming.

Thanks,
Stefano

>    control_expectln("DONE");
>    close(s);
>
>Thanks,
>Stefano
Re: [PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Stefano Garzarella 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 04:14:20PM +0100, Luigi Leonardi wrote:
>Hi Michal,
>
>On Wed, Mar 19, 2025 at 01:27:35AM +0100, Michal Luczaj wrote:
>>On 3/14/25 10:27, Luigi Leonardi wrote:
>>>Add a new test to ensure that when the transport changes a null pointer
>>>dereference does not occur[1].
>>>
>>>Note that this test does not fail, but it may hang on the client side if
>>>it triggers a kernel oops.
>>>
>>>This works by creating a socket, trying to connect to a server, and then
>>>executing a second connect operation on the same socket but to a
>>>different CID (0). This triggers a transport change. If the connect
>>>operation is interrupted by a signal, this could cause a null-ptr-deref.
>>
>>Just to be clear: that's the splat, right?
>>
>>Oops: general protection fault, probably for non-canonical address 0xdffffc000000000c: 0000 [#1] PREEMPT SMP KASAN NOPTI
>>KASAN: null-ptr-deref in range [0x0000000000000060-0x0000000000000067]
>>CPU: 2 UID: 0 PID: 463 Comm: kworker/2:3 Not tainted
>>Workqueue: vsock-loopback vsock_loopback_work
>>RIP: 0010:vsock_stream_has_data+0x44/0x70
>>Call Trace:
>>virtio_transport_do_close+0x68/0x1a0
>>virtio_transport_recv_pkt+0x1045/0x2ae4
>>vsock_loopback_work+0x27d/0x3f0
>>process_one_work+0x846/0x1420
>>worker_thread+0x5b3/0xf80
>>kthread+0x35a/0x700
>>ret_from_fork+0x2d/0x70
>>ret_from_fork_asm+0x1a/0x30
>>
>
>Yep! I'll add it to the commit message in v3.
>>>...
>>>+static void test_stream_transport_change_client(const struct test_opts *opts)
>>>+{
>>>+	__sighandler_t old_handler;
>>>+	pid_t pid = getpid();
>>>+	pthread_t thread_id;
>>>+	time_t tout;
>>>+
>>>+	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
>>>+	if (old_handler == SIG_ERR) {
>>>+		perror("signal");
>>>+		exit(EXIT_FAILURE);
>>>+	}
>>>+
>>>+	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
>>>+		perror("pthread_create");
>>
>>Does pthread_create() set errno on failure?
>It does not, very good catch!
>>
>>>+		exit(EXIT_FAILURE);
>>>+	}
>>>+
>>>+	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>>
>>Isn't 10 seconds a bit excessive? I see the oops pretty much immediately.
>Yeah it's probably excessive. I used because it's the default timeout 
>value.

Please define a new macro with less time, something like we did with
ACCEPTQ_LEAK_RACE_TIMEOUT.

Thanks,
Stefano
Re: [PATCH net-next v2] vsock/test: Add test for null ptr deref when transport changes
Posted by Stefano Garzarella 9 months ago
On Fri, Mar 14, 2025 at 10:27:33AM +0100, Luigi Leonardi wrote:
>Add a new test to ensure that when the transport changes a null pointer
>dereference does not occur[1].

I'd add something like this:

"... does not occur. The bug was reported upstream [1] and fixed with
commit 2cb7c756f605 ("vsock/virtio: discard packets if the transport
changes")."

>
>Note that this test does not fail, but it may hang on the client side if
>it triggers a kernel oops.

In my case the test failed (I guess the other side that was still
working), so I'd say: "Note that this test may not fail in a kernel
without the fix, ..."

>
>This works by creating a socket, trying to connect to a server, and then
>executing a second connect operation on the same socket but to a
>different CID (0). This triggers a transport change. If the connect
>operation is interrupted by a signal, this could cause a null-ptr-deref.
>
>Since this bug is non-deterministic, we need to try several times. It
>is safe to assume that the bug will show up within the timeout period.

s/safe/reasonable

>
>If there is a G2H transport loaded in the system, the bug is not
>triggered and this test will always pass.

The rest LGTM.

Thanks,
Stefano

>
>[1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
>
>Suggested-by: Hyunwoo Kim <v4bel@theori.io>
>Suggested-by: Michal Luczaj <mhal@rbox.co>
>Signed-off-by: Luigi Leonardi <leonardi@redhat.com>
>---
>This series introduces a new test that checks for a null pointer
>dereference that may happen when there is a transport change[1]. This
>bug was fixed in [2].
>
>Note that this test *cannot* fail, it hangs if it triggers a kernel
>oops. The intended use-case is to run it and then check if there is any
>oops in the dmesg.
>
>This test is based on Hyunwoo Kim's[3] and Michal's python
>reproducers[4].
>
>[1]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
>[2]https://lore.kernel.org/netdev/20250110083511.30419-1-sgarzare@redhat.com/
>[3]https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/#t
>[4]https://lore.kernel.org/netdev/2b3062e3-bdaa-4c94-a3c0-2930595b9670@rbox.co/
>---
>Changes in v2:
>- Addressed Stefano's comments:
>    - Timeout is now using current_nsec()
>    - Check for return values
>    - Style issues
>- Added Hyunwoo Kim to Suggested-by
>- Link to v1: https://lore.kernel.org/r/20250306-test_vsock-v1-0-0320b5accf92@redhat.com
>---
> tools/testing/vsock/Makefile     |   1 +
> tools/testing/vsock/vsock_test.c | 101 +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 102 insertions(+)
>
>diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
>index 6e0b4e95e230500f99bb9c74350701a037ecd198..88211fd132d23ecdfd56ab0815580a237889e7f2 100644
>--- a/tools/testing/vsock/Makefile
>+++ b/tools/testing/vsock/Makefile
>@@ -5,6 +5,7 @@ vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o msg_ze
> vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
> vsock_perf: vsock_perf.o msg_zerocopy_common.o
>
>+vsock_test: LDLIBS = -lpthread
> vsock_uring_test: LDLIBS = -luring
> vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o msg_zerocopy_common.o
>
>diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
>index d0f6d253ac72d08a957cb81a3c38fcc72bec5a53..d2820a67403c95bc4a7e7a16113ae2f6137b4c73 100644
>--- a/tools/testing/vsock/vsock_test.c
>+++ b/tools/testing/vsock/vsock_test.c
>@@ -23,6 +23,7 @@
> #include <sys/ioctl.h>
> #include <linux/sockios.h>
> #include <linux/time64.h>
>+#include <pthread.h>
>
> #include "vsock_test_zerocopy.h"
> #include "timeout.h"
>@@ -1788,6 +1789,101 @@ static void test_stream_connect_retry_server(const struct test_opts *opts)
> 	close(fd);
> }
>
>+static void *test_stream_transport_change_thread(void *vargp)
>+{
>+	pid_t *pid = (pid_t *)vargp;
>+
>+	/* We want this thread to terminate as soon as possible */
>+	if (pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL)) {
>+		perror("pthread_setcanceltype");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	while (true) {
>+		if (kill(*pid, SIGUSR1) < 0) {
>+			perror("kill");
>+			exit(EXIT_FAILURE);
>+		}
>+	}
>+	return NULL;
>+}
>+
>+static void test_transport_change_signal_handler(int signal)
>+{
>+	/* We need a custom handler for SIGUSR1 as the default one terminates the process. */
>+}
>+
>+static void test_stream_transport_change_client(const struct test_opts *opts)
>+{
>+	__sighandler_t old_handler;
>+	pid_t pid = getpid();
>+	pthread_t thread_id;
>+	time_t tout;
>+
>+	old_handler = signal(SIGUSR1, test_transport_change_signal_handler);
>+	if (old_handler == SIG_ERR) {
>+		perror("signal");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	if (pthread_create(&thread_id, NULL, test_stream_transport_change_thread, &pid)) {
>+		perror("pthread_create");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>+	do {
>+		struct sockaddr_vm sa = {
>+			.svm_family = AF_VSOCK,
>+			.svm_cid = opts->peer_cid,
>+			.svm_port = opts->peer_port,
>+		};
>+		int s;
>+
>+		s = socket(AF_VSOCK, SOCK_STREAM, 0);
>+		if (s < 0) {
>+			perror("socket");
>+			exit(EXIT_FAILURE);
>+		}
>+
>+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>+
>+		/* Set CID to 0 cause a transport change. */
>+		sa.svm_cid = 0;
>+		connect(s, (struct sockaddr *)&sa, sizeof(sa));
>+
>+		close(s);
>+	} while (current_nsec() < tout);
>+
>+	if (pthread_cancel(thread_id)) {
>+		perror("pthread_cancel");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	/* Wait for the thread to terminate */
>+	if (pthread_join(thread_id, NULL)) {
>+		perror("pthread_join");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	/* Restore the old handler */
>+	if (signal(SIGUSR1, old_handler) == SIG_ERR) {
>+		perror("signal");
>+		exit(EXIT_FAILURE);
>+	}
>+}
>+
>+static void test_stream_transport_change_server(const struct test_opts *opts)
>+{
>+	time_t tout = current_nsec() + TIMEOUT * NSEC_PER_SEC;
>+
>+	do {
>+		int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
>+
>+		close(s);
>+	} while (current_nsec() < tout);
>+}
>+
> static void test_stream_linger_client(const struct test_opts *opts)
> {
> 	struct linger optval = {
>@@ -1984,6 +2080,11 @@ static struct test_case test_cases[] = {
> 		.run_client = test_stream_linger_client,
> 		.run_server = test_stream_linger_server,
> 	},
>+	{
>+		.name = "SOCK_STREAM transport change null-ptr-deref",
>+		.run_client = test_stream_transport_change_client,
>+		.run_server = test_stream_transport_change_server,
>+	},
> 	{},
> };
>
>
>---
>base-commit: 4d872d51bc9d7b899c1f61534e3dbde72613f627
>change-id: 20250306-test_vsock-3e77a9c7a245
>
>Best regards,
>-- 
>Luigi Leonardi <leonardi@redhat.com>
>