sockmap expects all vsocks to have a transport assigned, which is expressed
in vsock_proto::psock_update_sk_prot(). However, there is an edge case
where an unconnected (connectible) socket may lose its previously assigned
transport. This is handled with a NULL check in the vsock/BPF recv path.
Another design detail is that listening vsocks are not supposed to have any
transport assigned at all. Which implies they are not supported by the
sockmap. But this is complicated by the fact that a socket, before
switching to TCP_LISTEN, may have had some transport assigned during a
failed connect() attempt. Hence, we may end up with a listening vsock in a
sockmap, which blows up quickly:
KASAN: null-ptr-deref in range [0x0000000000000120-0x0000000000000127]
CPU: 7 UID: 0 PID: 56 Comm: kworker/7:0 Not tainted 6.14.0-rc1+
Workqueue: vsock-loopback vsock_loopback_work
RIP: 0010:vsock_read_skb+0x4b/0x90
Call Trace:
sk_psock_verdict_data_ready+0xa4/0x2e0
virtio_transport_recv_pkt+0x1ca8/0x2acc
vsock_loopback_work+0x27d/0x3f0
process_one_work+0x846/0x1420
worker_thread+0x5b3/0xf80
kthread+0x35a/0x700
ret_from_fork+0x2d/0x70
ret_from_fork_asm+0x1a/0x30
For connectible sockets, instead of relying solely on the state of
vsk->transport, tell sockmap to only allow those representing established
connections. This aligns with the behaviour for AF_INET and AF_UNIX.
Fixes: 634f1a7110b4 ("vsock: support sockmap")
Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
net/core/sock_map.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index f1b9b3958792cd599efcb591742874e9b3f4a76b..2f1be9baad0578e2202b5cf79616b6e814c1ed54 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -541,6 +541,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
if (sk_is_stream_unix(sk))
return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ if (sk_is_vsock(sk) &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
return true;
}
--
2.48.1
> ...
> Another design detail is that listening vsocks are not supposed to have any
> transport assigned at all. Which implies they are not supported by the
> sockmap. But this is complicated by the fact that a socket, before
> switching to TCP_LISTEN, may have had some transport assigned during a
> failed connect() attempt. Hence, we may end up with a listening vsock in a
> sockmap, which blows up quickly:
>
> KASAN: null-ptr-deref in range [0x0000000000000120-0x0000000000000127]
> CPU: 7 UID: 0 PID: 56 Comm: kworker/7:0 Not tainted 6.14.0-rc1+
> Workqueue: vsock-loopback vsock_loopback_work
> RIP: 0010:vsock_read_skb+0x4b/0x90
> Call Trace:
> sk_psock_verdict_data_ready+0xa4/0x2e0
> virtio_transport_recv_pkt+0x1ca8/0x2acc
> vsock_loopback_work+0x27d/0x3f0
> process_one_work+0x846/0x1420
> worker_thread+0x5b3/0xf80
> kthread+0x35a/0x700
> ret_from_fork+0x2d/0x70
> ret_from_fork_asm+0x1a/0x30
Perhaps I should have expanded more on the null-ptr-deref itself.
The idea is: force a vsock into assigning a transport and add it to the
sockmap (with a verdict program), but keep it unconnected. Then, drop
the transport and set the vsock to TCP_LISTEN. The moment a new
connection is established:
virtio_transport_recv_pkt()
virtio_transport_recv_listen()
sk->sk_data_ready(sk) i.e. sk_psock_verdict_data_ready()
ops->read_skb() i.e. vsock_read_skb()
vsk->transport->read_skb() vsk->transport is NULL, boom
Here's a stand-alone repro:
/*
* # modprobe -a vsock_loopback vhost_vsock
* # gcc test.c && ./a.out
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <linux/bpf.h>
#include <linux/vm_sockets.h>
static void die(const char *msg)
{
perror(msg);
exit(-1);
}
static int sockmap_create(void)
{
union bpf_attr attr = {
.map_type = BPF_MAP_TYPE_SOCKMAP,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1
};
int map;
map = syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
if (map < 0)
die("map_create");
return map;
}
static void map_update_elem(int fd, int key, int value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = (uint64_t)&key,
.value = (uint64_t)&value,
.flags = BPF_ANY
};
if (syscall(SYS_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
die("map_update_elem");
}
static int prog_load(void)
{
/* mov %r0, 1; exit */
struct bpf_insn insns[] = {
{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = 0, .imm = 1 },
{ .code = BPF_JMP | BPF_EXIT }
};
union bpf_attr attr = {
.prog_type = BPF_PROG_TYPE_SK_SKB,
.insn_cnt = sizeof(insns)/sizeof(insns[0]),
.insns = (long)insns,
.license = (long)"",
};
int prog = syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
if (prog < 0)
die("prog_load");
return prog;
}
static void link_create(int prog_fd, int target_fd)
{
union bpf_attr attr = {
.link_create = {
.prog_fd = prog_fd,
.target_fd = target_fd,
.attach_type = BPF_SK_SKB_VERDICT
}
};
if (syscall(SYS_bpf, BPF_LINK_CREATE, &attr, sizeof(attr)) < 0)
die("link_create");
}
int main(void)
{
struct sockaddr_vm addr = {
.svm_family = AF_VSOCK,
.svm_cid = VMADDR_CID_LOCAL,
.svm_port = VMADDR_PORT_ANY
};
socklen_t alen = sizeof(addr);
int s, map, prog, c;
s = socket(AF_VSOCK, SOCK_SEQPACKET, 0);
if (s < 0)
die("socket");
if (bind(s, (struct sockaddr *)&addr, alen))
die("bind");
if (!connect(s, (struct sockaddr *)&addr, alen) || errno != ECONNRESET)
die("connect #1");
map = sockmap_create();
prog = prog_load();
link_create(prog, map);
map_update_elem(map, 0, s);
addr.svm_cid = 0x42424242; /* non-existing */
if (!connect(s, (struct sockaddr *)&addr, alen) || errno != ESOCKTNOSUPPORT)
die("connect #2");
if (listen(s, 1))
die("listen");
if (getsockname(s, (struct sockaddr *)&addr, &alen))
die("getsockname");
c = socket(AF_VSOCK, SOCK_SEQPACKET, 0);
if (c < 0)
die("socket c");
if (connect(c, (struct sockaddr *)&addr, alen))
die("connect #3");
return 0;
}
On Fri, Feb 14, 2025 at 02:11:48PM +0100, Michal Luczaj wrote:
>> ...
>> Another design detail is that listening vsocks are not supposed to have any
>> transport assigned at all. Which implies they are not supported by the
>> sockmap. But this is complicated by the fact that a socket, before
>> switching to TCP_LISTEN, may have had some transport assigned during a
>> failed connect() attempt. Hence, we may end up with a listening vsock in a
>> sockmap, which blows up quickly:
>>
>> KASAN: null-ptr-deref in range [0x0000000000000120-0x0000000000000127]
>> CPU: 7 UID: 0 PID: 56 Comm: kworker/7:0 Not tainted 6.14.0-rc1+
>> Workqueue: vsock-loopback vsock_loopback_work
>> RIP: 0010:vsock_read_skb+0x4b/0x90
>> Call Trace:
>> sk_psock_verdict_data_ready+0xa4/0x2e0
>> virtio_transport_recv_pkt+0x1ca8/0x2acc
>> vsock_loopback_work+0x27d/0x3f0
>> process_one_work+0x846/0x1420
>> worker_thread+0x5b3/0xf80
>> kthread+0x35a/0x700
>> ret_from_fork+0x2d/0x70
>> ret_from_fork_asm+0x1a/0x30
>
>Perhaps I should have expanded more on the null-ptr-deref itself.
>
>The idea is: force a vsock into assigning a transport and add it to the
>sockmap (with a verdict program), but keep it unconnected. Then, drop
>the transport and set the vsock to TCP_LISTEN. The moment a new
>connection is established:
>
>virtio_transport_recv_pkt()
> virtio_transport_recv_listen()
> sk->sk_data_ready(sk) i.e. sk_psock_verdict_data_ready()
> ops->read_skb() i.e. vsock_read_skb()
> vsk->transport->read_skb() vsk->transport is NULL, boom
>
Yes I agree, it's a little clearer with this, but I think it was also
clear the concept before. So with or without:
Acked-by: Stefano Garzarella <sgarzare@redhat.com>
>Here's a stand-alone repro:
>
>/*
> * # modprobe -a vsock_loopback vhost_vsock
> * # gcc test.c && ./a.out
> */
>#include <stdio.h>
>#include <stdint.h>
>#include <stdlib.h>
>#include <unistd.h>
>#include <errno.h>
>#include <sys/socket.h>
>#include <sys/syscall.h>
>#include <linux/bpf.h>
>#include <linux/vm_sockets.h>
>
>static void die(const char *msg)
>{
> perror(msg);
> exit(-1);
>}
>
>static int sockmap_create(void)
>{
> union bpf_attr attr = {
> .map_type = BPF_MAP_TYPE_SOCKMAP,
> .key_size = sizeof(int),
> .value_size = sizeof(int),
> .max_entries = 1
> };
> int map;
>
> map = syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
> if (map < 0)
> die("map_create");
>
> return map;
>}
>
>static void map_update_elem(int fd, int key, int value)
>{
> union bpf_attr attr = {
> .map_fd = fd,
> .key = (uint64_t)&key,
> .value = (uint64_t)&value,
> .flags = BPF_ANY
> };
>
> if (syscall(SYS_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
> die("map_update_elem");
>}
>
>static int prog_load(void)
>{
> /* mov %r0, 1; exit */
> struct bpf_insn insns[] = {
> { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = 0, .imm = 1 },
> { .code = BPF_JMP | BPF_EXIT }
> };
> union bpf_attr attr = {
> .prog_type = BPF_PROG_TYPE_SK_SKB,
> .insn_cnt = sizeof(insns)/sizeof(insns[0]),
> .insns = (long)insns,
> .license = (long)"",
> };
>
> int prog = syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
> if (prog < 0)
> die("prog_load");
>
> return prog;
>}
>
>static void link_create(int prog_fd, int target_fd)
>{
> union bpf_attr attr = {
> .link_create = {
> .prog_fd = prog_fd,
> .target_fd = target_fd,
> .attach_type = BPF_SK_SKB_VERDICT
> }
> };
>
> if (syscall(SYS_bpf, BPF_LINK_CREATE, &attr, sizeof(attr)) < 0)
> die("link_create");
>}
>
>int main(void)
>{
> struct sockaddr_vm addr = {
> .svm_family = AF_VSOCK,
> .svm_cid = VMADDR_CID_LOCAL,
> .svm_port = VMADDR_PORT_ANY
> };
> socklen_t alen = sizeof(addr);
> int s, map, prog, c;
>
> s = socket(AF_VSOCK, SOCK_SEQPACKET, 0);
> if (s < 0)
> die("socket");
>
> if (bind(s, (struct sockaddr *)&addr, alen))
> die("bind");
>
> if (!connect(s, (struct sockaddr *)&addr, alen) || errno != ECONNRESET)
> die("connect #1");
>
> map = sockmap_create();
> prog = prog_load();
> link_create(prog, map);
> map_update_elem(map, 0, s);
>
> addr.svm_cid = 0x42424242; /* non-existing */
> if (!connect(s, (struct sockaddr *)&addr, alen) || errno != ESOCKTNOSUPPORT)
> die("connect #2");
>
> if (listen(s, 1))
> die("listen");
>
> if (getsockname(s, (struct sockaddr *)&addr, &alen))
> die("getsockname");
>
> c = socket(AF_VSOCK, SOCK_SEQPACKET, 0);
> if (c < 0)
> die("socket c");
>
> if (connect(c, (struct sockaddr *)&addr, alen))
> die("connect #3");
>
> return 0;
>}
>
© 2016 - 2025 Red Hat, Inc.