The sockmap feature allows the sk_prot of a socket to be replaced with
sockmap's custom read/write interfaces, either through the bpf syscall
from userspace or through bpf sockops during protocol stack processing.
'''
tcp_rcv_state_process()
syn_recv_sock()/subflow_syn_recv_sock()
tcp_init_transfer(BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
bpf_skops_established <== sockops
bpf_sock_map_update(sk) <== call bpf helper
tcp_bpf_update_proto() <== update sk_prot
'''
When the server has MPTCP enabled but the client sends a TCP SYN
without MPTCP, subflow_syn_recv_sock() performs a fallback on the
subflow, replacing the subflow sk's sk_prot with the native sk_prot.
'''
subflow_syn_recv_sock()
subflow_ulp_fallback()
subflow_drop_ctx()
mptcp_subflow_ops_undo_override()
'''
Then, this subflow can be used by sockmap like a normal TCP socket, and
sockmap replaces the native sk_prot with its own custom sk_prot. The issue
occurs when the user calls accept(), which goes through
mptcp_stream_accept() and mptcp_fallback_tcp_ops().
There, sk->sk_prot is compared with the native sk_prot, but this check
is incorrect when sockmap is in use, so sk->sk_socket->ops may end up
being set incorrectly.
This fix uses the more generic sk_family for the comparison instead.
Additionally, this prevents the following panic (decoded with
./scripts/decode_stacktrace.sh):
------------[ cut here ]------------
BUG: kernel NULL pointer dereference, address: 00000000000004bb
PGD 0 P4D 0
Oops: 0000 [#1] SMP PTI
CPU: 0 PID: 400 Comm: test_progs Not tainted 6.1.0+ #16
RIP: 0010:mptcp_stream_accept (./include/linux/list.h:88 net/mptcp/protocol.c:3719)
RSP: 0018:ffffc90000ef3cf0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880089dcc58
RDX: 0000000000000003 RSI: 0000002c000000b0 RDI: 0000000000000000
RBP: ffffc90000ef3d38 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8880089dc600
R13: ffff88800b859e00 R14: ffff88800638c680 R15: 0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000004bb CR3: 000000000b8e8006 CR4: 0000000000770ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
? apparmor_socket_accept (security/apparmor/lsm.c:966)
do_accept (net/socket.c:1856)
__sys_accept4 (net/socket.c:1897 net/socket.c:1927)
__x64_sys_accept (net/socket.c:1941)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
Fixes: d2f77c53342e ("mptcp: check for plain TCP sock at accept time")
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
net/mptcp/protocol.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1dbc62537259..13e3510e6c8f 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -79,8 +79,9 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
static bool mptcp_is_tcpsk(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
+ unsigned short family = READ_ONCE(sk->sk_family);
- if (unlikely(sk->sk_prot == &tcp_prot)) {
+ if (unlikely(family == AF_INET)) {
/* we are being invoked after mptcp_accept() has
* accepted a non-mp-capable flow: sk is a tcp_sk,
* not an mptcp one.
@@ -91,7 +92,7 @@ static bool mptcp_is_tcpsk(struct sock *sk)
sock->ops = &inet_stream_ops;
return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
+ } else if (unlikely(family == AF_INET6)) {
sock->ops = &inet6_stream_ops;
return true;
#endif
--
2.43.0
Hi Jiayuan,
On 30/11/2025 04:23, Jiayuan Chen wrote:
> The sockmap feature allows bpf syscall from userspace, or based
> on bpf sockops, replacing the sk_prot of sockets during protocol stack
> processing with sockmap's custom read/write interfaces.
(...)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 1dbc62537259..13e3510e6c8f 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -79,8 +79,9 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
> static bool mptcp_is_tcpsk(struct sock *sk)
> {
> struct socket *sock = sk->sk_socket;
> + unsigned short family = READ_ONCE(sk->sk_family);
>
> - if (unlikely(sk->sk_prot == &tcp_prot)) {
> + if (unlikely(family == AF_INET)) {
> /* we are being invoked after mptcp_accept() has
> * accepted a non-mp-capable flow: sk is a tcp_sk,
> * not an mptcp one.
> @@ -91,7 +92,7 @@ static bool mptcp_is_tcpsk(struct sock *sk)
> sock->ops = &inet_stream_ops;
> return true;
> #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
> + } else if (unlikely(family == AF_INET6)) {
These modifications here break MPTCP: this function (mptcp_is_tcpsk) is
there to check if the socket is a "plain" TCP one (return "true") or an
MPTCP one (return "false"). If it is not an MPTCP one, the sock ops is
modified.
Here, you are saying: any IPv4 or IPv6 socket is a "plain" TCP one,
never an MPTCP socket then.
I suggest adding ...
if (sk->sk_protocol == IPPROTO_MPTCP)
return false;
... at the beginning of this function. I'm planning to send a patch
later on including this check. Once it is sent, do you mind checking it
with sockmap if you have the setup available, please?
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
2025/12/1 24:21, "Matthieu Baerts" <matttbe@kernel.org> wrote:
>
> Hi Jiayuan,
>
> On 30/11/2025 04:23, Jiayuan Chen wrote:
>
> >
> > The sockmap feature allows bpf syscall from userspace, or based
> > on bpf sockops, replacing the sk_prot of sockets during protocol stack
> > processing with sockmap's custom read/write interfaces.
> >
> (...)
>
> >
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 1dbc62537259..13e3510e6c8f 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -79,8 +79,9 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
> > static bool mptcp_is_tcpsk(struct sock *sk)
> > {
> > struct socket *sock = sk->sk_socket;
> > + unsigned short family = READ_ONCE(sk->sk_family);
> >
> > - if (unlikely(sk->sk_prot == &tcp_prot)) {
> > + if (unlikely(family == AF_INET)) {
> > /* we are being invoked after mptcp_accept() has
> > * accepted a non-mp-capable flow: sk is a tcp_sk,
> > * not an mptcp one.
> > @@ -91,7 +92,7 @@ static bool mptcp_is_tcpsk(struct sock *sk)
> > sock->ops = &inet_stream_ops;
> > return true;
> > #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> > - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
> > + } else if (unlikely(family == AF_INET6)) {
> >
> These modifications here break MPTCP: this function (mptcp_is_tcpsk) is
> there to check if the socket is a "plain" TCP one (return "true") or an
> MPTCP one (return "false"). If it is not an MPTCP one, the sock ops is
> modified.
>
> Here, you are saying: any IPv4 or IPv6 socket is a "plain" TCP one,
> never an MPTCP socket then.
>
> I suggest adding ...
>
> if (sk->sk_protocol == IPPROTO_MPTCP)
> return false;
>
> ... at the beginning of this function. I'm planning to send a patch
> later on including this check. Once it is sent, do you mind checking it
> with sockmap if you have the setup available, please?
Yes, of course. I can test it once I receive the patch.
> Cheers,
> Matt
> --
> Sponsored by the NGI0 Core fund.
>
© 2016 - 2025 Red Hat, Inc.