[PATCH mptcp-next v3 1/2] mptcp: implement psock_update_sk_prot

Geliang Tang posted 2 patches 2 months, 3 weeks ago
[PATCH mptcp-next v3 1/2] mptcp: implement psock_update_sk_prot
Posted by Geliang Tang 2 months, 3 weeks ago
From: Geliang Tang <tanggeliang@kylinos.cn>

Add MPTCP support for BPF sockmap by implementing the psock_update_sk_prot
callback. This allows MPTCP sockets to dynamically switch protocol
handlers when attached to or detached from sockmap programs. Separate
protocol structures are maintained for IPv4/IPv6 and TX/RX configurations.

tcp_bpf_update_proto() in net/ipv4/tcp_bpf.c served as the reference for
this patch.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512261144.DxrvwMS3-lkp@intel.com/
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/521
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/protocol.c | 113 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b5676b37f8f4..7a485694323b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,6 +11,7 @@
 #include <linux/netdevice.h>
 #include <linux/sched/signal.h>
 #include <linux/atomic.h>
+#include <linux/skmsg.h>
 #include <net/aligned_data.h>
 #include <net/rps.h>
 #include <net/sock.h>
@@ -4020,6 +4021,106 @@ static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
 	return 0;
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+enum {
+	MPTCP_BPF_IPV4,
+	MPTCP_BPF_IPV6,
+	MPTCP_BPF_NUM_PROTS,
+};
+
+enum {
+	MPTCP_BPF_BASE,
+	MPTCP_BPF_TX,
+	MPTCP_BPF_RX,
+	MPTCP_BPF_TXRX,
+	MPTCP_BPF_NUM_CFGS,
+};
+
+static struct proto mptcp_bpf_prots[MPTCP_BPF_NUM_PROTS][MPTCP_BPF_NUM_CFGS];
+
+static void mptcp_bpf_rebuild_protos(struct proto prot[MPTCP_BPF_NUM_CFGS],
+				     struct proto *base)
+{
+	prot[MPTCP_BPF_BASE]			= *base;
+	prot[MPTCP_BPF_BASE].destroy		= sock_map_destroy;
+	prot[MPTCP_BPF_BASE].close		= sock_map_close;
+	prot[MPTCP_BPF_BASE].sock_is_readable	= sk_msg_is_readable;
+
+	prot[MPTCP_BPF_TX]			= prot[MPTCP_BPF_BASE];
+	prot[MPTCP_BPF_RX]			= prot[MPTCP_BPF_BASE];
+	prot[MPTCP_BPF_TXRX]			= prot[MPTCP_BPF_TX];
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct proto *mptcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(mptcpv6_prot_lock);
+
+static void mptcp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{
+	/*
+	 * Load with acquire semantics to ensure we see the latest protocol
+	 * structure before checking for rebuild.
+	 */
+	if (unlikely(ops != smp_load_acquire(&mptcpv6_prot_saved))) {
+		spin_lock_bh(&mptcpv6_prot_lock);
+		if (likely(ops != mptcpv6_prot_saved)) {
+			mptcp_bpf_rebuild_protos(mptcp_bpf_prots[MPTCP_BPF_IPV6],
+						 ops);
+			/* Ensure mptcpv6_prot_saved update is visible before
+			 * releasing lock
+			 */
+			smp_store_release(&mptcpv6_prot_saved, ops);
+		}
+		spin_unlock_bh(&mptcpv6_prot_lock);
+	}
+}
+
+static int mptcp_bpf_assert_proto_ops(struct proto *ops)
+{
+	/* In order to avoid retpoline, we make assumptions when we call
+	 * into ops if e.g. a psock is not present. Make sure they are
+	 * indeed valid assumptions.
+	 */
+	return ops->recvmsg  == mptcp_recvmsg &&
+	       ops->sendmsg  == mptcp_sendmsg ? 0 : -EOPNOTSUPP;
+}
+#endif
+
+static int mptcp_bpf_update_proto(struct sock *sk,
+				  struct sk_psock *psock,
+				  bool restore)
+{
+	int family = sk->sk_family == AF_INET6 ? MPTCP_BPF_IPV6 :
+						 MPTCP_BPF_IPV4;
+	int config = psock->progs.msg_parser   ? MPTCP_BPF_TX   :
+						 MPTCP_BPF_BASE;
+
+	if (psock->progs.stream_verdict || psock->progs.skb_verdict)
+		config = (config == MPTCP_BPF_TX) ? MPTCP_BPF_TXRX :
+						    MPTCP_BPF_RX;
+
+	if (restore) {
+		WRITE_ONCE(sk->sk_write_space, psock->saved_write_space);
+		/* Pairs with lockless read in sk_clone_lock() */
+		sock_replace_proto(sk, psock->sk_proto);
+		return 0;
+	}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		if (mptcp_bpf_assert_proto_ops(psock->sk_proto))
+			return -EINVAL;
+
+		mptcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+	}
+#endif
+
+	/* Pairs with lockless read in sk_clone_lock() */
+	sock_replace_proto(sk, &mptcp_bpf_prots[family][config]);
+	return 0;
+}
+#endif
+
 static struct proto mptcp_prot = {
 	.name		= "MPTCP",
 	.owner		= THIS_MODULE,
@@ -4051,8 +4152,20 @@ static struct proto mptcp_prot = {
 	.obj_size	= sizeof(struct mptcp_sock),
 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
 	.no_autobind	= true,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= mptcp_bpf_update_proto,
+#endif
 };
 
+#ifdef CONFIG_BPF_SYSCALL
+static int __init mptcp_bpf_v4_build_proto(void)
+{
+	mptcp_bpf_rebuild_protos(mptcp_bpf_prots[MPTCP_BPF_IPV4], &mptcp_prot);
+	return 0;
+}
+late_initcall(mptcp_bpf_v4_build_proto);
+#endif
+
 static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
-- 
2.51.0
Re: [PATCH mptcp-next v3 1/2] mptcp: implement psock_update_sk_prot
Posted by Mat Martineau 1 month, 3 weeks ago
On Wed, 18 Mar 2026, Geliang Tang wrote:

> From: Geliang Tang <tanggeliang@kylinos.cn>
>
> Add MPTCP support for BPF sockmap by implementing psock_update_sk_prot
> callback. This allows MPTCP sockets to dynamically switch protocol
> handlers when attached to or detached from sockmap programs. Separate
> protocol structures are maintained for IPv4/IPv6 and TX/RX configurations.
>
> tcp_bpf_update_proto() in net/ipv4/tcp_bpf.c is a frame of reference for
> this patch.
>
> Reported-by: kernel test robot <lkp@intel.com>
> Closes: https://lore.kernel.org/oe-kbuild-all/202512261144.DxrvwMS3-lkp@intel.com/
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/521
> Cc: Cong Wang <xiyou.wangcong@gmail.com>
> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> ---
> net/mptcp/protocol.c | 113 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 113 insertions(+)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index b5676b37f8f4..7a485694323b 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -11,6 +11,7 @@
> #include <linux/netdevice.h>
> #include <linux/sched/signal.h>
> #include <linux/atomic.h>
> +#include <linux/skmsg.h>
> #include <net/aligned_data.h>
> #include <net/rps.h>
> #include <net/sock.h>
> @@ -4020,6 +4021,106 @@ static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
> 	return 0;
> }
>
> +#ifdef CONFIG_BPF_SYSCALL
> +enum {
> +	MPTCP_BPF_IPV4,
> +	MPTCP_BPF_IPV6,
> +	MPTCP_BPF_NUM_PROTS,
> +};
> +
> +enum {
> +	MPTCP_BPF_BASE,
> +	MPTCP_BPF_TX,
> +	MPTCP_BPF_RX,
> +	MPTCP_BPF_TXRX,
> +	MPTCP_BPF_NUM_CFGS,
> +};
> +
> +static struct proto mptcp_bpf_prots[MPTCP_BPF_NUM_PROTS][MPTCP_BPF_NUM_CFGS];
> +
> +static void mptcp_bpf_rebuild_protos(struct proto prot[MPTCP_BPF_NUM_CFGS],
> +				     struct proto *base)
> +{
> +	prot[MPTCP_BPF_BASE]			= *base;
> +	prot[MPTCP_BPF_BASE].destroy		= sock_map_destroy;
> +	prot[MPTCP_BPF_BASE].close		= sock_map_close;
> +	prot[MPTCP_BPF_BASE].sock_is_readable	= sk_msg_is_readable;
> +
> +	prot[MPTCP_BPF_TX]			= prot[MPTCP_BPF_BASE];
> +	prot[MPTCP_BPF_RX]			= prot[MPTCP_BPF_BASE];
> +	prot[MPTCP_BPF_TXRX]			= prot[MPTCP_BPF_TX];

Hi Geliang -

Minor thing, can you change the last line to also assign from 
prot[MPTCP_BPF_BASE]? The tcp_bpf.c code does copy from TCP_BPF_TX but it 
has a different sendmsg member value.

Related question: why does TCP have separate sendmsg/recvmsg for BPF but 
MPTCP does not? The cover letter mentions "basic MPTCP BPF sockmap 
support", so it looks like not all sockmap functionality is implemented. 
Can you update the commit message and add some comment explaining what 
sockmap functionality is missing in MPTCP relative to TCP?


> +}
> +
> +#if IS_ENABLED(CONFIG_MPTCP_IPV6)
> +static struct proto *mptcpv6_prot_saved __read_mostly;
> +static DEFINE_SPINLOCK(mptcpv6_prot_lock);
> +
> +static void mptcp_bpf_check_v6_needs_rebuild(struct proto *ops)
> +{
> +	/*
> +	 * Load with acquire semantics to ensure we see the latest protocol
> +	 * structure before checking for rebuild.
> +	 */
> +	if (unlikely(ops != smp_load_acquire(&mptcpv6_prot_saved))) {
> +		spin_lock_bh(&mptcpv6_prot_lock);
> +		if (likely(ops != mptcpv6_prot_saved)) {
> +			mptcp_bpf_rebuild_protos(mptcp_bpf_prots[MPTCP_BPF_IPV6],
> +						 ops);
> +			/* Ensure mptcpv6_prot_saved update is visible before
> +			 * releasing lock
> +			 */
> +			smp_store_release(&mptcpv6_prot_saved, ops);
> +		}
> +		spin_unlock_bh(&mptcpv6_prot_lock);
> +	}
> +}
> +
> +static int mptcp_bpf_assert_proto_ops(struct proto *ops)
> +{
> +	/* In order to avoid retpoline, we make assumptions when we call
> +	 * into ops if e.g. a psock is not present. Make sure they are
> +	 * indeed valid assumptions.
> +	 */
> +	return ops->recvmsg  == mptcp_recvmsg &&
> +	       ops->sendmsg  == mptcp_sendmsg ? 0 : -EOPNOTSUPP;

Please fix the extra space before each "==" as well.

> +}
> +#endif
> +
> +static int mptcp_bpf_update_proto(struct sock *sk,
> +				  struct sk_psock *psock,
> +				  bool restore)
> +{
> +	int family = sk->sk_family == AF_INET6 ? MPTCP_BPF_IPV6 :
> +						 MPTCP_BPF_IPV4;
> +	int config = psock->progs.msg_parser   ? MPTCP_BPF_TX   :
> +						 MPTCP_BPF_BASE;
> +
> +	if (psock->progs.stream_verdict || psock->progs.skb_verdict)
> +		config = (config == MPTCP_BPF_TX) ? MPTCP_BPF_TXRX :
> +						    MPTCP_BPF_RX;
> +
> +	if (restore) {
> +		WRITE_ONCE(sk->sk_write_space, psock->saved_write_space);
> +		/* Pairs with lockless read in sk_clone_lock() */
> +		sock_replace_proto(sk, psock->sk_proto);
> +		return 0;
> +	}
> +
> +#if IS_ENABLED(CONFIG_MPTCP_IPV6)
> +	if (sk->sk_family == AF_INET6) {
> +		if (mptcp_bpf_assert_proto_ops(psock->sk_proto))
> +			return -EINVAL;
> +
> +		mptcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
> +	}
> +#endif
> +
> +	/* Pairs with lockless read in sk_clone_lock() */

I see where this was copied from the tcp code, but there is no lockless 
read in sk_clone_lock(). It's in sk_clone().

Thanks,
Mat


> +	sock_replace_proto(sk, &mptcp_bpf_prots[family][config]);
> +	return 0;
> +}
> +#endif
> +
> static struct proto mptcp_prot = {
> 	.name		= "MPTCP",
> 	.owner		= THIS_MODULE,
> @@ -4051,8 +4152,20 @@ static struct proto mptcp_prot = {
> 	.obj_size	= sizeof(struct mptcp_sock),
> 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
> 	.no_autobind	= true,
> +#ifdef CONFIG_BPF_SYSCALL
> +	.psock_update_sk_prot	= mptcp_bpf_update_proto,
> +#endif
> };
>
> +#ifdef CONFIG_BPF_SYSCALL
> +static int __init mptcp_bpf_v4_build_proto(void)
> +{
> +	mptcp_bpf_rebuild_protos(mptcp_bpf_prots[MPTCP_BPF_IPV4], &mptcp_prot);
> +	return 0;
> +}
> +late_initcall(mptcp_bpf_v4_build_proto);
> +#endif
> +
> static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
> {
> 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
> -- 
> 2.51.0
>
>
>