From nobody Wed Apr 15 16:29:16 2026 Received: from out-181.mta0.migadu.com (out-181.mta0.migadu.com [91.218.175.181]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DDB9E33AD88 for ; Wed, 4 Mar 2026 06:37:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.181 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606260; cv=none; b=h1bpZMY8zLU36W1RKWSnAol8Mh3koysGhbuwtj5bIgcN9YMNz7yVFLS1hGjoCE+JIemM16surrj4VcmLRnEm1GWW+LuK7q4VPQUqyqQXk+2FbrmvmwI5l7Qf/ekvR4hZrgpoKSJ54sFTnQwmsDt+/DYzSmztDXgre1qw7SGsuks= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606260; c=relaxed/simple; bh=lsIWMaQL/3whKexmrcJ91u/Whymn/Pb8xcsaW7Jb11M=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=InvbyxHv8Yd4qA8NpOPdZmPfN/xCrk4jzbM1W1VZ8CxTn/3fX63bniv5Y2mPXjfIHT89L/8CEmo5dH4J1ySV5EB2z8MqugzxH9U8inCkddiWsKV+AsOxHVzSwwNcVwWqdMnXb7mAW8AjY5+mzh1Y5PbJru6iSbxWAb5PG4Kni6M= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=GhSM2U/8; arc=none smtp.client-ip=91.218.175.181 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="GhSM2U/8" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606256; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=9VefZAyfd4lTF4RnV2A7nIRCdqG5nxtx9S7DWo98eaU=; b=GhSM2U/8YC96Es7pkV8cwof4OEYp+LUxYDTKPDho37h4hYPlF+IzxY2YlRhUCvJGQnuk6w heiudbs8dFsIptmC3EimiGBQ40Vd2s76Ts7IIN1FM7kZbfUZX5z7mjlSYfhWOj7lSYq4Py nOQWcshZ7LGy/9it3z5bsDgn4eK8Jmk= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 1/7] net: add splice_read to struct proto and set it in tcp_prot/tcpv6_prot Date: Wed, 4 Mar 2026 14:33:52 +0800 Message-ID: <20260304063643.14581-2-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" Add a splice_read function pointer to struct proto between recvmsg and splice_eof. Set it to tcp_splice_read in both tcp_prot and tcpv6_prot. Signed-off-by: Jiayuan Chen --- include/net/sock.h | 3 +++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 3 files changed, 5 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 6c9a83016e95..de28af168ec4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1317,6 +1317,9 @@ struct proto { size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); + ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d53d39be291a..fa445f20b427 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3433,6 +3433,7 @@ struct proto tcp_prot =3D { .keepalive =3D tcp_set_keepalive, .recvmsg =3D tcp_recvmsg, .sendmsg =3D tcp_sendmsg, + .splice_read =3D tcp_splice_read, .splice_eof =3D tcp_splice_eof, .backlog_rcv =3D tcp_v4_do_rcv, .release_cb =3D tcp_release_cb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e46a0efae012..277b954c524d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2295,6 +2295,7 @@ struct proto tcpv6_prot =3D { .keepalive =3D tcp_set_keepalive, .recvmsg =3D tcp_recvmsg, .sendmsg =3D tcp_sendmsg, + .splice_read =3D tcp_splice_read, .splice_eof =3D tcp_splice_eof, .backlog_rcv =3D tcp_v6_do_rcv, .release_cb =3D tcp_release_cb, --=20 2.43.0 From nobody Wed Apr 15 16:29:16 2026 Received: from out-170.mta0.migadu.com (out-170.mta0.migadu.com [91.218.175.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E686437F8B9 for ; Wed, 4 Mar 2026 06:38:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.170 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606284; cv=none; b=hqPqjOIMI2sDUTlannzIQNxfVC/a6G5lsYNcirOQIbCFmoxekiYSR2wCdH9Jit7D0Ve6UbXLXiA6qF81bMvmRl4RlwKyRHTR5qWVOofzytHmzmPl9D+WP7r77+quAywBhPMgjXpohmD+EcC2/RHUl6p2R7ru3GQkSh8Mswrwz2g= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606284; c=relaxed/simple; bh=VBHqTt/FhWZsMO+m6rPxjyAbdZ7icfdYuGxRqnevfCM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=TtcrtYUhrScpFhLlfwW95n0yLYLWjOoKX1VbaigS6bNgh/3qhLjqYlYYfYIp4lGGn73uadVKf026B4TWBdNvQMSIcATvgoUyVv7l/KhDcOY9gFssTHRcZ4aDIGOTPbERxuVly1YGjQ7vwyHfd7sg+VsIL4PLcEvK2+n8gKEOA5k= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=vsdHcr/h; arc=none smtp.client-ip=91.218.175.170 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="vsdHcr/h" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606280; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=rGZj5xXe+WPvDUb4thRq5WQY+Pyy46ij4fOCKK+Ii00=; b=vsdHcr/hO21XNvsSTo6Q0FVpU3e0DLphACPRp6pdWkVH3xMla7vZkvOWbDqxRogzwWYDnU ONtC7XuICoEQLTAlxAd4sBjhPOB6+TG9+Grv7E9KKW114rNrn8EWMfGii88UnVXcvrmKxz 9Yaov08wwGzxYNZBAwBFei8o2uhL+M8= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 2/7] inet: add inet_splice_read() and use it in inet_stream_ops/inet6_stream_ops Date: Wed, 4 Mar 2026 14:33:53 +0800 Message-ID: <20260304063643.14581-3-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" Add inet_splice_read() which dispatches to sk->sk_prot->splice_read via INDIRECT_CALL_1. Replace the direct tcp_splice_read reference in inet_stream_ops and inet6_stream_ops with inet_splice_read. Signed-off-by: Jiayuan Chen --- include/net/inet_common.h | 3 +++ net/ipv4/af_inet.c | 15 ++++++++++++++- net/ipv6/af_inet6.c | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 5dd2bf24449e..84f2744d57f8 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -34,6 +34,9 @@ void __inet_accept(struct socket *sock, struct socket *ne= wsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +ssize_t inet_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); void inet_splice_eof(struct socket *sock); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8036e76aa1e4..2c7b35d9c62d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -876,6 +876,19 @@ void inet_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(inet_splice_eof); =20 +ssize_t inet_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk =3D sock->sk; + const struct proto *prot; + + prot =3D READ_ONCE(sk->sk_prot); + return INDIRECT_CALL_1(prot->splice_read, tcp_splice_read, sock, + ppos, pipe, len, flags); +} +EXPORT_SYMBOL_GPL(inet_splice_read); + INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, @@ -1079,7 +1092,7 @@ const struct proto_ops inet_stream_ops =3D { .mmap =3D tcp_mmap, #endif .splice_eof =3D inet_splice_eof, - .splice_read =3D tcp_splice_read, + .splice_read =3D inet_splice_read, .set_peek_off =3D sk_set_peek_off, .read_sock =3D tcp_read_sock, .read_skb =3D tcp_read_skb, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23cc9b4cb2f1..12256b0234ff 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -705,7 +705,7 @@ const struct proto_ops inet6_stream_ops =3D { #endif .splice_eof =3D inet_splice_eof, .sendmsg_locked =3D tcp_sendmsg_locked, - .splice_read =3D tcp_splice_read, + .splice_read =3D inet_splice_read, .set_peek_off =3D sk_set_peek_off, .read_sock =3D tcp_read_sock, .read_skb =3D tcp_read_skb, --=20 2.43.0 From nobody Wed Apr 15 16:29:16 2026 Received: from out-189.mta0.migadu.com (out-189.mta0.migadu.com [91.218.175.189]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2174E381AEA for ; Wed, 4 Mar 2026 06:38:26 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606309; cv=none; b=MX+niSTvdcBzWIa16HSY2Sqvj5002hEqZuPYK/+LVsksgheJINLuNhVeACtPNJOuc3r4JkMQvvjLjIvxYdT7ADv5b+ufaCCgLhVDxn/TcQNkZ3zUF3qJnQFrcTrRRLiNS1C5t2eBvC4oVy7Y7xdTJ936qlrgRLNiO4g64uY207w= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606309; c=relaxed/simple; bh=rnvgDuIM8vQ4Ur2khkq6Tdvxz0lxrQhZ/hIDvNzyy0U=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=WWjLn4xdOtux9Owtmo6MnqOhSxTje9eVr7jSW/SZlYp1e50JdYIACQkGpX5i6PQxoWob+L6125E18uJH2Y1X3cmDRvK6FHYdnPXeeqjJbBxJR+NbrXDDcuU+GhK/rVveTzTAu+S0ie4C8PJkj+tgJBKQz+rLsbp5A6TWQCIRrtw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=xfDP4xHP; arc=none smtp.client-ip=91.218.175.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="xfDP4xHP" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606305; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=mOpTXC80TFbvx4uT0sZN6YnfWzBhPWN4KMs+EKSr7jQ=; b=xfDP4xHPEXKJwwyYAnMNy7mgd5Pd43uM7Ho6f2jDCU6ljHy6Gj141uHst5V5g4l6/XC/7S J1Jf33ygGa0eJtR7tct3MO1tBTXJ032HyyW+jflHGZL5MidbafqtCEJRPM3ew3bNXPezn+ OY6J7HrtubuC74j7W3yDI4MIh2/2ma0= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 3/7] tcp_bpf: refactor recvmsg with read actor abstraction Date: Wed, 4 Mar 2026 14:33:54 +0800 Message-ID: <20260304063643.14581-4-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" Refactor the read operation with no functional changes. tcp_bpf has two read paths: strparser and non-strparser. Currently the differences are implemented directly in their respective recvmsg functions, which works fine. However, upcoming splice support would require duplicating the same logic for both paths. To avoid this, extract the strparser-specific differences into an independent abstraction that can be reused by splice. For ingress_msg data processing, introduce a function pointer callback approach. The current implementation passes sk_msg_recvmsg_actor(), which performs copy_page_to_iter() - the same copy logic previously embedded in sk_msg_recvmsg(). This provides the extension point for future splice support, where a different actor can be plugged in. Signed-off-by: Jiayuan Chen --- include/linux/skmsg.h | 12 ++++- net/core/skmsg.c | 24 +++++++-- net/ipv4/tcp_bpf.c | 123 ++++++++++++++++++++++++++++-------------- 3 files changed, 112 insertions(+), 47 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 19f4f253b4f9..d53756914b2f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -143,8 +143,16 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct i= ov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr = *msg, int len, int flags); -int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghd= r *msg, - int len, int flags, int *copied_from_self); +typedef int (*sk_msg_read_actor_t)(void *arg, struct page *page, + unsigned int offset, size_t len); +/* Core function for reading ingress_msg, dispatches to the given actor */ +int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, + size_t len, int flags, + sk_msg_read_actor_t actor, void *actor_arg, + int *copied_from_self); +int sk_msg_recvmsg_actor(void *arg, struct page *page, + unsigned int offset, size_t len); + bool sk_msg_is_readable(struct sock *sk); =20 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 byt= es) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2e26174c9919..6a906bfe3aa4 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,10 +409,12 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct = iov_iter *from, } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); =20 -int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghd= r *msg, - int len, int flags, int *copied_from_self) +/* Core function for reading ingress_msg, dispatches to the given actor */ +int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, + size_t len, int flags, + sk_msg_read_actor_t actor, void *actor_arg, + int *copied_from_self) { - struct iov_iter *iter =3D &msg->msg_iter; int peek =3D flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied =3D 0; @@ -440,7 +442,8 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *= psock, struct msghdr *msg if (copied + copy > len) copy =3D len - copied; if (copy) - copy =3D copy_page_to_iter(page, sge->offset, copy, iter); + copy =3D actor(actor_arg, page, + sge->offset, copy); if (!copy) { copied =3D copied ? copied : -EFAULT; goto out; @@ -495,12 +498,23 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock= *psock, struct msghdr *msg out: return copied; } +EXPORT_SYMBOL_GPL(sk_msg_read_core); + +int sk_msg_recvmsg_actor(void *arg, struct page *page, + unsigned int offset, size_t len) +{ + struct msghdr *msg =3D arg; + + return copy_page_to_iter(page, offset, len, &msg->msg_iter); +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg_actor); =20 /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr = *msg, int len, int flags) { - return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); + return sk_msg_read_core(sk, psock, len, flags, + sk_msg_recvmsg_actor, msg, NULL); } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); =20 diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index c449a044895e..606c2b079f86 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -218,31 +218,26 @@ static bool is_next_msg_fin(struct sk_psock *psock) return false; } =20 -static int tcp_bpf_recvmsg_parser(struct sock *sk, - struct msghdr *msg, - size_t len, - int flags, - int *addr_len) +/* + * __tcp_bpf_recvmsg_parser - inner recvmsg for strparser path + * + * Handles TCP seq tracking, pre-accept receive_queue draining, FIN detect= ion, + * and receive window updates. The actual data read is delegated to @actor. + * + * Caller must hold a psock ref. Socket lock is acquired/released internal= ly. + * Returns bytes read, or negative error. + */ +static int __tcp_bpf_recvmsg_parser(struct sock *sk, struct sk_psock *psoc= k, + sk_msg_read_actor_t actor, void *actor_arg, + size_t len, int flags) { int peek =3D flags & MSG_PEEK; - struct sk_psock *psock; - struct tcp_sock *tcp; + struct tcp_sock *tcp =3D tcp_sk(sk); int copied_from_self =3D 0; int copied =3D 0; u32 seq; =20 - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (!len) - return 0; - - psock =3D sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); - lock_sock(sk); - tcp =3D tcp_sk(sk); seq =3D tcp->copied_seq; /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't @@ -264,7 +259,8 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, } =20 msg_bytes_ready: - copied =3D __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self= ); + copied =3D sk_msg_read_core(sk, psock, len, flags, + actor, actor_arg, &copied_from_self); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. @@ -329,10 +325,34 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, =20 unlock: release_sock(sk); - sk_psock_put(sk, psock); return copied; } =20 +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock =3D sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + + ret =3D __tcp_bpf_recvmsg_parser(sk, psock, + sk_msg_recvmsg_actor, msg, len, flags); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) { bool slow; @@ -351,29 +371,25 @@ static int tcp_bpf_ioctl(struct sock *sk, int cmd, in= t *karg) return 0; } =20 -static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) +/* + * __tcp_bpf_recvmsg - inner recvmsg for non-parser (verdict only) path + * + * No TCP seq tracking needed (tcp_eat_skb handles it at verdict time). + * Returns bytes read, 0 if caller should fall back to the normal TCP + * read path (data on receive_queue but not in psock), or negative error. + * + * Caller must hold a psock ref. Socket lock is acquired/released internal= ly. + */ +static int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + sk_msg_read_actor_t actor, void *actor_arg, + size_t len, int flags) { - struct sk_psock *psock; int copied, ret; =20 - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (!len) - return 0; - - psock =3D sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue) && - sk_psock_queue_empty(psock)) { - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); - } lock_sock(sk); msg_bytes_ready: - copied =3D sk_msg_recvmsg(sk, psock, msg, len, flags); + copied =3D sk_msg_read_core(sk, psock, len, flags, + actor, actor_arg, NULL); if (!copied) { long timeo; int data; @@ -388,8 +404,7 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msgh= dr *msg, size_t len, if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return 0; } copied =3D -EAGAIN; } @@ -397,7 +412,35 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msg= hdr *msg, size_t len, =20 unlock: release_sock(sk); + return ret; +} + +static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len) +{ + struct sk_psock *psock; + int ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock =3D sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, flags, addr_len); + } + + ret =3D __tcp_bpf_recvmsg(sk, psock, sk_msg_recvmsg_actor, msg, + len, flags); sk_psock_put(sk, psock); + if (!ret) + return tcp_recvmsg(sk, msg, len, flags, addr_len); return ret; } =20 --=20 2.43.0 From nobody Wed Apr 15 16:29:16 2026 Received: from out-185.mta0.migadu.com (out-185.mta0.migadu.com [91.218.175.185]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8D92737F8D0 for ; Wed, 4 Mar 2026 06:38:52 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.185 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606334; cv=none; b=CiAlZxMLkDHrDQF+lJ74v3LEcZL9N7OXFFVv0k/cf8rBBNa9BGzduBWxYGKKoWKsTaywksCLfU4Dio1DhPwwhVjixtUiE97RJ8jm4c4cIB8xwGiGd3+nU/fhRMmYdbPv7ibS1T9/2DoooF44EDA3Axax0gdxWap/QPUi4ugxIlw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606334; c=relaxed/simple; bh=KvbS4fqeuLB3Xt9vJYeJeK3CkPVEUl3hZINO1cNAVoU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=oIE4GG1pMLOOFaeyOwqHVy6uj2oy523kvkXyYUGOrU1/uCFNiWHkLSGSHrcE9On62xrwDbi3ID69K3FGRSob9KcdiPyQfm0WSDVYDR6g2sz5CklHA+J7AphbxCiz5XWocmN3Rmh3+OG4nBTkjfMchxcV3N0ZSrVsW58zx5H8UQw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=d528Oi+o; arc=none smtp.client-ip=91.218.175.185 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="d528Oi+o" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606330; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=beX/DihSdzlZL0tDa9069FjqX9WJryfpvUqCEqHPrTc=; b=d528Oi+oRw4iNMJOtC1tQzAVg1+WrYyFLg0wp7dp0gpwc87orZljKi9wF828reEmGgCjy9 4PP4BypmWfGH6AruFo4FUj2wlJ1CYNC39HiNnqIDvD5EhO93IE5MmotJb5LR9bXvgPbJor X/dePstwvu6QYlIRYtdhRn04e1AgeA0= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Andrii Nakryiko , Eduard Zingerman , Alexei Starovoitov , Daniel Borkmann , Martin KaFai Lau , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 4/7] tcp_bpf: add splice_read support for sockmap Date: Wed, 4 Mar 2026 14:33:55 +0800 Message-ID: <20260304063643.14581-5-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Implement splice_read for sockmap using an always-copy approach. Each page from the psock ingress scatterlist is copied to a newly allocated page before being added to the pipe, avoiding lifetime and slab-page issues. Add sk_msg_splice_actor() which allocates a fresh page via alloc_page(), copies the data with memcpy(), then passes it to add_to_pipe(). The newly allocated page already has a refcount of 1, so no additional get_page() is needed. On add_to_pipe() failure, no explicit cleanup is needed since add_to_pipe() internally calls pipe_buf_release(). Also fix sk_msg_read_core() to update msg_rx->sg.start when the actor returns 0 mid-way through processing. The loop processes msg_rx->sg entries sequentially =E2=80=94 if the actor fails (e.g. pipe full for splice, or user buffer fault for recvmsg), prior entries may already be consumed with sge->length set to 0. Without advancing sg.start, subsequent calls would revisit these zero-length entries and return -EFAULT. This is especially common with the splice actor since the pipe has a small fixed capacity (16 slots), but theoretically affects recvmsg as well. Signed-off-by: Jiayuan Chen --- net/core/skmsg.c | 10 ++++++ net/ipv4/tcp_bpf.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6a906bfe3aa4..2fcbf8eaf4cf 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,6 +445,16 @@ int sk_msg_read_core(struct sock *sk, struct sk_psock = *psock, copy =3D actor(actor_arg, page, sge->offset, copy); if (!copy) { + /* + * The loop processes msg_rx->sg entries + * sequentially and prior entries may + * already be consumed. Advance sg.start + * so the next call resumes at the correct + * entry, otherwise it would revisit + * zero-length entries and return -EFAULT. + */ + if (!peek) + msg_rx->sg.start =3D i; copied =3D copied ? copied : -EFAULT; goto out; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 606c2b079f86..e85a27e32ea7 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -7,6 +7,7 @@ #include #include #include +#include =20 #include #include @@ -444,6 +445,85 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msg= hdr *msg, size_t len, return ret; } =20 +struct tcp_bpf_splice_ctx { + struct pipe_inode_info *pipe; +}; + +static int sk_msg_splice_actor(void *arg, struct page *page, + unsigned int offset, size_t len) +{ + struct tcp_bpf_splice_ctx *ctx =3D arg; + struct pipe_buffer buf =3D { + .ops =3D &nosteal_pipe_buf_ops, + }; + ssize_t ret; + + buf.page =3D alloc_page(GFP_KERNEL); + if (!buf.page) + return 0; + + memcpy(page_address(buf.page), page_address(page) + offset, len); + buf.offset =3D 0; + buf.len =3D len; + + /* + * add_to_pipe() calls pipe_buf_release() on failure, which + * handles put_page() via nosteal_pipe_buf_ops, so no explicit + * cleanup is needed here. + */ + ret =3D add_to_pipe(ctx->pipe, &buf); + if (ret <=3D 0) + return 0; + return ret; +} + +static ssize_t tcp_bpf_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct tcp_bpf_splice_ctx ctx =3D { .pipe =3D pipe }; + int bpf_flags =3D flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; + struct sock *sk =3D sock->sk; + struct sk_psock *psock; + int ret; + + psock =3D sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_splice_read(sock, ppos, pipe, len, flags); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_splice_read(sock, ppos, pipe, len, flags); + } + + ret =3D __tcp_bpf_recvmsg(sk, psock, sk_msg_splice_actor, &ctx, + len, bpf_flags); + sk_psock_put(sk, psock); + if (!ret) + return tcp_splice_read(sock, ppos, pipe, len, flags); + return ret; +} + +static ssize_t tcp_bpf_splice_read_parser(struct socket *sock, loff_t *ppo= s, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct tcp_bpf_splice_ctx ctx =3D { .pipe =3D pipe }; + int bpf_flags =3D flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; + struct sock *sk =3D sock->sk; + struct sk_psock *psock; + int ret; + + psock =3D sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_splice_read(sock, ppos, pipe, len, flags); + + ret =3D __tcp_bpf_recvmsg_parser(sk, psock, sk_msg_splice_actor, &ctx, + len, bpf_flags); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { @@ -671,6 +751,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TC= P_BPF_NUM_CFGS], prot[TCP_BPF_BASE].destroy =3D sock_map_destroy; prot[TCP_BPF_BASE].close =3D sock_map_close; prot[TCP_BPF_BASE].recvmsg =3D tcp_bpf_recvmsg; + prot[TCP_BPF_BASE].splice_read =3D tcp_bpf_splice_read; prot[TCP_BPF_BASE].sock_is_readable =3D sk_msg_is_readable; prot[TCP_BPF_BASE].ioctl =3D tcp_bpf_ioctl; =20 @@ -679,9 +760,11 @@ static void tcp_bpf_rebuild_protos(struct proto prot[T= CP_BPF_NUM_CFGS], =20 prot[TCP_BPF_RX] =3D prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg =3D tcp_bpf_recvmsg_parser; + prot[TCP_BPF_RX].splice_read =3D tcp_bpf_splice_read_parser; =20 prot[TCP_BPF_TXRX] =3D prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg =3D tcp_bpf_recvmsg_parser; + prot[TCP_BPF_TXRX].splice_read =3D tcp_bpf_splice_read_parser; } =20 static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) --=20 2.43.0 From nobody Wed Apr 15 16:29:16 2026 Received: from out-177.mta0.migadu.com (out-177.mta0.migadu.com [91.218.175.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3752B375F80 for ; Wed, 4 Mar 2026 06:39:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.177 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606358; cv=none; b=HnLO9x2kfVWxRyfaXUR3Ruue5PEJ4pDuZ0rnnp1sd/nXrvy0xtiSJ9A3JjuoqZN9rhNxNOA/QG0rg9Y5zWYrCVhpEmnN8+htWP0nq/DXdY8EZc9yuqv/gCdVLfVXa/m2FwFvYqLM8Ct5ZemozMmHUuwaUl2Jpou1hU5C7cU6Hic= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606358; c=relaxed/simple; bh=T3LZkyXNgXp7yXEoUgK0q7pKCngpRITB3P7KxJw8R3k=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=ns/QgTwfRdbY1udqXBH4UT9iO1kXJWd8zzCwhCFngHPkaL45sgPwScE0vXmuHx6cm7Pbti86A+WYl+jM1T+KPX0yTxeWbiS7F3LBfLuRUhhc9SSy6MLl7N0e8Pnon0jiqqO6I63azcFKMl2Ds/fEuSViFx6kAYDiKa8MXhwPidg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=XjbhhzTd; arc=none smtp.client-ip=91.218.175.177 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="XjbhhzTd" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606355; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=d5duGPLmRq29qJW6Ubc2JTj9Y024B43GziisGHsbdVM=; b=XjbhhzTdcorKHPYWcfmiX2oGlCoSL1wUr+CUtd8WlT2R3/xdlrugtM+MNIP1lrNiaqM+m5 jovxTnqsXHiZG+vYbETJvCkxEJ/3hEVv9aiNy4MsJ/xkQiw13drwVxcf3yTduSIFpQUpkw y74jmFQHAPmGee+k8P2fG3WOjoVJC5A= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 5/7] tcp_bpf: optimize splice_read with zero-copy for non-slab pages Date: Wed, 4 Mar 2026 14:33:56 +0800 Message-ID: <20260304063643.14581-6-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" The previous splice_read implementation copies all data through intermediate pages (alloc_page + memcpy). This is wasteful for skb fragment pages which are allocated from the page allocator and can be safely referenced via get_page(). Optimize by checking PageSlab() to distinguish between linear skb data (slab-backed) and fragment pages (page allocator-backed): - For slab pages (skb linear data): copy to a page fragment via sk_page_frag, matching what linear_to_page() does in the standard TCP splice path (skb_splice_bits). get_page() is invalid on slab pages so a copy is unavoidable here. - For non-slab pages (skb frags): use get_page() directly for true zero-copy, same as skb_splice_bits does for fragments. Both paths use nosteal_pipe_buf_ops. The sk_page_frag approach is more memory-efficient than alloc_page for small linear copies, as multiple copies can share a single page fragment. Benchmark results with rx-verdict-ingress mode (loopback, 8 CPUs): splice(2) + always-copy: ~2770 MB/s (before this patch) splice(2) + zero-copy: ~4270 MB/s (after this patch, +54%) read(2): ~4292 MB/s (baseline for reference) Signed-off-by: Jiayuan Chen --- net/ipv4/tcp_bpf.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index e85a27e32ea7..13506ba7672f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -447,6 +447,7 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msgh= dr *msg, size_t len, =20 struct tcp_bpf_splice_ctx { struct pipe_inode_info *pipe; + struct sock *sk; }; =20 static int sk_msg_splice_actor(void *arg, struct page *page, @@ -458,13 +459,33 @@ static int sk_msg_splice_actor(void *arg, struct page= *page, }; ssize_t ret; =20 - buf.page =3D alloc_page(GFP_KERNEL); - if (!buf.page) - return 0; + if (PageSlab(page)) { + /* + * skb linear data is backed by slab memory where + * get_page() is invalid. Copy to a page fragment from + * the socket's page allocator, matching what + * linear_to_page() does in the standard TCP splice + * path (skb_splice_bits). + */ + struct page_frag *pfrag =3D sk_page_frag(ctx->sk); + + if (!sk_page_frag_refill(ctx->sk, pfrag)) + return 0; =20 - memcpy(page_address(buf.page), page_address(page) + offset, len); - buf.offset =3D 0; - buf.len =3D len; + len =3D min_t(size_t, len, pfrag->size - pfrag->offset); + memcpy(page_address(pfrag->page) + pfrag->offset, + page_address(page) + offset, len); + buf.page =3D pfrag->page; + buf.offset =3D pfrag->offset; + buf.len =3D len; + pfrag->offset +=3D len; + } else { + buf.page =3D page; + buf.offset =3D offset; + buf.len =3D len; + } + + get_page(buf.page); =20 /* * add_to_pipe() calls pipe_buf_release() on failure, which @@ -481,9 +502,9 @@ static ssize_t tcp_bpf_splice_read(struct socket *sock,= loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct tcp_bpf_splice_ctx ctx =3D { .pipe =3D pipe }; - int bpf_flags =3D flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sock *sk =3D sock->sk; + struct tcp_bpf_splice_ctx ctx =3D { .pipe =3D pipe, .sk =3D sk }; + int bpf_flags =3D flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sk_psock *psock; int ret; =20 @@ -508,9 +529,9 @@ static ssize_t tcp_bpf_splice_read_parser(struct socket= *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct tcp_bpf_splice_ctx ctx =3D { .pipe =3D pipe }; - int bpf_flags =3D flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sock *sk =3D sock->sk; + struct tcp_bpf_splice_ctx ctx =3D { .pipe =3D pipe, .sk =3D sk }; + int bpf_flags =3D flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sk_psock *psock; int ret; =20 --=20 2.43.0 From nobody Wed Apr 15 16:29:16 2026 Received: from out-185.mta0.migadu.com (out-185.mta0.migadu.com [91.218.175.185]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4109B318ED9 for ; Wed, 4 Mar 2026 06:39:45 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.185 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606386; cv=none; b=U0aajMozh/8wh76A7sKxx7gda3w4pYVANTLzit3TVqBp1k92jbGXW+GNj1rywZ0Et7Z7qZSvxDsn9Hfx4HBiEl3+X3x411QjTuuLs4QEe1sRZmcdTEjVkRf8Z4JmKLN5ydpLr/VwFbzymQi2shz9czqRr7vmXJYUiCdD0ZoO4Tk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606386; c=relaxed/simple; bh=6EK4kevhLvn8MwpRl+sYXsC6m0CP/64/dLc/zPKgHGY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=umoFvw4zqlPMpoVssduSleUUMptORryspeByu1A/kgz7424iF/IzrzLlRD68Zx4fT6Cqieh7yJ9PveZCesoofo2IDFjALko36+tyYszi0ql+cdeEzpksSgRQtifI8kQ0NcVGQwLJJPfX30Mt3MfQCivg/s2srJmQ698F8gmzJJ0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=W555oO6o; arc=none smtp.client-ip=91.218.175.185 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="W555oO6o" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606382; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=t+aNyjWwHYPmClz3BrgloLW8wipTLTFpy6aXnO+owLw=; b=W555oO6oGOGBLdPzbE4pSIegHFAu7S8ANwX9rU83zXyQQxmJ0ZPq80VTgxJzbbBoNnW2ar uF38wV//9YODNivW03OXFThuyYoiSmEwRdMB5SlZNoQBADbPWoKgpHRG51cCdd5C9KxJus FR4MfYGdWl5p2JPddgMamDnU31ojWew= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Eduard Zingerman , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 6/7] selftests/bpf: add splice_read tests for sockmap Date: Wed, 4 Mar 2026 14:33:57 +0800 Message-ID: <20260304063643.14581-7-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" Add splice_read coverage to sockmap_basic and sockmap_strp selftests. Each test suite now runs twice: once with normal recv_timeout() and once with splice-based reads, verifying that data read via splice(2) through a pipe produces identical results. A recv_timeout_with_splice() helper is added to sockmap_helpers.h that creates a temporary pipe, splices data from the socket into the pipe, then reads from the pipe into the user buffer. MSG_PEEK calls fall back to native recv since splice does not support peek. Non-TCP sockets also fall back to native recv. The splice subtests are distinguished by appending " splice" to each subtest name via a test__start_subtest macro override. ./test_progs -a sockmap_* ... Summary: 5/830 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Jiayuan Chen --- .../selftests/bpf/prog_tests/sockmap_basic.c | 28 ++++++++- .../bpf/prog_tests/sockmap_helpers.h | 62 +++++++++++++++++++ .../selftests/bpf/prog_tests/sockmap_strp.c | 28 ++++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools= /testing/selftests/bpf/prog_tests/sockmap_basic.c index dd3c757859f6..ea0b49ec9a93 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -18,6 +18,23 @@ =20 #include "sockmap_helpers.h" =20 +static bool use_splice; + +static bool __start_subtest(const char *name) +{ + if (!use_splice) + return (test__start_subtest)(name); + + char buf[MAX_TEST_NAME]; + + snprintf(buf, sizeof(buf), "%s splice", name); + return (test__start_subtest)(buf); +} + +#define test__start_subtest(name) __start_subtest(name) +#define recv_timeout(fd, buf, len, flags, timeout) \ + recv_timeout_with_splice(fd, buf, len, flags, timeout, use_splice) + #define TCP_REPAIR 19 /* TCP sock is under repair right now */ =20 #define TCP_REPAIR_ON 1 @@ -1314,7 +1331,7 @@ static void test_sockmap_multi_channels(int sotype) test_sockmap_pass_prog__destroy(skel); } =20 -void test_sockmap_basic(void) +static void __test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); @@ -1391,3 +1408,12 @@ void test_sockmap_basic(void) if (test__start_subtest("sockmap udp multi channels")) test_sockmap_multi_channels(SOCK_DGRAM); } + +void test_sockmap_basic(void) +{ + use_splice =3D false; + __test_sockmap_basic(); + + use_splice =3D true; + __test_sockmap_basic(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/too= ls/testing/selftests/bpf/prog_tests/sockmap_helpers.h index d815efac52fd..1f0da657243f 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -80,4 +80,66 @@ static inline int add_to_sockmap(int mapfd, int fd1, int= fd2) return xbpf_map_update_elem(mapfd, &u32(1), &u64(fd2), BPF_NOEXIST); } =20 +static inline ssize_t recv_timeout_with_splice(int fd, void *buf, size_t l= en, + int flags, + unsigned int timeout_sec, + bool do_splice) +{ + ssize_t total =3D 0; + int pipefd[2]; + int fl; + + int sotype, protocol; + socklen_t optlen =3D sizeof(sotype); + + if (!do_splice || (flags & MSG_PEEK) || + getsockopt(fd, SOL_SOCKET, SO_TYPE, &sotype, &optlen) || + sotype !=3D SOCK_STREAM || + getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &optlen) || + protocol !=3D IPPROTO_TCP) + return recv_timeout(fd, buf, len, flags, timeout_sec); + + if (poll_read(fd, timeout_sec)) + return -1; + + if (pipe(pipefd) < 0) + return -1; + + /* + * tcp_splice_read() only checks sock->file->f_flags for + * O_NONBLOCK, ignoring SPLICE_F_NONBLOCK for the socket + * side timeout. Set O_NONBLOCK on the fd so the loop won't + * block forever when no more data is available. + */ + fl =3D fcntl(fd, F_GETFL); + fcntl(fd, F_SETFL, fl | O_NONBLOCK); + + /* + * Pipe has limited buffer slots (default 16), so a single + * splice may not transfer all requested bytes. Loop until + * we've read enough or no more data is available. + */ + while (total < (ssize_t)len) { + ssize_t spliced, n; + + spliced =3D splice(fd, NULL, pipefd[1], NULL, len - total, + SPLICE_F_NONBLOCK); + if (spliced <=3D 0) + break; + + n =3D read(pipefd[0], buf + total, spliced); + if (n <=3D 0) + break; + + total +=3D n; + } + + fcntl(fd, F_SETFL, fl); + + close(pipefd[0]); + close(pipefd[1]); + + return total > 0 ? total : -1; +} + #endif // __SOCKMAP_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c b/tools/= testing/selftests/bpf/prog_tests/sockmap_strp.c index 621b3b71888e..2226399eee0d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c @@ -6,6 +6,23 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_strp.skel.h" =20 +static bool use_splice; + +static bool __start_subtest(const char *name) +{ + if (!use_splice) + return (test__start_subtest)(name); + + char buf[MAX_TEST_NAME]; + + snprintf(buf, sizeof(buf), "%s splice", name); + return (test__start_subtest)(buf); +} + +#define test__start_subtest(name) __start_subtest(name) +#define recv_timeout(fd, buf, len, flags, timeout) \ + recv_timeout_with_splice(fd, buf, len, flags, timeout, use_splice) + #define STRP_PKT_HEAD_LEN 4 #define STRP_PKT_BODY_LEN 6 #define STRP_PKT_FULL_LEN (STRP_PKT_HEAD_LEN + STRP_PKT_BODY_LEN) @@ -431,7 +448,7 @@ static void test_sockmap_strp_verdict(int family, int s= otype) test_sockmap_strp__destroy(strp); } =20 -void test_sockmap_strp(void) +static void __test_sockmap_strp(void) { if (test__start_subtest("sockmap strp tcp pass")) test_sockmap_strp_pass(AF_INET, SOCK_STREAM, false); @@ -452,3 +469,12 @@ void test_sockmap_strp(void) if (test__start_subtest("sockmap strp tcp dispatch")) test_sockmap_strp_dispatch_pkt(AF_INET, SOCK_STREAM); } + +void test_sockmap_strp(void) +{ + use_splice =3D false; + __test_sockmap_strp(); + + use_splice =3D true; + __test_sockmap_strp(); +} --=20 2.43.0 From nobody Wed Apr 15 16:29:16 2026 Received: from out-181.mta0.migadu.com (out-181.mta0.migadu.com [91.218.175.181]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 150163101A0 for ; Wed, 4 Mar 2026 06:40:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.181 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606410; cv=none; b=TFm7HD19PPIziUK+xHoihj09BWqLmITTyd4HHflW2VB/hNwYc2ovAFoBigs7+nOUdfpyogIhi7ftvHaX11y5Fbfhgzb34vpmEnKXS7UvmI0WIcN5zBZJvyxEGhzEgGY3yT0SBRGyiF/b8hC6PoH4wyTbZPC8gP6ygvZBeyU3ZmQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772606410; c=relaxed/simple; bh=xb+WBcf3xKm11jZGInNI3x0/hLtFqnRsVRRviuPxg3c=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=L8BJU5fgXTci/8pq1+5Y/ZOror0ctFoz6AFk349Rmv7ptWzs7DW2QQwj8FIb7dmyHLRLN/Put1eBWwC0tMJ1YYaOgc7I0bSzaJRk4OFbgLw7vd+NaJ5q5aWgCRuukGs/8UrlicaRazLZKEeY2UC+2FECg3zZVPXaaoRN5bFrLrs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=q4FEJERc; arc=none smtp.client-ip=91.218.175.181 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="q4FEJERc" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1772606406; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ejtUtr9rPgE5dpjPFrLD7UpC6yS26+UMUtgdrNoTYJI=; b=q4FEJERc+CBqIxLEbT3FTDIYP1/5pBRUeCveps8tRwH+1xukW2TNOlVzks6XzqS/UpBlUs SdATJsKXwnYjyURyZghdvSW37n0GpjhOTXcusTl20iRRcjjVhnukzD74F4SKXEuvV13fhp rxGvsnrRfNK3n/JGs9K43f1qmJvEe3Y= From: Jiayuan Chen To: bpf@vger.kernel.org, john.fastabend@gmail.com, jakub@cloudflare.com Cc: Jiayuan Chen , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Simon Horman , Kuniyuki Iwashima , Willem de Bruijn , David Ahern , Neal Cardwell , Andrii Nakryiko , Eduard Zingerman , Alexei Starovoitov , Daniel Borkmann , Martin KaFai Lau , Song Liu , Yonghong Song , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Shuah Khan , Jiapeng Chong , Ihor Solodrai , Michal Luczaj , netdev@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH bpf-next v1 7/7] selftests/bpf: add splice option to sockmap benchmark Date: Wed, 4 Mar 2026 14:33:58 +0800 Message-ID: <20260304063643.14581-8-jiayuan.chen@linux.dev> In-Reply-To: <20260304063643.14581-1-jiayuan.chen@linux.dev> References: <20260304063643.14581-1-jiayuan.chen@linux.dev> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" Add --splice option to bench_sockmap that uses splice(2) instead of read(2) in the consumer path. A global pipe is created once during setup and reused across iterations to avoid per-call pipe creation overhead. When --splice is enabled, the consumer splices data from the socket into the pipe, then reads from the pipe into the user buffer. The socket is set to O_NONBLOCK to prevent tcp_splice_read() from blocking indefinitely, as it only checks sock->file->f_flags for non-blocking mode, ignoring SPLICE_F_NONBLOCK. Also increase SO_RCVBUF to 16MB to avoid sk_psock_backlog being throttled by the default sk_rcvbuf limit, and add --verify option to optionally enable data correctness checking (disabled by default for benchmark accuracy). Benchmark results with rx-verdict-ingress mode (loopback, 8 CPUs): read(2): ~4292 MB/s splice(2) + zero-copy: ~4270 MB/s splice(2) + always-copy: ~2770 MB/s Zero-copy splice achieves near-parity with read(2), while the always-copy fallback is ~35% slower. Usage: # Steer softirqs to CPU 7 to avoid contending with the producer CPU echo 80 > /sys/class/net/lo/queues/rx-0/rps_cpus # Raise the receive buffer ceiling so the benchmark can set 16MB rcvbuf sysctl -w net.core.rmem_max=3D16777216 # Run the benchmark ./bench sockmap --rx-verdict-ingress --splice -c 2 -p 1 -a -d 30 Signed-off-by: Jiayuan Chen --- .../selftests/bpf/benchs/bench_sockmap.c | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/tes= ting/selftests/bpf/benchs/bench_sockmap.c index cfc072aa7fff..ffcf5ad8cafa 100644 --- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include #include "bench.h" #include "bench_sockmap_prog.skel.h" @@ -46,6 +49,8 @@ enum SOCKMAP_ARG_FLAG { ARG_CTL_RX_STRP, ARG_CONSUMER_DELAY_TIME, ARG_PRODUCER_DURATION, + ARG_CTL_SPLICE, + ARG_CTL_VERIFY, }; =20 #define TXMODE_NORMAL() \ @@ -110,6 +115,9 @@ static struct socmap_ctx { int delay_consumer; int prod_run_time; int strp_size; + bool use_splice; + bool verify; + int pipefd[2]; } ctx =3D { .prod_send =3D 0, .user_read =3D 0, @@ -119,6 +127,9 @@ static struct socmap_ctx { .delay_consumer =3D 0, .prod_run_time =3D 0, .strp_size =3D 0, + .use_splice =3D false, + .verify =3D false, + .pipefd =3D {-1, -1}, }; =20 static void bench_sockmap_prog_destroy(void) @@ -130,6 +141,11 @@ static void bench_sockmap_prog_destroy(void) close(ctx.fds[i]); } =20 + if (ctx.pipefd[0] >=3D 0) + close(ctx.pipefd[0]); + if (ctx.pipefd[1] >=3D 0) + close(ctx.pipefd[1]); + bench_sockmap_prog__destroy(ctx.skel); } =20 @@ -320,6 +336,7 @@ static int setup_tx_sockmap(void) =20 static void setup(void) { + int rcvbuf =3D 16 * 1024 * 1024; int err; =20 ctx.skel =3D bench_sockmap_prog__open_and_load(); @@ -350,6 +367,18 @@ static void setup(void) goto err; } =20 + if (ctx.use_splice) { + if (pipe(ctx.pipefd)) { + fprintf(stderr, "pipe error:%d\n", errno); + goto err; + } + } + + setsockopt(ctx.c2, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)); + + if (ctx.use_splice) + set_non_block(ctx.c2, true); + return; =20 err: @@ -368,6 +397,8 @@ static void measure(struct bench_res *res) =20 static void verify_data(int *check_pos, char *buf, int rcv) { + if (!ctx.verify) + return; for (int i =3D 0 ; i < rcv; i++) { if (buf[i] !=3D snd_data[(*check_pos) % DATA_REPEAT_SIZE]) { fprintf(stderr, "verify data fail"); @@ -388,6 +419,9 @@ static void *consumer(void *input) char *buf =3D malloc(recv_buf_size); int delay_read =3D ctx.delay_consumer; =20 + printf("cons[%d] started, tid=3D%ld cpu=3D%d\n", + tid, syscall(SYS_gettid), sched_getcpu()); + if (!buf) { fprintf(stderr, "fail to init read buffer"); return NULL; @@ -419,7 +453,15 @@ static void *consumer(void *input) } /* read real endpoint by consumer 0 */ atomic_inc(&ctx.read_calls); - rcv =3D read(ctx.c2, buf, recv_buf_size); + if (ctx.use_splice) { + rcv =3D splice(ctx.c2, NULL, ctx.pipefd[1], + NULL, recv_buf_size, + SPLICE_F_NONBLOCK); + if (rcv > 0) + rcv =3D read(ctx.pipefd[0], buf, rcv); + } else { + rcv =3D read(ctx.c2, buf, recv_buf_size); + } if (rcv < 0 && errno !=3D EAGAIN) { fprintf(stderr, "%s fail to read c2 %d\n", __func__, errno); return NULL; @@ -440,6 +482,9 @@ static void *producer(void *input) int target; FILE *file; =20 + printf("prod started, tid=3D%ld cpu=3D%d\n", + syscall(SYS_gettid), sched_getcpu()); + file =3D tmpfile(); if (!file) { fprintf(stderr, "create file for sendfile"); @@ -554,6 +599,10 @@ static const struct argp_option opts[] =3D { "delay consumer start"}, { "producer-duration", ARG_PRODUCER_DURATION, "SEC", 0, "producer duration"}, + { "splice", ARG_CTL_SPLICE, NULL, 0, + "use splice instead of read for consumer"}, + { "verify", ARG_CTL_VERIFY, NULL, 0, + "verify received data correctness"}, {}, }; =20 @@ -572,6 +621,12 @@ static error_t parse_arg(int key, char *arg, struct ar= gp_state *state) case ARG_CTL_RX_STRP: ctx.strp_size =3D strtol(arg, NULL, 10); break; + case ARG_CTL_SPLICE: + ctx.use_splice =3D true; + break; + case ARG_CTL_VERIFY: + ctx.verify =3D true; + break; default: return ARGP_ERR_UNKNOWN; } --=20 2.43.0