[PATCH bpf-next v3 2/3] bpf, sockmap: Fix FIONREAD for sockmap

Jiayuan Chen posted 3 patches 1 week, 3 days ago
There is a newer version of this series
[PATCH bpf-next v3 2/3] bpf, sockmap: Fix FIONREAD for sockmap
Posted by Jiayuan Chen 1 week, 3 days ago
A socket using sockmap has its own independent receive queue: ingress_msg.
This queue may contain data from its own protocol stack or from other
sockets.

Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to
calculate FIONREAD is not enough.

This patch adds a new ingress_size field in the psock structure to record
the data length in ingress_msg. Additionally, we implement new ioctl
interfaces for TCP and UDP to intercept FIONREAD operations. While Unix
and VSOCK also support sockmap and have similar FIONREAD calculation
issues, fixing them would require more extensive changes
(please let me know if modifications are needed). I believe it's not
appropriate to include those changes under this fix patch.

Previous work by John Fastabend made some efforts towards FIONREAD support:
commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq")
Although the current patch is based on the previous work by John Fastabend,
it is acceptable for our Fixes tag to point to the same commit.

                                                     FD1:read()
                                                     --  FD1->copied_seq++
                                                         |  [read data]
                                                         |
                                [enqueue data]           v
                  [sockmap]     -> ingress to self ->  ingress_msg queue
FD1 native stack  ------>                                 ^
-- FD1->rcv_nxt++               -> redirect to other      | [enqueue data]
                                       |                  |
                                       |             ingress to FD1
                                       v                  ^
                                      ...                 |  [sockmap]
                                                     FD2 native stack

Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
 include/linux/skmsg.h | 46 ++++++++++++++++++++++++++++++++++++++++++-
 net/core/skmsg.c      |  3 +++
 net/ipv4/tcp_bpf.c    | 21 ++++++++++++++++++++
 net/ipv4/udp_bpf.c    | 25 +++++++++++++++++++----
 4 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 0323a2b6cf5e..49069b519499 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -97,6 +97,7 @@ struct sk_psock {
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
 	spinlock_t			ingress_lock;
+	ssize_t				ingress_size;
 	unsigned long			state;
 	struct list_head		link;
 	spinlock_t			link_lock;
@@ -321,6 +322,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
+{
+	return psock->ingress_size;
+}
+
+static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
+{
+	psock->ingress_size += diff;
+}
+
 static inline bool sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
@@ -329,6 +340,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 		list_add_tail(&msg->list, &psock->ingress_msg);
+		sk_psock_inc_msg_size(psock, msg->sg.size);
 		ret = true;
 	} else {
 		sk_msg_free(psock->sk, msg);
@@ -345,8 +357,10 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
 
 	spin_lock_bh(&psock->ingress_lock);
 	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
-	if (msg)
+	if (msg) {
 		list_del(&msg->list);
+		sk_psock_inc_msg_size(psock, -msg->sg.size);
+	}
 	spin_unlock_bh(&psock->ingress_lock);
 	return msg;
 }
@@ -523,6 +537,36 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
+static inline ssize_t sk_psock_msg_inq(struct sock *sk)
+{
+	struct sk_psock *psock;
+	ssize_t inq = 0;
+
+	psock = sk_psock_get(sk);
+	if (likely(psock)) {
+		inq = sk_psock_get_msg_size(psock);
+		sk_psock_put(sk, psock);
+	}
+	return inq;
+}
+
+/* for udp */
+static inline ssize_t sk_msg_first_length(struct sock *sk)
+{
+	struct sk_psock *psock;
+	struct sk_msg *msg;
+	ssize_t inq = 0;
+
+	psock = sk_psock_get(sk);
+	if (likely(psock)) {
+		msg = sk_psock_peek_msg(psock);
+		if (msg)
+			inq = msg->sg.size;
+		sk_psock_put(sk, psock);
+	}
+	return inq;
+}
+
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d73e03f7713a..c959d52a62b2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -455,6 +455,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
 					atomic_sub(copy, &sk->sk_rmem_alloc);
 				}
 				msg_rx->sg.size -= copy;
+				sk_psock_inc_msg_size(psock, -copy);
 
 				if (!sge->length) {
 					sk_msg_iter_var_next(i);
@@ -819,9 +820,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 		list_del(&msg->list);
 		if (!msg->skb)
 			atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
+		sk_psock_inc_msg_size(psock, -((ssize_t)msg->sg.size));
 		sk_msg_free(psock->sk, msg);
 		kfree(msg);
 	}
+	WARN_ON_ONCE(psock->ingress_size);
 }
 
 static void __sk_psock_zap_ingress(struct sk_psock *psock)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 6332fc36ffe6..a9c758868f13 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,6 +10,7 @@
 
 #include <net/inet_common.h>
 #include <net/tls.h>
+#include <asm/ioctls.h>
 
 void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -332,6 +333,25 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
 	return copied;
 }
 
+static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
+{
+	bool slow;
+
+	/* we only care about FIONREAD */
+	if (cmd != SIOCINQ)
+		return tcp_ioctl(sk, cmd, karg);
+
+	/* works similar as tcp_ioctl */
+	if (sk->sk_state == TCP_LISTEN)
+		return -EINVAL;
+
+	slow = lock_sock_fast(sk);
+	*karg = sk_psock_msg_inq(sk);
+	unlock_sock_fast(sk, slow);
+
+	return 0;
+}
+
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int flags, int *addr_len)
 {
@@ -610,6 +630,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
 	prot[TCP_BPF_BASE].close		= sock_map_close;
 	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
 	prot[TCP_BPF_BASE].sock_is_readable	= sk_msg_is_readable;
+	prot[TCP_BPF_BASE].ioctl		= tcp_bpf_ioctl;
 
 	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
 	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 0735d820e413..cc1156aef14d 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -5,6 +5,7 @@
 #include <net/sock.h>
 #include <net/udp.h>
 #include <net/inet_common.h>
+#include <asm/ioctls.h>
 
 #include "udp_impl.h"
 
@@ -111,12 +112,28 @@ enum {
 static DEFINE_SPINLOCK(udpv6_prot_lock);
 static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
 
+static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
+{
+	/* we only care about FIONREAD */
+	if (cmd != SIOCINQ)
+		return udp_ioctl(sk, cmd, karg);
+
+	/* works similar as udp_ioctl.
+	 * man udp(7): "FIONREAD (SIOCINQ): Returns the size of the next
+	 * pending datagram in the integer in bytes, or 0 when no datagram
+	 * is pending."
+	 */
+	*karg = sk_msg_first_length(sk);
+	return 0;
+}
+
 static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
 {
-	*prot        = *base;
-	prot->close  = sock_map_close;
-	prot->recvmsg = udp_bpf_recvmsg;
-	prot->sock_is_readable = sk_msg_is_readable;
+	*prot			= *base;
+	prot->close		= sock_map_close;
+	prot->recvmsg		= udp_bpf_recvmsg;
+	prot->sock_is_readable	= sk_msg_is_readable;
+	prot->ioctl		= udp_bpf_ioctl;
 }
 
 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
-- 
2.43.0
Re: [PATCH bpf-next v3 2/3] bpf, sockmap: Fix FIONREAD for sockmap
Posted by bot+bpf-ci@kernel.org 1 week, 3 days ago
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 49847888c..87772846f 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h

[ ... ]

> +static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
> +{
> +	return psock->ingress_size;
> +}

[ ... ]

> +static inline ssize_t sk_psock_msg_inq(struct sock *sk)
> +{
> +	struct sk_psock *psock;
> +	ssize_t inq = 0;
> +
> +	psock = sk_psock_get(sk);
> +	if (likely(psock)) {
> +		inq = sk_psock_get_msg_size(psock);
> +		sk_psock_put(sk, psock);
> +	}
> +	return inq;
> +}

Does this create a data race on ingress_size? The field is read here
without holding ingress_lock, but all modifications happen under that
lock in sk_psock_queue_msg(), sk_psock_dequeue_msg(), and
sk_msg_recvmsg().

When ioctl(FIONREAD) executes concurrently with recv(), one thread
reads ingress_size locklessly in sk_psock_msg_inq() while another
thread modifies it under lock in sk_msg_recvmsg(). On 32-bit systems
with 32-bit ssize_t, this could produce torn reads.

Would READ_ONCE/WRITE_ONCE be needed for the lockless access?

> +/* for udp */
> +static inline ssize_t sk_msg_first_length(struct sock *sk)
> +{
> +	struct sk_psock *psock;
> +	struct sk_msg *msg;
> +	ssize_t inq = 0;
> +
> +	psock = sk_psock_get(sk);
> +	if (likely(psock)) {
> +		msg = sk_psock_peek_msg(psock);
> +		if (msg)
> +			inq = msg->sg.size;
> +		sk_psock_put(sk, psock);
> +	}
> +	return inq;
> +}

Can msg be freed between sk_psock_peek_msg() and reading msg->sg.size?
The peek function releases ingress_lock before returning the pointer.
If another thread in sk_msg_recvmsg() fully consumes the message after
the peek but before the size read, won't sk_psock_dequeue_msg() free
the message, leading to a use-after-free when we read msg->sg.size?

The call chain would be:
  Thread A: udp_bpf_ioctl()->sk_msg_first_length()->sk_psock_peek_msg()
            returns msg, releases lock, then reads msg->sg.size
  Thread B: udp_bpf_recvmsg()->sk_msg_recvmsg() consumes msg fully,
            calls sk_psock_dequeue_msg() which frees msg


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19573628679