'sk->copied_seq' was updated in the tcp_eat_skb() function when the
action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS,
the update logic for 'sk->copied_seq' was moved to
tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature.
It works for a single stream_verdict scenario, as it also modified
'sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb'
to remove updating 'sk->copied_seq'.
However, for programs where both stream_parser and stream_verdict are
active (the strparser use case), tcp_read_sock() is used instead of
tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock).
tcp_read_sock() still updates 'sk->copied_seq', leading to duplicated
updates.
In summary, for strparser + SK_PASS, copied_seq is redundantly calculated
in both tcp_read_sock() and tcp_bpf_recvmsg_parser().
The issue causes incorrect copied_seq calculations, which prevent
correct data reads from the recv() interface in user-land.
We do not want to add new proto_ops to implement a new version of
tcp_read_sock, as this would introduce code complexity [1].
Instead, we add a new callback to strparser for a customized read
operation; as a wrapper function, it also provides an abstraction for
using psock.
[1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com
Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq")
Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jiayuan Chen <mrpre@163.com>
---
include/linux/skmsg.h | 4 ++++
include/net/tcp.h | 3 +++
net/core/skmsg.c | 23 +++++++++++++++++++++
net/ipv4/tcp.c | 29 +++++++++++++++++++++-----
net/ipv4/tcp_bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 101 insertions(+), 5 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 2cbe0c22a32f..c9343eeac8b3 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -91,6 +91,10 @@ struct sk_psock {
struct sk_psock_progs progs;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
struct strparser strp;
+ int (*read_sock)(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor);
+ u32 copied_seq;
+ u32 ingress_bytes;
#endif
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e9b37b76e894..88e55e62023c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *);
/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
void tcp_read_done(struct sock *sk, size_t len);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 61f3f3d4e528..6695659d3447 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
return num_sge;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
@@ -1092,6 +1095,25 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
return err;
}
+static int sk_psock_strp_read_sock(struct strparser *strp,
+ read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sock *sk = strp->sk;
+ struct socket *sock = sk->sk_socket;
+ struct sk_psock *psock;
+ int rv = 0;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock && psock->read_sock))
+ rv = psock->read_sock(sk, desc, recv_actor);
+ else if (sock && sock->ops && sock->ops->read_sock)
+ rv = sock->ops->read_sock(sk, desc, recv_actor);
+ rcu_read_unlock();
+ return rv;
+}
+
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
@@ -1136,6 +1158,7 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
static const struct strp_callbacks cb = {
.rcv_msg = sk_psock_strp_read,
+ .read_sock = sk_psock_strp_read_sock,
.read_sock_done = sk_psock_strp_read_done,
.parse_msg = sk_psock_strp_parse,
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d704bda6c41..285678d8ce07 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1565,12 +1565,13 @@ EXPORT_SYMBOL(tcp_recv_skb);
* or for 'peeking' the socket using this routine
* (although both would be easy to implement).
*/
-int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- u32 seq = tp->copied_seq;
+ u32 seq = *copied_seq;
u32 offset;
int copied = 0;
@@ -1624,9 +1625,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
- WRITE_ONCE(tp->copied_seq, seq);
+ WRITE_ONCE(*copied_seq, seq);
}
- WRITE_ONCE(tp->copied_seq, seq);
+ WRITE_ONCE(*copied_seq, seq);
+
+ if (noack)
+ goto out;
tcp_rcv_space_adjust(sk);
@@ -1635,10 +1639,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
}
+out:
return copied;
}
+
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, false,
+ &tcp_sk(sk)->copied_seq);
+}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
+}
+
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct sk_buff *skb;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 47f65b1b70ca..6dcde3506a9b 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -646,6 +646,47 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+static int tcp_bpf_strp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sk_psock *psock;
+ struct tcp_sock *tp;
+ int copied = 0;
+
+ tp = tcp_sk(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (WARN_ON(!psock)) {
+ desc->error = -EINVAL;
+ goto out;
+ }
+
+ psock->ingress_bytes = 0;
+ /* We could easily add copied_seq and noack into desc then call
+ * ops->read_sock without calling symbol directly. But unfortunately
+ * most descriptors used by other modules are not inited with zero.
+ * Also it not work by replacing ops->read_sock without introducing
+ * new ops as ops itself is located in rodata segment.
+ */
+ copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
+ &psock->copied_seq);
+ if (copied < 0)
+ goto out;
+ /* recv_actor may redirect skb to another socket(SK_REDIRECT) or
+ * just put skb into ingress queue of current socket(SK_PASS).
+ * For SK_REDIRECT, we need 'ack' the frame immediately but for
+ * SK_PASS, the 'ack' was delay to tcp_bpf_recvmsg_parser()
+ */
+ tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
+out:
+ rcu_read_unlock();
+ return copied;
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
@@ -681,6 +722,12 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
/* Pairs with lockless read in sk_clone_lock() */
sock_replace_proto(sk, &tcp_bpf_prots[family][config]);
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ if (psock->progs.stream_parser && psock->progs.stream_verdict) {
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ psock->read_sock = tcp_bpf_strp_read_sock;
+ }
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
--
2.43.5
On Thu, Jan 16, 2025 at 10:05 PM +08, Jiayuan Chen wrote: > 'sk->copied_seq' was updated in the tcp_eat_skb() function when the > action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, > the update logic for 'sk->copied_seq' was moved to > tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature. > > It works for a single stream_verdict scenario, as it also modified > 'sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb' > to remove updating 'sk->copied_seq'. > > However, for programs where both stream_parser and stream_verdict are > active(strparser purpose), tcp_read_sock() was used instead of > tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock) > tcp_read_sock() now still update 'sk->copied_seq', leading to duplicated > updates. > > In summary, for strparser + SK_PASS, copied_seq is redundantly calculated > in both tcp_read_sock() and tcp_bpf_recvmsg_parser(). > > The issue causes incorrect copied_seq calculations, which prevent > correct data reads from the recv() interface in user-land. > > We do not want to add new proto_ops to implement a new version of > tcp_read_sock, as this would introduce code complexity [1]. > > We add new callback for strparser for customized read operation, also as > a wrapper function it provides abstraction use psock. > > [1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com > Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com> > Signed-off-by: Jiayuan Chen <mrpre@163.com> > --- [...] > diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c > index 47f65b1b70ca..6dcde3506a9b 100644 > --- a/net/ipv4/tcp_bpf.c > +++ b/net/ipv4/tcp_bpf.c > @@ -646,6 +646,47 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) > ops->sendmsg == tcp_sendmsg ? 
0 : -ENOTSUPP; > } > > +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) > +static int tcp_bpf_strp_read_sock(struct sock *sk, read_descriptor_t *desc, > + sk_read_actor_t recv_actor) > +{ > + struct sk_psock *psock; > + struct tcp_sock *tp; > + int copied = 0; > + > + tp = tcp_sk(sk); > + rcu_read_lock(); > + psock = sk_psock(sk); > + if (WARN_ON(!psock)) { > + desc->error = -EINVAL; > + goto out; > + } > + > + psock->ingress_bytes = 0; > + /* We could easily add copied_seq and noack into desc then call > + * ops->read_sock without calling symbol directly. But unfortunately > + * most descriptors used by other modules are not inited with zero. > + * Also it not work by replacing ops->read_sock without introducing > + * new ops as ops itself is located in rodata segment. > + */ > + copied = tcp_read_sock_noack(sk, desc, recv_actor, true, > + &psock->copied_seq); > + if (copied < 0) > + goto out; > + /* recv_actor may redirect skb to another socket(SK_REDIRECT) or > + * just put skb into ingress queue of current socket(SK_PASS). > + * For SK_REDIRECT, we need 'ack' the frame immediately but for > + * SK_PASS, the 'ack' was delay to tcp_bpf_recvmsg_parser() > + */ > + tp->copied_seq = psock->copied_seq - psock->ingress_bytes; > + tcp_rcv_space_adjust(sk); > + __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); > +out: > + rcu_read_unlock(); > + return copied; > +} > +#endif /* CONFIG_BPF_STREAM_PARSER */ > + > int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) > { > int family = sk->sk_family == AF_INET6 ? 
TCP_BPF_IPV6 : TCP_BPF_IPV4; > @@ -681,6 +722,12 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) > > /* Pairs with lockless read in sk_clone_lock() */ > sock_replace_proto(sk, &tcp_bpf_prots[family][config]); > +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) > + if (psock->progs.stream_parser && psock->progs.stream_verdict) { > + psock->copied_seq = tcp_sk(sk)->copied_seq; > + psock->read_sock = tcp_bpf_strp_read_sock; Just directly set psock->strp.cb.read_sock to tcp_bpf_strp_read_sock. Then we don't need this intermediate psock->read_sock callback, which doesn't do anything useful. > + } > +#endif > return 0; > } > EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
On Sat, Jan 18, 2025 at 03:50:22PM +0100, Jakub Sitnicki wrote: > On Thu, Jan 16, 2025 at 10:05 PM +08, Jiayuan Chen wrote: > > 'sk->copied_seq' was updated in the tcp_eat_skb() function when the > > action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, > > +} > > +#endif /* CONFIG_BPF_STREAM_PARSER */ > > + > > int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) > > { > > int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; > > @@ -681,6 +722,12 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) > > > > /* Pairs with lockless read in sk_clone_lock() */ > > sock_replace_proto(sk, &tcp_bpf_prots[family][config]); > > +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) > > + if (psock->progs.stream_parser && psock->progs.stream_verdict) { > > + psock->copied_seq = tcp_sk(sk)->copied_seq; > > + psock->read_sock = tcp_bpf_strp_read_sock; > > Just directly set psock->strp.cb.read_sock to tcp_bpf_strp_read_sock. > Then we don't need this intermediate psock->read_sock callback, which > doesn't do anything useful. > Ok, I will do this. (BTW, I intended to avoid bringing "struct strparser" into tcp_bpf.c so I added a wrapper function instead in skmsg.c without calling it directly) > > + } > > +#endif > > return 0; > > } > > EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
On Sat, Jan 18, 2025 at 11:29:04PM +0800, Jiayuan Chen wrote: > On Sat, Jan 18, 2025 at 03:50:22PM +0100, Jakub Sitnicki wrote: > > On Thu, Jan 16, 2025 at 10:05 PM +08, Jiayuan Chen wrote: > > > 'sk->copied_seq' was updated in the tcp_eat_skb() function when the > > > action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, > > > +} > > > +#endif /* CONFIG_BPF_STREAM_PARSER */ > > > + > > > int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) > > > { > > > int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; > > > @@ -681,6 +722,12 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) > > > > > > /* Pairs with lockless read in sk_clone_lock() */ > > > sock_replace_proto(sk, &tcp_bpf_prots[family][config]); > > > +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) > > > + if (psock->progs.stream_parser && psock->progs.stream_verdict) { > > > + psock->copied_seq = tcp_sk(sk)->copied_seq; > > > + psock->read_sock = tcp_bpf_strp_read_sock; > > > > Just directly set psock->strp.cb.read_sock to tcp_bpf_strp_read_sock. > > Then we don't need this intermediate psock->read_sock callback, which > > doesn't do anything useful. > > > Ok, I will do this. > (BTW, I intended to avoid bringing "struct strparser" into tcp_bpf.c so I > added a wrapper function instead in skmsg.c without calling it directly) > I find that tcp_bpf_update_proto is called before sk_psock_init_strp. Any assignment of psock->cb.strp will be overwritten in sk_psock_init_strp. May read_sock still needed. 
But we can avoid adding wrapper function by assigning psock->read_sock to cb.read_sock directly like this: --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1137,10 +1137,11 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { int ret; - static const struct strp_callbacks cb = { + struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, + .read_sock = psock->read_sock, }; ret = strp_init(&psock->strp, sk, &cb); --- Thanks
On Mon, Jan 20, 2025 at 11:35 AM +08, Jiayuan Chen wrote: > On Sat, Jan 18, 2025 at 11:29:04PM +0800, Jiayuan Chen wrote: >> On Sat, Jan 18, 2025 at 03:50:22PM +0100, Jakub Sitnicki wrote: >> > On Thu, Jan 16, 2025 at 10:05 PM +08, Jiayuan Chen wrote: >> > > 'sk->copied_seq' was updated in the tcp_eat_skb() function when the >> > > action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, >> > > +} >> > > +#endif /* CONFIG_BPF_STREAM_PARSER */ >> > > + >> > > int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) >> > > { >> > > int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; >> > > @@ -681,6 +722,12 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) >> > > >> > > /* Pairs with lockless read in sk_clone_lock() */ >> > > sock_replace_proto(sk, &tcp_bpf_prots[family][config]); >> > > +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) >> > > + if (psock->progs.stream_parser && psock->progs.stream_verdict) { >> > > + psock->copied_seq = tcp_sk(sk)->copied_seq; >> > > + psock->read_sock = tcp_bpf_strp_read_sock; >> > >> > Just directly set psock->strp.cb.read_sock to tcp_bpf_strp_read_sock. >> > Then we don't need this intermediate psock->read_sock callback, which >> > doesn't do anything useful. >> > >> Ok, I will do this. >> (BTW, I intended to avoid bringing "struct strparser" into tcp_bpf.c so I >> added a wrapper function instead in skmsg.c without calling it directly) >> > I find that tcp_bpf_update_proto is called before sk_psock_init_strp. Any > assignment of psock->cb.strp will be overwritten in sk_psock_init_strp. Or just don't set ->read_sock in strp_init. It's being reset only because you made it so in patch 1 :-)
© 2016 - 2025 Red Hat, Inc.