From: Geliang Tang <tanggeliang@kylinos.cn>
nvme_tcp_try_recv() needs to call the .read_sock interface of struct
proto_ops, but it is not implemented in MPTCP.
This patch implements it with reference to __tcp_read_sock() and
__mptcp_recvmsg_mskq().
Corresponding to tcp_recv_skb(), a new helper for MPTCP named
mptcp_recv_skb() is added to peek an skb from sk->sk_receive_queue.
Compared with __mptcp_recvmsg_mskq(), mptcp_read_sock() uses
INT_MAX as the max read length passed to the actor. The LISTEN state is
checked before the while loop, and mptcp_recv_skb() and
mptcp_cleanup_rbuf() are invoked after the loop. In the loop, all the
flag checks from __mptcp_recvmsg_mskq() are removed.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
net/mptcp/protocol.c | 82 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9d67c751b7a9..01d602f0d556 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -4322,6 +4322,86 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
return mask;
}
+/*
+ * Return the first skb on sk->sk_receive_queue whose MPTCP read offset is
+ * still below skb->len, storing that offset in *off. Any skb found at the
+ * head of the queue with nothing left to read is handed to
+ * mptcp_eat_recv_skb(). Returns NULL, leaving *off untouched, when the
+ * queue holds no skb with unread data.
+ */
+static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *head;
+
+	/* Let mptcp_move_skbs() process a non-empty backlog before peeking. */
+	if (!list_empty(&msk->backlog_list))
+		mptcp_move_skbs(sk);
+
+	for (head = skb_peek(&sk->sk_receive_queue); head;
+	     head = skb_peek(&sk->sk_receive_queue)) {
+		u32 consumed = MPTCP_SKB_CB(head)->offset;
+
+		if (consumed >= head->len) {
+			/* Nothing left in this skb: drop it and look again. */
+			mptcp_eat_recv_skb(sk, head);
+			continue;
+		}
+
+		*off = consumed;
+		return head;
+	}
+
+	return NULL;
+}
+
+/*
+ * __mptcp_read_sock - hand queued receive data to @recv_actor
+ * @sk: MPTCP socket to read from
+ * @desc: read descriptor passed through to @recv_actor
+ * @recv_actor: callback that consumes data from each skb
+ * @noack: when true, skip mptcp_rcv_space_adjust() and the rbuf cleanup
+ *
+ * Returns the total number of bytes @recv_actor consumed. If the actor
+ * stops with a non-positive value before anything was consumed, that value
+ * is returned instead. Returns -ENOTCONN if the socket is in TCP_LISTEN.
+ *
+ * Note:
+ *	- It is assumed that the socket was locked by the caller.
+ */
+static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+			     sk_read_actor_t recv_actor, bool noack)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *skb;
+	int copied = 0;
+	u32 offset;
+
+	msk_owned_by_me(msk);
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+
+	while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) {
+		u32 data_len = skb->len - offset;
+		int count;
+		u32 size;
+
+		/* count is an int, so never offer the actor more than INT_MAX. */
+		size = min_t(size_t, data_len, INT_MAX);
+		count = recv_actor(desc, skb, offset, size);
+		if (count <= 0) {
+			/* Report the actor's status only if nothing was read yet. */
+			if (!copied)
+				copied = count;
+			break;
+		}
+
+		copied += count;
+
+		msk->bytes_consumed += count;
+		if (count < data_len) {
+			/* Partial read: record how far into this skb we got. */
+			MPTCP_SKB_CB(skb)->offset += count;
+			MPTCP_SKB_CB(skb)->map_seq += count;
+			break;
+		}
+
+		mptcp_eat_recv_skb(sk, skb);
+
+		/*
+		 * As in __tcp_read_sock(): an actor that has cleared
+		 * desc->count wants no more data, so do not call it again
+		 * with the next skb.
+		 */
+		if (!desc->count)
+			break;
+	}
+
+	if (noack)
+		goto out;
+
+	mptcp_rcv_space_adjust(msk, copied);
+
+	if (copied > 0) {
+		/*
+		 * Called for its side effects only: it hands any skb at the
+		 * head of the queue with no unread data to
+		 * mptcp_eat_recv_skb().
+		 */
+		mptcp_recv_skb(sk, &offset);
+		mptcp_cleanup_rbuf(msk, copied);
+	}
+out:
+	return copied;
+}
+
+/*
+ * .read_sock entry point for MPTCP sockets: runs __mptcp_read_sock() with
+ * noack disabled, so mptcp_rcv_space_adjust() and, when data was consumed,
+ * mptcp_cleanup_rbuf() are called after the read loop.
+ */
+static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+			   sk_read_actor_t recv_actor)
+{
+	const bool skip_ack = false;
+
+	return __mptcp_read_sock(sk, desc, recv_actor, skip_ack);
+}
+
static const struct proto_ops mptcp_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -4342,6 +4422,7 @@ static const struct proto_ops mptcp_stream_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
};
static struct inet_protosw mptcp_protosw = {
@@ -4446,6 +4527,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.compat_ioctl = inet6_compat_ioctl,
#endif
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
};
static struct proto mptcp_v6_prot;
--
2.51.0
Hi Geliang,
Thank you for working on that!
On 06/12/2025 13:33, Geliang Tang wrote:
> From: Geliang Tang <tanggeliang@kylinos.cn>
>
> nvme_tcp_try_recv() needs to call .read_sock interface of struct
> proto_ops, but it's not implemented in MPTCP.
>
> This patch implements it with reference to __tcp_read_sock() and
> __mptcp_recvmsg_mskq().
>
> Corresponding to tcp_recv_skb(), a new helper for MPTCP named
> mptcp_recv_skb() is added to peek a skb from sk->sk_receive_queue.
>
> Compared with __mptcp_recvmsg_mskq(), mptcp_read_sock() uses
> sk->sk_rcvbuf as the max read length. The LISTEN status is checked
> before the while loop, and mptcp_recv_skb() and mptcp_cleanup_rbuf()
> are invoked after the loop. In the loop, all flags checks for
> __mptcp_recvmsg_mskq() are removed.
(...)
> +static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
> + sk_read_actor_t recv_actor)
> +{
> + return __mptcp_read_sock(sk, desc, recv_actor, false);
In this series, it looks like __mptcp_read_sock() is only called here,
always with noack == false then. Will you call this helper with noack
set to true in another series?
(Maybe the extension could be introduced later, but if the next series
follows, that's fine to keep it like that.)
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
Hi Matt,
On Thu, 2025-12-11 at 19:22 +0100, Matthieu Baerts wrote:
> Hi Geliang,
>
> Thank you for working on that!
>
> On 06/12/2025 13:33, Geliang Tang wrote:
> > From: Geliang Tang <tanggeliang@kylinos.cn>
> >
> > nvme_tcp_try_recv() needs to call .read_sock interface of struct
> > proto_ops, but it's not implemented in MPTCP.
> >
> > This patch implements it with reference to __tcp_read_sock() and
> > __mptcp_recvmsg_mskq().
> >
> > Corresponding to tcp_recv_skb(), a new helper for MPTCP named
> > mptcp_recv_skb() is added to peek a skb from sk->sk_receive_queue.
> >
> > Compared with __mptcp_recvmsg_mskq(), mptcp_read_sock() uses
> > sk->sk_rcvbuf as the max read length. The LISTEN status is checked
> > before the while loop, and mptcp_recv_skb() and
> > mptcp_cleanup_rbuf()
> > are invoked after the loop. In the loop, all flags checks for
> > __mptcp_recvmsg_mskq() are removed.
>
> (...)
>
> > +static int mptcp_read_sock(struct sock *sk, read_descriptor_t
> > *desc,
> > + sk_read_actor_t recv_actor)
> > +{
> > + return __mptcp_read_sock(sk, desc, recv_actor, false);
>
> In this series, it looks like __mptcp_read_sock() is only called
> here,
> always with noack == false then. Will you call this helper with noack
> set to true in another series?
Yes, indeed. This read_sock will be called in all three series that I
am currently developing.
1. In "NVME over MPTCP" series, mptcp_read_sock() will be used, see:
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
in nvme_tcp_try_recv().
2. In "MPTCP sockmap support" series (issue #521),
mptcp_read_sock_noack() will be used, see:
copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
&psock->copied_seq);
in tcp_bpf_strp_read_sock().
3. In "MPTCP KTLS support" series (issue #480), tcp_read_sock() is used
in tls_strp_read_copyin():
tcp_read_sock(strp->sk, &desc, tls_strp_copyin);
I originally intended to use mptcp_read_sock() here, but it would cause
a deadlock, so I had to switch to using tcp_read_sock_noack() instead.
>
> (Maybe the extension could be introduced later, but if the next
> series
> follows, that's fine to keep it like that.)
So I think it's better to keep this patch as is.
Thanks,
-Geliang
>
> Cheers,
> Matt
© 2016 - 2025 Red Hat, Inc.