[PATCH mptcp-next v7 3/4] mptcp: implement .splice_read

Geliang Tang posted 4 patches 2 months, 1 week ago
There is a newer version of this series
[PATCH mptcp-next v7 3/4] mptcp: implement .splice_read
Posted by Geliang Tang 2 months, 1 week ago
From: Geliang Tang <tanggeliang@kylinos.cn>

This patch implements the .splice_read interface of the MPTCP struct
proto_ops as mptcp_splice_read(), using tcp_splice_read() as a reference.

Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/protocol.c | 136 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index fc429d175ede..4638d4be2b98 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -4023,6 +4023,140 @@ static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 	return copied;
 }
 
+/*
+ * MPTCP splice context
+ */
+struct mptcp_splice_state {
+	struct pipe_inode_info *pipe;
+	size_t len;
+	unsigned int flags;
+};
+
+static int mptcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+				  unsigned int offset, size_t len)
+{
+	struct mptcp_splice_state *mss = rd_desc->arg.data;
+	int ret;
+
+	ret = skb_splice_bits(skb, skb->sk, offset, mss->pipe,
+			      min(rd_desc->count, len), mss->flags);
+	if (ret > 0)
+		rd_desc->count -= ret;
+	return ret;
+}
+
+static int __mptcp_splice_read(struct sock *sk, struct mptcp_splice_state *mss)
+{
+	/* Store MPTCP splice context information in read_descriptor_t. */
+	read_descriptor_t rd_desc = {
+		.arg.data = mss,
+		.count	  = mss->len,
+	};
+
+	return mptcp_read_sock(sk, &rd_desc, mptcp_splice_data_recv);
+}
+
+/**
+ *  mptcp_splice_read - splice data from MPTCP socket to a pipe
+ * @sock:	socket to splice from
+ * @ppos:	position (not valid)
+ * @pipe:	pipe to splice to
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	struct mptcp_splice_state mss = {
+		.pipe = pipe,
+		.len = len,
+		.flags = flags,
+	};
+	struct sock *sk = sock->sk;
+	ssize_t spliced;
+	long timeo;
+	int ret;
+
+	/*
+	 * We can't seek on a socket input
+	 */
+	if (unlikely(*ppos))
+		return -ESPIPE;
+
+	spliced = 0;
+	ret = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+	while (mss.len) {
+		ret = __mptcp_splice_read(sk, &mss);
+		if (ret < 0) {
+			break;
+		} else if (!ret) {
+			if (spliced)
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN) {
+				if (__mptcp_move_skbs(sk))
+					continue;
+				break;
+			}
+			if (sk->sk_state == TCP_CLOSE) {
+				ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			/* if __mptcp_splice_read() got nothing while we have
+			 * an skb in receive queue, we do not want to loop.
+			 * This might happen with URG data.
+			 */
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
+			ret = sk_wait_data(sk, &timeo, NULL);
+			if (ret < 0)
+				break;
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		mss.len -= ret;
+		spliced += ret;
+
+		if (!mss.len || !timeo)
+			break;
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+
+	release_sock(sk);
+
+	if (spliced)
+		return spliced;
+
+	return ret;
+}
+
 static const struct proto_ops mptcp_stream_ops = {
 	.family		   = PF_INET,
 	.owner		   = THIS_MODULE,
@@ -4044,6 +4178,7 @@ static const struct proto_ops mptcp_stream_ops = {
 	.mmap		   = sock_no_mmap,
 	.set_rcvlowat	   = mptcp_set_rcvlowat,
 	.read_sock	   = mptcp_read_sock,
+	.splice_read	   = mptcp_splice_read,
 };
 
 static struct inet_protosw mptcp_protosw = {
@@ -4149,6 +4284,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
 #endif
 	.set_rcvlowat	   = mptcp_set_rcvlowat,
 	.read_sock	   = mptcp_read_sock,
+	.splice_read	   = mptcp_splice_read,
 };
 
 static struct proto mptcp_v6_prot;
-- 
2.48.1
Re: [PATCH mptcp-next v7 3/4] mptcp: implement .splice_read
Posted by Paolo Abeni 2 months, 1 week ago
On 7/7/25 11:34 AM, Geliang Tang wrote:
> From: Geliang Tang <tanggeliang@kylinos.cn>
> 
> This patch implements .splice_read interface of mptcp struct proto_ops
> as mptcp_splice_read() with reference to tcp_splice_read().
> 
> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> ---
>  net/mptcp/protocol.c | 136 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 136 insertions(+)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index fc429d175ede..4638d4be2b98 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -4023,6 +4023,140 @@ static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
>  	return copied;
>  }
>  
> +/*
> + * MPTCP splice context
> + */
> +struct mptcp_splice_state {
> +	struct pipe_inode_info *pipe;
> +	size_t len;
> +	unsigned int flags;
> +};
> +
> +static int mptcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
> +				  unsigned int offset, size_t len)
> +{
> +	struct mptcp_splice_state *mss = rd_desc->arg.data;
> +	int ret;
> +
> +	ret = skb_splice_bits(skb, skb->sk, offset, mss->pipe,
> +			      min(rd_desc->count, len), mss->flags);
> +	if (ret > 0)
> +		rd_desc->count -= ret;
> +	return ret;
> +}

I have mixed feelings WRT the above. I'm wondering if we should reuse the
same code already existing in TCP, moving the tcp_splice_state definition
into some shared header and making tcp_splice_data_recv non-static.

> +static int __mptcp_splice_read(struct sock *sk, struct mptcp_splice_state *mss)
> +{
> +	/* Store MPTCP splice context information in read_descriptor_t. */
> +	read_descriptor_t rd_desc = {
> +		.arg.data = mss,
> +		.count	  = mss->len,
> +	};
> +
> +	return mptcp_read_sock(sk, &rd_desc, mptcp_splice_data_recv);
> +}
> +
> +/**
> + *  mptcp_splice_read - splice data from MPTCP socket to a pipe
> + * @sock:	socket to splice from
> + * @ppos:	position (not valid)
> + * @pipe:	pipe to splice to
> + * @len:	number of bytes to splice
> + * @flags:	splice modifier flags
> + *
> + * Description:
> + *    Will read pages from given socket and fill them into a pipe.
> + *
> + **/
> +static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos,
> +				 struct pipe_inode_info *pipe, size_t len,
> +				 unsigned int flags)
> +{
> +	struct mptcp_splice_state mss = {
> +		.pipe = pipe,
> +		.len = len,
> +		.flags = flags,
> +	};
> +	struct sock *sk = sock->sk;
> +	ssize_t spliced;
> +	long timeo;
> +	int ret;
> +
> +	/*
> +	 * We can't seek on a socket input
> +	 */
> +	if (unlikely(*ppos))
> +		return -ESPIPE;
> +
> +	spliced = 0;
> +	ret = 0;
> +
> +	lock_sock(sk);
> +
> +	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
> +	while (mss.len) {
> +		ret = __mptcp_splice_read(sk, &mss);
> +		if (ret < 0) {
> +			break;
> +		} else if (!ret) {
> +			if (spliced)
> +				break;
> +			if (sock_flag(sk, SOCK_DONE))
> +				break;
> +			if (sk->sk_err) {
> +				ret = sock_error(sk);
> +				break;
> +			}
> +			if (sk->sk_shutdown & RCV_SHUTDOWN) {
> +				if (__mptcp_move_skbs(sk))
> +					continue;
> +				break;
> +			}
> +			if (sk->sk_state == TCP_CLOSE) {
> +				ret = -ENOTCONN;
> +				break;
> +			}
> +			if (!timeo) {
> +				ret = -EAGAIN;
> +				break;
> +			}
> +			/* if __mptcp_splice_read() got nothing while we have
> +			 * an skb in receive queue, we do not want to loop.
> +			 * This might happen with URG data.
> +			 */
> +			if (!skb_queue_empty(&sk->sk_receive_queue))
> +				break;
> +			ret = sk_wait_data(sk, &timeo, NULL);
> +			if (ret < 0)
> +				break;
> +			if (signal_pending(current)) {
> +				ret = sock_intr_errno(timeo);
> +				break;
> +			}

I think that moving the above if statement before the queue empty check
will not change the overall behavior.

With that in place you could factor out an

bool mptcp_recv_should_stop(struct sock *sk, int err)

helper from mptcp_recvmsg() and use it verbatim both in
mptcp_recvmsg() and here.

Side note: suggestions for a better helper name welcome!

/P
Re: [PATCH mptcp-next v7 3/4] mptcp: implement .splice_read
Posted by Geliang Tang 2 months, 1 week ago
Hi Paolo,

On Tue, 2025-07-08 at 16:52 +0200, Paolo Abeni wrote:
> On 7/7/25 11:34 AM, Geliang Tang wrote:
> > From: Geliang Tang <tanggeliang@kylinos.cn>
> > 
> > This patch implements .splice_read interface of mptcp struct
> > proto_ops
> > as mptcp_splice_read() with reference to tcp_splice_read().
> > 
> > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> > ---
> >  net/mptcp/protocol.c | 136
> > +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 136 insertions(+)
> > 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index fc429d175ede..4638d4be2b98 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -4023,6 +4023,140 @@ static int mptcp_read_sock(struct sock *sk,
> > read_descriptor_t *desc,
> >  	return copied;
> >  }
> >  
> > +/*
> > + * MPTCP splice context
> > + */
> > +struct mptcp_splice_state {
> > +	struct pipe_inode_info *pipe;
> > +	size_t len;
> > +	unsigned int flags;
> > +};
> > +
> > +static int mptcp_splice_data_recv(read_descriptor_t *rd_desc,
> > struct sk_buff *skb,
> > +				  unsigned int offset, size_t len)
> > +{
> > +	struct mptcp_splice_state *mss = rd_desc->arg.data;
> > +	int ret;
> > +
> > +	ret = skb_splice_bits(skb, skb->sk, offset, mss->pipe,
> > +			      min(rd_desc->count, len), mss-
> > >flags);
> > +	if (ret > 0)
> > +		rd_desc->count -= ret;
> > +	return ret;
> > +}
> 
> I have mixed feelings WRT the above. I'm wondering if we should reuse
> the same code already existing in TCP, moving the tcp_splice_state
> definition into some shared header and making tcp_splice_data_recv
> non-static.
> 
> > +static int __mptcp_splice_read(struct sock *sk, struct
> > mptcp_splice_state *mss)
> > +{
> > +	/* Store MPTCP splice context information in
> > read_descriptor_t. */
> > +	read_descriptor_t rd_desc = {
> > +		.arg.data = mss,
> > +		.count	  = mss->len,
> > +	};
> > +
> > +	return mptcp_read_sock(sk, &rd_desc,
> > mptcp_splice_data_recv);
> > +}
> > +
> > +/**
> > + *  mptcp_splice_read - splice data from MPTCP socket to a pipe
> > + * @sock:	socket to splice from
> > + * @ppos:	position (not valid)
> > + * @pipe:	pipe to splice to
> > + * @len:	number of bytes to splice
> > + * @flags:	splice modifier flags
> > + *
> > + * Description:
> > + *    Will read pages from given socket and fill them into a pipe.
> > + *
> > + **/
> > +static ssize_t mptcp_splice_read(struct socket *sock, loff_t
> > *ppos,
> > +				 struct pipe_inode_info *pipe,
> > size_t len,
> > +				 unsigned int flags)
> > +{
> > +	struct mptcp_splice_state mss = {
> > +		.pipe = pipe,
> > +		.len = len,
> > +		.flags = flags,
> > +	};
> > +	struct sock *sk = sock->sk;
> > +	ssize_t spliced;
> > +	long timeo;
> > +	int ret;
> > +
> > +	/*
> > +	 * We can't seek on a socket input
> > +	 */
> > +	if (unlikely(*ppos))
> > +		return -ESPIPE;
> > +
> > +	spliced = 0;
> > +	ret = 0;
> > +
> > +	lock_sock(sk);
> > +
> > +	timeo = sock_rcvtimeo(sk, sock->file->f_flags &
> > O_NONBLOCK);
> > +	while (mss.len) {
> > +		ret = __mptcp_splice_read(sk, &mss);
> > +		if (ret < 0) {
> > +			break;
> > +		} else if (!ret) {
> > +			if (spliced)
> > +				break;
> > +			if (sock_flag(sk, SOCK_DONE))
> > +				break;

I noticed that this SOCK_DONE flag is checked in tcp_recvmsg_locked()
but not in mptcp_recvmsg(). I wonder if this flag should be checked in
mptcp_recvmsg() too.

> > +			if (sk->sk_err) {
> > +				ret = sock_error(sk);
> > +				break;
> > +			}
> > +			if (sk->sk_shutdown & RCV_SHUTDOWN) {
> > +				if (__mptcp_move_skbs(sk))
> > +					continue;
> > +				break;
> > +			}
> > +			if (sk->sk_state == TCP_CLOSE) {
> > +				ret = -ENOTCONN;
> > +				break;
> > +			}
> > +			if (!timeo) {
> > +				ret = -EAGAIN;
> > +				break;
> > +			}
> > +			/* if __mptcp_splice_read() got nothing
> > while we have
> > +			 * an skb in receive queue, we do not want
> > to loop.
> > +			 * This might happen with URG data.
> > +			 */
> > +			if (!skb_queue_empty(&sk-
> > >sk_receive_queue))
> > +				break;
> > +			ret = sk_wait_data(sk, &timeo, NULL);
> > +			if (ret < 0)
> > +				break;
> > +			if (signal_pending(current)) {
> > +				ret = sock_intr_errno(timeo);
> > +				break;
> > +			}
> 
> I think that moving the above if statement before the queue empty
> check
> will not change the overall behavior.
> 
> With that in place you could factor out an
> 
> bool mptcp_recv_should_stop(struct sock *sk, int err)

This helper could also be used in tcp_recvmsg_locked() and
tcp_splice_read(). What about renaming it to tcp_recv_should_stop(),
adding it to include/net/tcp.h, and using it for both TCP and MPTCP? WDYT?

Thanks,
-Geliang

> 
> helper from mptcp_recvmsg() and use it verbatim both in
> mptcp_recvmsg() and here.
> 
> Side note: suggestions for a better helper name welcome!
> 
> /P
>