Apply to the MPTCP auto-tuning the same improvements introduced for the
TCP protocol by the merge commit 2da35e4b4df9 ("Merge branch
'tcp-receive-side-improvements'").
The main difference is that the TCP subflows and the main MPTCP socket
need to account for out-of-order (OoO) data separately: MPTCP does not
care about TCP-level OoO and vice versa. As a consequence, do not
reflect MPTCP-level rcvbuf increases due to OoO packets at the subflow
level.
This refactor additionally allows dropping the msk receive buffer update
at receive time, as the latter was only intended to cope with subflow
receive buffer increases due to OoO packets.
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/487
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/559
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- fix unused variable
- reword the commit message
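For illustration only (not part of the patch): a minimal userspace
sketch of the new growth arithmetic, using assumed sample values. The
25% overhead factor is a rough stand-in for mptcp_space_from_win(),
whose real scaling depends on the socket's scaling_ratio.

/* Illustration only: userspace model of the mptcp_rcvbuf_grow()
 * arithmetic, with assumed values. Not kernel code.
 */
#include <stdio.h>

int main(void)
{
	long space = 128 * 1024;	/* rcvq_space.space: bytes copied last window */
	long ooo = 32 * 1024;		/* MPTCP-level OoO bytes queued (assumed) */
	long rmem_max = 6L * 1024 * 1024; /* sysctl_tcp_rmem[2] cap (assumed) */

	/* slow start: allow the sender to double its rate */
	long rcvwin = space << 1;

	/* grow further by the OoO backlog, as the patch does */
	rcvwin += ooo;

	/* rough stand-in for mptcp_space_from_win(): +25% skb overhead */
	long rcvbuf = rcvwin + (rcvwin >> 2);
	if (rcvbuf > rmem_max)
		rcvbuf = rmem_max;

	printf("rcvwin=%ld rcvbuf=%ld\n", rcvwin, rcvbuf);
	return 0;
}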
---
net/mptcp/protocol.c | 92 ++++++++++++++++++++------------------------
net/mptcp/protocol.h | 4 +-
2 files changed, 44 insertions(+), 52 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9d95d24781509..162abafe3f320 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -179,6 +179,30 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int rcvwin, rcvbuf;
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return false;
+
+ rcvwin = ((u64)msk->rcvq_space.space << 1);
+
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+ rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+ rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ return true;
+ }
+ return false;
+}
+
/* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs
* - don't cope with sacks
@@ -292,6 +316,9 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ mptcp_rcvbuf_grow(sk);
}
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -784,18 +811,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return moved;
}
-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
- if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
- WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- __mptcp_rcvbuf_update(sk, ssk);
-
/* Wake-up the reader only for in-sequence data */
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
sk->sk_data_ready(sk);
@@ -2014,48 +2033,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- u64 rcvwin, grow;
- int rcvbuf;
-
- rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
- grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
- do_div(grow, msk->rcvq_space.space);
- rcvwin += (grow << 1);
-
- rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
- if (rcvbuf > sk->sk_rcvbuf) {
- u32 window_clamp;
-
- window_clamp = mptcp_win_from_space(sk, rcvbuf);
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ msk->rcvq_space.space = msk->rcvq_space.copied;
+ if (mptcp_rcvbuf_grow(sk)) {
- /* Make subflows follow along. If we do not do this, we
- * get drops at subflow level if skbs can't be moved to
- * the mptcp rx queue fast enough (announced rcv_win can
- * exceed ssk->sk_rcvbuf).
- */
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk;
- bool slow;
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+ bool slow;
- ssk = mptcp_subflow_tcp_sock(subflow);
- slow = lock_sock_fast(ssk);
- WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
- WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
- if (tcp_can_send_ack(ssk))
- tcp_cleanup_rbuf(ssk, 1);
- unlock_sock_fast(ssk, slow);
- }
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
+ tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+ tcp_rcvbuf_grow(ssk);
+ unlock_sock_fast(ssk, slow);
}
}
- msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
msk->rcvq_space.copied = 0;
msk->rcvq_space.time = mstamp;
@@ -2084,11 +2081,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
if (list_empty(&msk->conn_list))
return false;
- /* verify we can move any data from the subflow, eventually updating */
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
- mptcp_for_each_subflow(msk, subflow)
- __mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
subflow = list_first_entry(&msk->conn_list,
struct mptcp_subflow_context, node);
for (;;) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 9b5a248bad404..6ac58e92a1aa3 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -342,8 +342,8 @@ struct mptcp_sock {
struct mptcp_pm_data pm;
struct mptcp_sched_ops *sched;
struct {
- u32 space; /* bytes copied in last measurement window */
- u32 copied; /* bytes copied in this measurement window */
+ int space; /* bytes copied in last measurement window */
+ int copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
--
2.51.0
Hi Paolo,

On 18/09/2025 19:14, Paolo Abeni wrote:
> Apply to the MPTCP auto-tuning the same improvements introduced for the
> TCP protocol by the merge commit 2da35e4b4df9 ("Merge branch
> 'tcp-receive-side-improvements'").

Thank you for looking at that!

> +static bool mptcp_rcvbuf_grow(struct sock *sk)

detail: can we add a comment similar to the one below for
mptcp_data_queue_ofo()?

  /* "inspired" by tcp_rcvbuf_grow(), main differences:
   * - ...

I can do the modification when applying the patches.

> +	rcvwin = ((u64)msk->rcvq_space.space << 1);

Why do you need the u64 cast here? (and the parentheses)

> +	rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
> +		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));

detail: mptcp_space_from_win() and *sysctl_tcp_rmem are on 32 bits, no
need to do a 'min_t(u64)'. Maybe we can copy what is done in
tcp_rcvbuf_grow()?

  rcvbuf = min_t(u32, ...

While at it, could we get even closer to tcp_rcvbuf_grow()? Declaring
and using 'net' and 'cap' variables, and keeping the comments? By doing
that, it would be easier to spot differences :)

I can also do the modifications when applying the patches if you prefer.

The suggested modifications are minor, and the rest looks good to me!

Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>

Cheers,
Matt

--
Sponsored by the NGI0 Core fund.
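For reference, a hypothetical sketch of mptcp_rcvbuf_grow() reworked
along the lines suggested above (untested; it mirrors the shape of
tcp_rcvbuf_grow() with 'net' and 'cap' locals and a min_t(u32) cap,
and is not the code that was applied):

/* Hypothetical rework following the review comments above; untested.
 * Differences from tcp_rcvbuf_grow(): mptcp seqs/helpers, bool return.
 */
static bool mptcp_rcvbuf_grow(struct sock *sk)
{
	const struct net *net = sock_net(sk);
	struct mptcp_sock *msk = mptcp_sk(sk);
	int rcvwin, rcvbuf, cap;

	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		return false;

	/* slow start: allow the sender to double its rate. */
	rcvwin = msk->rcvq_space.space << 1;

	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;

	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);

	rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
	if (rcvbuf > sk->sk_rcvbuf) {
		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
		return true;
	}
	return false;
}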
On Thu, 2025-09-18 at 19:14 +0200, Paolo Abeni wrote:
> Apply to the MPTCP auto-tuning the same improvements introduced for the
> TCP protocol by the merge commit 2da35e4b4df9 ("Merge branch
> 'tcp-receive-side-improvements'").

LGTM!

Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>

Thanks,
-Geliang