[MPTCP next 03/12] mptcp: rcvbuf auto-tuning improvement

Paolo Abeni posted 12 patches 23 hours ago
[MPTCP next 03/12] mptcp: rcvbuf auto-tuning improvement
Posted by Paolo Abeni 23 hours ago
Apply to the MPTCP auto-tuning the same improvements introduced for the
TCP protocol by the merge commit 2da35e4b4df9 ("Merge branch
'tcp-receive-side-improvements'").

The main difference is that the TCP subflows and the main MPTCP socket
need to account for OoO separately: MPTCP does not care about TCP-level
OoO and vice versa.

The above additionally allows dropping the msk receive buffer update at
receive time, as the latter was only intended to cope with subflow
receive buffer increases due to OoO packets.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 95 +++++++++++++++++++++-----------------------
 net/mptcp/protocol.h |  4 +-
 2 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c51aede20779d..671c51cb9539c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -178,6 +178,33 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
 	return mptcp_try_coalesce((struct sock *)msk, to, from);
 }
 
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	int rcvwin, rcvbuf;
+
+	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return false;
+
+	rcvwin = ((u64)msk->rcvq_space.space << 1);
+
+	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+	rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
+		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+
+	if (rcvbuf > sk->sk_rcvbuf) {
+		u32 window_clamp;
+
+		window_clamp = mptcp_win_from_space(sk, rcvbuf);
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		return true;
+	}
+	return false;
+}
+
 /* "inspired" by tcp_data_queue_ofo(), main differences:
  * - use mptcp seqs
  * - don't cope with sacks
@@ -291,6 +318,9 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 end:
 	skb_condense(skb);
 	skb_set_owner_r(skb, sk);
+	/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+	if (sk->sk_socket)
+		mptcp_rcvbuf_grow(sk);
 }
 
 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -770,18 +800,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	return moved;
 }
 
-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
-	if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
-		WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
 static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	__mptcp_rcvbuf_update(sk, ssk);
-
 	/* Wake-up the reader only for in-sequence data */
 	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
 		sk->sk_data_ready(sk);
@@ -1984,48 +2006,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
 		goto new_measure;
 
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
-		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
-		do_div(grow, msk->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
-		if (rcvbuf > sk->sk_rcvbuf) {
-			u32 window_clamp;
-
-			window_clamp = mptcp_win_from_space(sk, rcvbuf);
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+	msk->rcvq_space.space = msk->rcvq_space.copied;
+	if (mptcp_rcvbuf_grow(sk)) {
 
-			/* Make subflows follow along.  If we do not do this, we
-			 * get drops at subflow level if skbs can't be moved to
-			 * the mptcp rx queue fast enough (announced rcv_win can
-			 * exceed ssk->sk_rcvbuf).
-			 */
-			mptcp_for_each_subflow(msk, subflow) {
-				struct sock *ssk;
-				bool slow;
+		/* Make subflows follow along.  If we do not do this, we
+		 * get drops at subflow level if skbs can't be moved to
+		 * the mptcp rx queue fast enough (announced rcv_win can
+		 * exceed ssk->sk_rcvbuf).
+		 */
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *ssk;
+			bool slow;
 
-				ssk = mptcp_subflow_tcp_sock(subflow);
-				slow = lock_sock_fast(ssk);
-				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
-				WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
-				if (tcp_can_send_ack(ssk))
-					tcp_cleanup_rbuf(ssk, 1);
-				unlock_sock_fast(ssk, slow);
-			}
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			slow = lock_sock_fast(ssk);
+			tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+			tcp_rcvbuf_grow(ssk);
+			unlock_sock_fast(ssk, slow);
 		}
 	}
 
-	msk->rcvq_space.space = msk->rcvq_space.copied;
 new_measure:
 	msk->rcvq_space.copied = 0;
 	msk->rcvq_space.time = mstamp;
@@ -2054,11 +2054,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
 	if (list_empty(&msk->conn_list))
 		return false;
 
-	/* verify we can move any data from the subflow, eventually updating */
-	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
-		mptcp_for_each_subflow(msk, subflow)
-			__mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
 	subflow = list_first_entry(&msk->conn_list,
 				   struct mptcp_subflow_context, node);
 	for (;;) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a1787a1344ac1..128baea5b496e 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -341,8 +341,8 @@ struct mptcp_sock {
 	struct mptcp_pm_data	pm;
 	struct mptcp_sched_ops	*sched;
 	struct {
-		u32	space;	/* bytes copied in last measurement window */
-		u32	copied; /* bytes copied in this measurement window */
+		int	space;	/* bytes copied in last measurement window */
+		int	copied; /* bytes copied in this measurement window */
 		u64	time;	/* start time of measurement window */
 		u64	rtt_us; /* last maximum rtt of subflows */
 	} rcvq_space;
-- 
2.51.0
Re: [MPTCP next 03/12] mptcp: rcvbuf auto-tuning improvement
Posted by kernel test robot 5 hours ago
Hi Paolo,

kernel test robot noticed the following build warnings:

[auto build test WARNING on mptcp/export-net]
[also build test WARNING on linus/master v6.17-rc6 next-20250916]
[cannot apply to next-20250916 mptcp/export]
[If your patch is applied to the wrong git tree, kindly drop us a note.
When submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-leverage-skb-deferral-free/20250917-003012
base:   https://github.com/multipath-tcp/mptcp_net-next.git export-net
patch link:    https://lore.kernel.org/r/d3f96328a76e6fffcb871e8542c526ea99135ea5.1758039775.git.pabeni%40redhat.com
patch subject: [MPTCP next 03/12] mptcp: rcvbuf auto-tuning improvement
config: arc-nsimosci_hs_smp_defconfig (https://download.01.org/0day-ci/archive/20250917/202509171856.1KhVdgmP-lkp@intel.com/config)
compiler: arc-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250917/202509171856.1KhVdgmP-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509171856.1KhVdgmP-lkp@intel.com/

All warnings (new ones prefixed by >>):

   net/mptcp/protocol.c: In function 'mptcp_rcvbuf_grow':
>> net/mptcp/protocol.c:199:21: warning: variable 'window_clamp' set but not used [-Wunused-but-set-variable]
     199 |                 u32 window_clamp;
         |                     ^~~~~~~~~~~~


vim +/window_clamp +199 net/mptcp/protocol.c

   180	
   181	static bool mptcp_rcvbuf_grow(struct sock *sk)
   182	{
   183		struct mptcp_sock *msk = mptcp_sk(sk);
   184		int rcvwin, rcvbuf;
   185	
   186		if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) ||
   187		    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
   188			return false;
   189	
   190		rcvwin = ((u64)msk->rcvq_space.space << 1);
   191	
   192		if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
   193			rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
   194	
   195		rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
   196			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
   197	
   198		if (rcvbuf > sk->sk_rcvbuf) {
 > 199			u32 window_clamp;
   200	
   201			window_clamp = mptcp_win_from_space(sk, rcvbuf);
   202			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
   203			return true;
   204		}
   205		return false;
   206	}
   207	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki