[PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"

Paolo Abeni posted 3 patches 2 weeks ago
There is a newer version of this series
[PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
Posted by Paolo Abeni 2 weeks ago
If a subflow receives data before gaining the memcg while the msk
socket lock is held at accept time, or the PM locks the msk socket
while still unaccepted and subflows push data to it at the same time,
the mptcp_graph_subflows() can complete with a non empty backlog.

The msk will try to borrow such memory, but (some) of the skbs there
where not memcg charged. When the msk finally will return such accounted
memory, we should hit the same splat of #597.
[even if so far I was unable to replicate this scenario]

This patch tries to address such potential issue by:
- explicitly keep track of the amount of memory added to the backlog
  not CG accounted
- additionally accouting for such memory at accept time
- preventing any subflow from adding memory to the backlog not CG
  accounted after the above flush

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 64 +++++++++++++++++++++++++++++++++++++++++---
 net/mptcp/protocol.h |  1 +
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index addd8025d235..abf0edc4b888 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -658,6 +658,7 @@ static void __mptcp_add_backlog(struct sock *sk,
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct sk_buff *tail = NULL;
+	struct sock *ssk = skb->sk;
 	bool fragstolen;
 	int delta;
 
@@ -671,18 +672,26 @@ static void __mptcp_add_backlog(struct sock *sk,
 		tail = list_last_entry(&msk->backlog_list, struct sk_buff, list);
 
 	if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq &&
-	    skb->sk == tail->sk &&
+	    ssk == tail->sk &&
 	    __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) {
 		skb->truesize -= delta;
 		kfree_skb_partial(skb, fragstolen);
 		__mptcp_subflow_lend_fwdmem(subflow, delta);
-		WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
-		return;
+		goto account;
 	}
 
 	list_add_tail(&skb->list, &msk->backlog_list);
 	mptcp_subflow_lend_fwdmem(subflow, skb);
-	WRITE_ONCE(msk->backlog_len, msk->backlog_len + skb->truesize);
+	delta = skb->truesize;
+
+account:
+	WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
+
+	/* Possibly not accept()ed yet, keep track of memory not CG
+	 * accounted, mptcp_grapt_subflows will handle it.
+	 */
+	if (!ssk->sk_memcg)
+		msk->backlog_unaccounted += delta;
 }
 
 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
@@ -2154,6 +2163,12 @@ static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
+	/* After CG initialization, subflows should never add skb before
+	 * gaining the CG themself.
+	 */
+	DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket &&
+			       mem_cgroup_from_sk(sk));
+
 	/* Don't spool the backlog if the rcvbuf is full. */
 	if (list_empty(&msk->backlog_list) ||
 	    sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
@@ -4059,6 +4074,22 @@ static void mptcp_graft_subflows(struct sock *sk)
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
+	if (mem_cgroup_sockets_enabled) {
+		LIST_HEAD(join_list);
+
+		/* Subflows joining after __inet_accept() with get the
+		 * mem CG properly initialized at mptcp_finish_join() time,
+		 * but subflows pending in join_list need explicit
+		 * initialization before flushing `backlog_unaccounted`
+		 * or we can cat unexpeced unaccounted memory later.
+		 */
+		mptcp_data_lock(sk);
+		list_splice_init(&msk->join_list, &join_list);
+		mptcp_data_unlock(sk);
+
+		__mptcp_flush_join_list(sk, &join_list);
+	}
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
@@ -4070,10 +4101,35 @@ static void mptcp_graft_subflows(struct sock *sk)
 		if (!ssk->sk_socket)
 			mptcp_sock_graft(ssk, sk->sk_socket);
 
+		if (!mem_cgroup_sk_enabled(sk))
+			goto unlock;
+
 		__mptcp_inherit_cgrp_data(sk, ssk);
 		__mptcp_inherit_memcg(sk, ssk, GFP_KERNEL);
+
+unlock:
 		release_sock(ssk);
 	}
+
+	if (mem_cgroup_sk_enabled(sk)) {
+		gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
+		int amt;
+
+		/* Account the backlog memory; prior accept() is aware of
+		 * fwd and rmem only
+		 */
+		mptcp_data_lock(sk);
+		amt = sk_mem_pages(sk->sk_forward_alloc +
+				   msk->backlog_unaccounted +
+				   atomic_read(&sk->sk_rmem_alloc)) -
+		      sk_mem_pages(sk->sk_forward_alloc +
+				   atomic_read(&sk->sk_rmem_alloc));
+		msk->backlog_unaccounted = 0;
+		mptcp_data_unlock(sk);
+
+		if (amt)
+			mem_cgroup_sk_charge(sk, amt, gfp);
+	}
 }
 
 static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 161b704be16b..199f28f3dd5e 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -360,6 +360,7 @@ struct mptcp_sock {
 
 	struct list_head backlog_list;	/* protected by the data lock */
 	u32		backlog_len;
+	u32		backlog_unaccounted;
 };
 
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
-- 
2.51.1
Re: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
Posted by kernel test robot 2 weeks ago
Hi Paolo,

kernel test robot noticed the following build errors:

[auto build test ERROR on mptcp/export]
[cannot apply to mptcp/export-net linus/master v6.18-rc5 next-20251113]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-fix-grafting-corner-case/20251113-081224
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
patch link:    https://lore.kernel.org/r/ee40999f77d0186abba2a6b21958c1a8c4881c57.1762992570.git.pabeni%40redhat.com
patch subject: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
config: sparc64-defconfig (https://download.01.org/0day-ci/archive/20251113/202511131922.61JkdJO9-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251113/202511131922.61JkdJO9-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511131922.61JkdJO9-lkp@intel.com/

All errors (new ones prefixed by >>):

>> net/mptcp/protocol.c:693:12: error: no member named 'sk_memcg' in 'struct sock'
     693 |         if (!ssk->sk_memcg)
         |              ~~~  ^
   1 error generated.


vim +693 net/mptcp/protocol.c

   654	
   655	static void __mptcp_add_backlog(struct sock *sk,
   656					struct mptcp_subflow_context *subflow,
   657					struct sk_buff *skb)
   658	{
   659		struct mptcp_sock *msk = mptcp_sk(sk);
   660		struct sk_buff *tail = NULL;
   661		struct sock *ssk = skb->sk;
   662		bool fragstolen;
   663		int delta;
   664	
   665		if (unlikely(sk->sk_state == TCP_CLOSE)) {
   666			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
   667			return;
   668		}
   669	
   670		/* Try to coalesce with the last skb in our backlog */
   671		if (!list_empty(&msk->backlog_list))
   672			tail = list_last_entry(&msk->backlog_list, struct sk_buff, list);
   673	
   674		if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq &&
   675		    ssk == tail->sk &&
   676		    __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) {
   677			skb->truesize -= delta;
   678			kfree_skb_partial(skb, fragstolen);
   679			__mptcp_subflow_lend_fwdmem(subflow, delta);
   680			goto account;
   681		}
   682	
   683		list_add_tail(&skb->list, &msk->backlog_list);
   684		mptcp_subflow_lend_fwdmem(subflow, skb);
   685		delta = skb->truesize;
   686	
   687	account:
   688		WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
   689	
   690		/* Possibly not accept()ed yet, keep track of memory not CG
   691		 * accounted, mptcp_grapt_subflows will handle it.
   692		 */
 > 693		if (!ssk->sk_memcg)
   694			msk->backlog_unaccounted += delta;
   695	}
   696	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
Posted by kernel test robot 2 weeks ago
Hi Paolo,

kernel test robot noticed the following build errors:

[auto build test ERROR on mptcp/export]
[cannot apply to mptcp/export-net linus/master v6.18-rc5 next-20251113]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-fix-grafting-corner-case/20251113-081224
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
patch link:    https://lore.kernel.org/r/ee40999f77d0186abba2a6b21958c1a8c4881c57.1762992570.git.pabeni%40redhat.com
patch subject: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
config: openrisc-defconfig (https://download.01.org/0day-ci/archive/20251113/202511131931.1yh3eKYe-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251113/202511131931.1yh3eKYe-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202511131931.1yh3eKYe-lkp@intel.com/

All errors (new ones prefixed by >>):

   net/mptcp/protocol.c: In function '__mptcp_add_backlog':
>> net/mptcp/protocol.c:693:17: error: 'struct sock' has no member named 'sk_memcg'
     693 |         if (!ssk->sk_memcg)
         |                 ^~


vim +693 net/mptcp/protocol.c

   654	
   655	static void __mptcp_add_backlog(struct sock *sk,
   656					struct mptcp_subflow_context *subflow,
   657					struct sk_buff *skb)
   658	{
   659		struct mptcp_sock *msk = mptcp_sk(sk);
   660		struct sk_buff *tail = NULL;
   661		struct sock *ssk = skb->sk;
   662		bool fragstolen;
   663		int delta;
   664	
   665		if (unlikely(sk->sk_state == TCP_CLOSE)) {
   666			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
   667			return;
   668		}
   669	
   670		/* Try to coalesce with the last skb in our backlog */
   671		if (!list_empty(&msk->backlog_list))
   672			tail = list_last_entry(&msk->backlog_list, struct sk_buff, list);
   673	
   674		if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq &&
   675		    ssk == tail->sk &&
   676		    __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) {
   677			skb->truesize -= delta;
   678			kfree_skb_partial(skb, fragstolen);
   679			__mptcp_subflow_lend_fwdmem(subflow, delta);
   680			goto account;
   681		}
   682	
   683		list_add_tail(&skb->list, &msk->backlog_list);
   684		mptcp_subflow_lend_fwdmem(subflow, skb);
   685		delta = skb->truesize;
   686	
   687	account:
   688		WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
   689	
   690		/* Possibly not accept()ed yet, keep track of memory not CG
   691		 * accounted, mptcp_grapt_subflows will handle it.
   692		 */
 > 693		if (!ssk->sk_memcg)
   694			msk->backlog_unaccounted += delta;
   695	}
   696	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
Posted by Matthieu Baerts 1 week, 6 days ago
Hello,

On 13/11/2025 12:30, kernel test robot wrote:
> Hi Paolo,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on mptcp/export]
> [cannot apply to mptcp/export-net linus/master v6.18-rc5 next-20251113]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-fix-grafting-corner-case/20251113-081224
> base:   https://github.com/multipath-tcp/mptcp_net-next.git export
> patch link:    https://lore.kernel.org/r/ee40999f77d0186abba2a6b21958c1a8c4881c57.1762992570.git.pabeni%40redhat.com
> patch subject: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
> config: openrisc-defconfig (https://download.01.org/0day-ci/archive/20251113/202511131931.1yh3eKYe-lkp@intel.com/config)
> compiler: or1k-linux-gcc (GCC) 15.1.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251113/202511131931.1yh3eKYe-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202511131931.1yh3eKYe-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
>    net/mptcp/protocol.c: In function '__mptcp_add_backlog':
>>> net/mptcp/protocol.c:693:17: error: 'struct sock' has no member named 'sk_memcg'
>      693 |         if (!ssk->sk_memcg)

I guess mem_cgroup_from_sk() should be used if this patch doesn't target
-net.

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.
Re: [PATCH v3 mptcp-net 3/3] Squash-to: "mptcp: leverage the backlog for RX packet processing"
Posted by Matthieu Baerts 2 weeks ago
Hi Paolo,

On 13/11/2025 01:10, Paolo Abeni wrote:
> If a subflow receives data before gaining the memcg while the msk
> socket lock is held at accept time, or the PM locks the msk socket
> while still unaccepted and subflows push data to it at the same time,
> the mptcp_graph_subflows() can complete with a non empty backlog.
> 
> The msk will try to borrow such memory, but (some) of the skbs there
> where not memcg charged. When the msk finally will return such accounted
> memory, we should hit the same splat of #597.
> [even if so far I was unable to replicate this scenario]
> 
> This patch tries to address such potential issue by:
> - explicitly keep track of the amount of memory added to the backlog
>   not CG accounted
> - additionally accouting for such memory at accept time
> - preventing any subflow from adding memory to the backlog not CG
>   accounted after the above flush
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  net/mptcp/protocol.c | 64 +++++++++++++++++++++++++++++++++++++++++---
>  net/mptcp/protocol.h |  1 +
>  2 files changed, 61 insertions(+), 4 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index addd8025d235..abf0edc4b888 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -658,6 +658,7 @@ static void __mptcp_add_backlog(struct sock *sk,
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  	struct sk_buff *tail = NULL;
> +	struct sock *ssk = skb->sk;
>  	bool fragstolen;
>  	int delta;
>  
> @@ -671,18 +672,26 @@ static void __mptcp_add_backlog(struct sock *sk,
>  		tail = list_last_entry(&msk->backlog_list, struct sk_buff, list);
>  
>  	if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq &&
> -	    skb->sk == tail->sk &&
> +	    ssk == tail->sk &&
>  	    __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) {
>  		skb->truesize -= delta;
>  		kfree_skb_partial(skb, fragstolen);
>  		__mptcp_subflow_lend_fwdmem(subflow, delta);
> -		WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
> -		return;
> +		goto account;
>  	}
>  
>  	list_add_tail(&skb->list, &msk->backlog_list);
>  	mptcp_subflow_lend_fwdmem(subflow, skb);
> -	WRITE_ONCE(msk->backlog_len, msk->backlog_len + skb->truesize);
> +	delta = skb->truesize;
> +
> +account:
> +	WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta);
> +
> +	/* Possibly not accept()ed yet, keep track of memory not CG
> +	 * accounted, mptcp_grapt_subflows will handle it.

detail: s/grapt/graft

> +	 */
> +	if (!ssk->sk_memcg)
> +		msk->backlog_unaccounted += delta;
>  }
>  
>  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> @@ -2154,6 +2163,12 @@ static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  
> +	/* After CG initialization, subflows should never add skb before
> +	 * gaining the CG themself.
> +	 */
> +	DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket &&
> +			       mem_cgroup_from_sk(sk));
> +
>  	/* Don't spool the backlog if the rcvbuf is full. */
>  	if (list_empty(&msk->backlog_list) ||
>  	    sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
> @@ -4059,6 +4074,22 @@ static void mptcp_graft_subflows(struct sock *sk)
>  	struct mptcp_subflow_context *subflow;
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  
> +	if (mem_cgroup_sockets_enabled) {
> +		LIST_HEAD(join_list);
> +
> +		/* Subflows joining after __inet_accept() with get the
> +		 * mem CG properly initialized at mptcp_finish_join() time,
> +		 * but subflows pending in join_list need explicit
> +		 * initialization before flushing `backlog_unaccounted`
> +		 * or we can cat unexpeced unaccounted memory later.

detail: s/cat/catch/ or s/cat/get/?

> +		 */
> +		mptcp_data_lock(sk);
> +		list_splice_init(&msk->join_list, &join_list);
> +		mptcp_data_unlock(sk);
> +
> +		__mptcp_flush_join_list(sk, &join_list);
> +	}
> +
>  	mptcp_for_each_subflow(msk, subflow) {
>  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>  
> @@ -4070,10 +4101,35 @@ static void mptcp_graft_subflows(struct sock *sk)
>  		if (!ssk->sk_socket)
>  			mptcp_sock_graft(ssk, sk->sk_socket);
>  
> +		if (!mem_cgroup_sk_enabled(sk))

detail: what's the cost of this call? (static key + get)
Do we need to use a local variable to call this helper only once?

> +			goto unlock;
> +
>  		__mptcp_inherit_cgrp_data(sk, ssk);
>  		__mptcp_inherit_memcg(sk, ssk, GFP_KERNEL);
> +
> +unlock:
>  		release_sock(ssk);
>  	}
> +
> +	if (mem_cgroup_sk_enabled(sk)) {
> +		gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
> +		int amt;
> +
> +		/* Account the backlog memory; prior accept() is aware of
> +		 * fwd and rmem only
> +		 */
> +		mptcp_data_lock(sk);
> +		amt = sk_mem_pages(sk->sk_forward_alloc +
> +				   msk->backlog_unaccounted +
> +				   atomic_read(&sk->sk_rmem_alloc)) -
> +		      sk_mem_pages(sk->sk_forward_alloc +
> +				   atomic_read(&sk->sk_rmem_alloc));
> +		msk->backlog_unaccounted = 0;
> +		mptcp_data_unlock(sk);
> +
> +		if (amt)
> +			mem_cgroup_sk_charge(sk, amt, gfp);
> +	}
>  }
>  
>  static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 161b704be16b..199f28f3dd5e 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -360,6 +360,7 @@ struct mptcp_sock {
>  
>  	struct list_head backlog_list;	/* protected by the data lock */
>  	u32		backlog_len;
> +	u32		backlog_unaccounted;
>  };
>  
>  #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.