[MPTCP next 10/12] mptcp: prevernt __mptcp_move_skbs() interferring with the fastpath

Paolo Abeni posted 12 patches 23 hours ago
[MPTCP next 10/12] mptcp: prevernt __mptcp_move_skbs() interferring with the fastpath
Posted by Paolo Abeni 23 hours ago
skbs will be left waiting in the subflow only in exceptional cases,
we want to avoid messing with the fast path by unintentionally
processing in __mptcp_move_skbs() packets landed into the subflows
after the last check.

Use a separate flag to mark delayed skbs and only process subflow
with such flag set. Also add new mibs to track the exceptional events.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/mib.c      |  2 ++
 net/mptcp/mib.h      |  4 ++++
 net/mptcp/protocol.c | 40 ++++++++++++----------------------------
 net/mptcp/protocol.h |  1 +
 4 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index cf879c188ca26..7af9d35cde884 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
 	SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
 	SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
+	SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED),
+	SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 309bac6fea325..f6d0eaea463e5 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -88,6 +88,10 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_DSSFALLBACK,		/* Bad or missing DSS */
 	MPTCP_MIB_SIMULTCONNFALLBACK,	/* Simultaneous connect */
 	MPTCP_MIB_FALLBACKFAILED,	/* Can't fallback due to msk status */
+	MPTCP_MIB_RCVDELAYED,		/* Data move from subflow is delayed due to msk
+					 * receive buffer full
+					 */
+	MPTCP_MIB_DELAYED_PROCESS,	/* Delayed data moved in slowpath */
 	__MPTCP_MIB_MAX
 };
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9c3baed948d1d..f211a939daf39 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -665,13 +665,17 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 	tp = tcp_sk(ssk);
 	do {
 		int mem = own_msk ? sk_rmem_alloc_get(sk) : sk->sk_backlog.len;
+		bool over_limit = mem > READ_ONCE(sk->sk_rcvbuf);
 		u32 map_remaining, offset;
 		u32 seq = tp->copied_seq;
 		struct sk_buff *skb;
 		bool fin;
 
-		if (mem > READ_ONCE(sk->sk_rcvbuf))
+		WRITE_ONCE(subflow->data_delayed, over_limit);
+		if (subflow->data_delayed) {
+			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVDELAYED);
 			break;
+		}
 
 		/* try to move as much data as available */
 		map_remaining = subflow->map_data_len -
@@ -2081,32 +2085,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	msk->rcvq_space.time = mstamp;
 }
 
-static struct mptcp_subflow_context *
-__mptcp_first_ready_from(struct mptcp_sock *msk,
-			 struct mptcp_subflow_context *subflow)
-{
-	struct mptcp_subflow_context *start_subflow = subflow;
-
-	while (!READ_ONCE(subflow->data_avail)) {
-		subflow = mptcp_next_subflow(msk, subflow);
-		if (subflow == start_subflow)
-			return NULL;
-	}
-	return subflow;
-}
-
 static bool __mptcp_move_skbs(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool ret = false;
 
-	if (list_empty(&msk->conn_list))
-		return false;
-
-	subflow = list_first_entry(&msk->conn_list,
-				   struct mptcp_subflow_context, node);
-	for (;;) {
+	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk;
 		bool slowpath;
 
@@ -2117,23 +2102,22 @@ static bool __mptcp_move_skbs(struct sock *sk)
 		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
 			break;
 
-		subflow = __mptcp_first_ready_from(msk, subflow);
-		if (!subflow)
-			break;
+		if (!subflow->data_delayed)
+			continue;
 
 		ssk = mptcp_subflow_tcp_sock(subflow);
 		slowpath = lock_sock_fast(ssk);
-		ret = __mptcp_move_skbs_from_subflow(msk, ssk, true) || ret;
+		ret |= __mptcp_move_skbs_from_subflow(msk, ssk, true);
 		if (unlikely(ssk->sk_err))
 			__mptcp_error_report(sk);
 		unlock_sock_fast(ssk, slowpath);
-
-		subflow = mptcp_next_subflow(msk, subflow);
 	}
 
 	__mptcp_ofo_queue(msk);
-	if (ret)
+	if (ret) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DELAYED_PROCESS);
 		mptcp_check_data_fin((struct sock *)msk);
+	}
 	return ret;
 }
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a6e775d6412e5..2905e4b250070 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -559,6 +559,7 @@ struct mptcp_subflow_context {
 	u8	reset_transient:1;
 	u8	reset_reason:4;
 	u8	stale_count;
+	bool	data_delayed;
 
 	u32	subflow_id;
 
-- 
2.51.0
Re: [MPTCP next 10/12] mptcp: prevernt __mptcp_move_skbs() interferring with the fastpath
Posted by Geliang Tang 12 hours ago
Hi Paolo,

Thanks for this patch.

On Tue, 2025-09-16 at 18:27 +0200, Paolo Abeni wrote:
> skbs will be left waiting in the subflow only in exceptional cases,
> we want to avoid messing with the fast path by unintentionally
> processing in __mptcp_move_skbs() packets landed into the subflows
> after the last check.
> 
> Use a separate flag to mark delayed skbs and only process subflow
> with such flag set. Also add new mibs to track the exceptional
> events.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  net/mptcp/mib.c      |  2 ++
>  net/mptcp/mib.h      |  4 ++++
>  net/mptcp/protocol.c | 40 ++++++++++++----------------------------
>  net/mptcp/protocol.h |  1 +
>  4 files changed, 19 insertions(+), 28 deletions(-)
> 
> diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
> index cf879c188ca26..7af9d35cde884 100644
> --- a/net/mptcp/mib.c
> +++ b/net/mptcp/mib.c
> @@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
>  	SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
>  	SNMP_MIB_ITEM("SimultConnectFallback",
> MPTCP_MIB_SIMULTCONNFALLBACK),
>  	SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
> +	SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED),
> +	SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS),
>  	SNMP_MIB_SENTINEL

I guess this line was accidentally pasted, which prevents this patch
from being applied.

Thanks,
-Geliang

>  };
>  
> diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
> index 309bac6fea325..f6d0eaea463e5 100644
> --- a/net/mptcp/mib.h
> +++ b/net/mptcp/mib.h
> @@ -88,6 +88,10 @@ enum linux_mptcp_mib_field {
>  	MPTCP_MIB_DSSFALLBACK,		/* Bad or missing DSS */
>  	MPTCP_MIB_SIMULTCONNFALLBACK,	/* Simultaneous connect */
>  	MPTCP_MIB_FALLBACKFAILED,	/* Can't fallback due to msk
> status */
> +	MPTCP_MIB_RCVDELAYED,		/* Data move from subflow is
> delayed due to msk
> +					 * receive buffer full
> +					 */
> +	MPTCP_MIB_DELAYED_PROCESS,	/* Delayed data moved in
> slowpath */
>  	__MPTCP_MIB_MAX
>  };
>  
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 9c3baed948d1d..f211a939daf39 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -665,13 +665,17 @@ static bool
> __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  	tp = tcp_sk(ssk);
>  	do {
>  		int mem = own_msk ? sk_rmem_alloc_get(sk) : sk-
> >sk_backlog.len;
> +		bool over_limit = mem > READ_ONCE(sk->sk_rcvbuf);
>  		u32 map_remaining, offset;
>  		u32 seq = tp->copied_seq;
>  		struct sk_buff *skb;
>  		bool fin;
>  
> -		if (mem > READ_ONCE(sk->sk_rcvbuf))
> +		WRITE_ONCE(subflow->data_delayed, over_limit);
> +		if (subflow->data_delayed) {
> +			MPTCP_INC_STATS(sock_net(sk),
> MPTCP_MIB_RCVDELAYED);
>  			break;
> +		}
>  
>  		/* try to move as much data as available */
>  		map_remaining = subflow->map_data_len -
> @@ -2081,32 +2085,13 @@ static void mptcp_rcv_space_adjust(struct
> mptcp_sock *msk, int copied)
>  	msk->rcvq_space.time = mstamp;
>  }
>  
> -static struct mptcp_subflow_context *
> -__mptcp_first_ready_from(struct mptcp_sock *msk,
> -			 struct mptcp_subflow_context *subflow)
> -{
> -	struct mptcp_subflow_context *start_subflow = subflow;
> -
> -	while (!READ_ONCE(subflow->data_avail)) {
> -		subflow = mptcp_next_subflow(msk, subflow);
> -		if (subflow == start_subflow)
> -			return NULL;
> -	}
> -	return subflow;
> -}
> -
>  static bool __mptcp_move_skbs(struct sock *sk)
>  {
>  	struct mptcp_subflow_context *subflow;
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  	bool ret = false;
>  
> -	if (list_empty(&msk->conn_list))
> -		return false;
> -
> -	subflow = list_first_entry(&msk->conn_list,
> -				   struct mptcp_subflow_context,
> node);
> -	for (;;) {
> +	mptcp_for_each_subflow(msk, subflow) {
>  		struct sock *ssk;
>  		bool slowpath;
>  
> @@ -2117,23 +2102,22 @@ static bool __mptcp_move_skbs(struct sock
> *sk)
>  		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
>  			break;
>  
> -		subflow = __mptcp_first_ready_from(msk, subflow);
> -		if (!subflow)
> -			break;
> +		if (!subflow->data_delayed)
> +			continue;
>  
>  		ssk = mptcp_subflow_tcp_sock(subflow);
>  		slowpath = lock_sock_fast(ssk);
> -		ret = __mptcp_move_skbs_from_subflow(msk, ssk, true)
> || ret;
> +		ret |= __mptcp_move_skbs_from_subflow(msk, ssk,
> true);
>  		if (unlikely(ssk->sk_err))
>  			__mptcp_error_report(sk);
>  		unlock_sock_fast(ssk, slowpath);
> -
> -		subflow = mptcp_next_subflow(msk, subflow);
>  	}
>  
>  	__mptcp_ofo_queue(msk);
> -	if (ret)
> +	if (ret) {
> +		MPTCP_INC_STATS(sock_net(sk),
> MPTCP_MIB_DELAYED_PROCESS);
>  		mptcp_check_data_fin((struct sock *)msk);
> +	}
>  	return ret;
>  }
>  
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index a6e775d6412e5..2905e4b250070 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -559,6 +559,7 @@ struct mptcp_subflow_context {
>  	u8	reset_transient:1;
>  	u8	reset_reason:4;
>  	u8	stale_count;
> +	bool	data_delayed;
>  
>  	u32	subflow_id;
>  
Re: [MPTCP next 10/12] mptcp: prevernt __mptcp_move_skbs() interferring with the fastpath
Posted by Matthieu Baerts 3 hours ago
Hi Geliang, Paolo,

On 17/09/2025 05:39, Geliang Tang wrote:
> Hi Paolo,
> 
> Thanks for this patch.
> 
> On Tue, 2025-09-16 at 18:27 +0200, Paolo Abeni wrote:
>> skbs will be left waiting in the subflow only in exceptional cases,
>> we want to avoid messing with the fast path by unintentionally
>> processing in __mptcp_move_skbs() packets landed into the subflows
>> after the last check.
>>
>> Use a separate flag to mark delayed skbs and only process subflow
>> with such flag set. Also add new mibs to track the exceptional
>> events.
>>
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>> ---
>>  net/mptcp/mib.c      |  2 ++
>>  net/mptcp/mib.h      |  4 ++++
>>  net/mptcp/protocol.c | 40 ++++++++++++----------------------------
>>  net/mptcp/protocol.h |  1 +
>>  4 files changed, 19 insertions(+), 28 deletions(-)
>>
>> diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
>> index cf879c188ca26..7af9d35cde884 100644
>> --- a/net/mptcp/mib.c
>> +++ b/net/mptcp/mib.c
>> @@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
>>  	SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
>>  	SNMP_MIB_ITEM("SimultConnectFallback",
>> MPTCP_MIB_SIMULTCONNFALLBACK),
>>  	SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
>> +	SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED),
>> +	SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS),
>>  	SNMP_MIB_SENTINEL
> 
> I guess this line was accidentally pasted, which prevents this patch
> from being applied.

FYI, this line has been removed recently by commit 35cb2da0abaf ("mptcp:
snmp: do not use SNMP_MIB_SENTINEL anymore").

I manually fixed the conflict, and sent the series to the CI. I also
restarted my syzkaller instances to validate your series.

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.