[PATCH v5 mptcp-next 10/10] mptcp: leverage the backlog for RX packet processing

Posted by Paolo Abeni 4 months ago
When the msk socket is owned or the msk receive buffer is full,
move the incoming skbs into a msk-level backlog list. This avoids
traversing the joined subflows and acquiring the subflow-level
socket lock at reception time, improving RX performance.

When processing the backlog, use the fwd alloc memory borrowed from
the incoming subflow. skbs exceeding the msk receive space are
not dropped; instead they are kept in the backlog until the receive
buffer is freed. Dropping packets already acked at the TCP level is
explicitly discouraged by the RFC and would corrupt the data stream
for fallback sockets.
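
For clarity, the per-skb decision boils down to the following
(condensed from the __mptcp_move_skbs_from_subflow() hunk below;
error handling omitted):

	bmem = mptcp_init_skb(ssk, skb, offset, len);
	if (own_msk)
		sk_forward_alloc_add(sk, bmem);
	else
		msk->borrowed_mem += bmem;	/* credited back in mptcp_release_cb() */

	if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf)
		ret |= __mptcp_move_skb(sk, skb);	/* straight to the msk receive queue */
	else
		__mptcp_add_backlog(sk, skb);		/* park in the msk-level backlog */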

Special care is needed to avoid adding skbs to the backlog of a closed
msk, and to avoid leaving dangling references in the backlog
at subflow closing time.
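
The latter amounts to re-owning any backlogged skb still charged to the
closing subflow (condensed from the mptcp_close_ssk() hunk below):

	list_for_each_entry(skb, &msk->backlog_list, list) {
		if (skb->sk != ssk)
			continue;

		/* move the rmem charge from the closing ssk to the msk */
		atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
		atomic_add(skb->truesize, &sk->sk_rmem_alloc);
		skb->sk = sk;
	}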

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v4 -> v5:
  - consolidate ssk rcvbuf accounting in __mptcp_move_skb(), removing
    some code duplication
  - return early in __mptcp_add_backlog() when dropping skbs due to
    the msk being closed. This avoids a later use-after-free
---
 net/mptcp/protocol.c | 137 ++++++++++++++++++++++++-------------------
 net/mptcp/protocol.h |   2 +-
 2 files changed, 79 insertions(+), 60 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 2d5d3da67d1ac..a97a92eccc502 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -696,7 +696,7 @@ static void __mptcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 }
 
 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
-					   struct sock *ssk)
+					   struct sock *ssk, bool own_msk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct sock *sk = (struct sock *)msk;
@@ -712,9 +712,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		struct sk_buff *skb;
 		bool fin;
 
-		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
-			break;
-
 		/* try to move as much data as available */
 		map_remaining = subflow->map_data_len -
 				mptcp_subflow_get_map_offset(subflow);
@@ -742,9 +739,12 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 			int bmem;
 
 			bmem = mptcp_init_skb(ssk, skb, offset, len);
-			sk_forward_alloc_add(sk, bmem);
+			if (own_msk)
+				sk_forward_alloc_add(sk, bmem);
+			else
+				msk->borrowed_mem += bmem;
 
-			if (true)
+			if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf)
 				ret |= __mptcp_move_skb(sk, skb);
 			else
 				__mptcp_add_backlog(sk, skb);
@@ -866,7 +866,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	struct sock *sk = (struct sock *)msk;
 	bool moved;
 
-	moved = __mptcp_move_skbs_from_subflow(msk, ssk);
+	moved = __mptcp_move_skbs_from_subflow(msk, ssk, true);
 	__mptcp_ofo_queue(msk);
 	if (unlikely(ssk->sk_err))
 		__mptcp_subflow_error_report(sk, ssk);
@@ -898,9 +898,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 		/* Wake-up the reader only for in-sequence data */
 		if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
 			sk->sk_data_ready(sk);
-		
 	} else {
-		__set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags);
+		__mptcp_move_skbs_from_subflow(msk, ssk, false);
 	}
 	mptcp_data_unlock(sk);
 }
@@ -2135,60 +2134,56 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	msk->rcvq_space.time = mstamp;
 }
 
-static struct mptcp_subflow_context *
-__mptcp_first_ready_from(struct mptcp_sock *msk,
-			 struct mptcp_subflow_context *subflow)
-{
-	struct mptcp_subflow_context *start_subflow = subflow;
-
-	while (!READ_ONCE(subflow->data_avail)) {
-		subflow = mptcp_next_subflow(msk, subflow);
-		if (subflow == start_subflow)
-			return NULL;
-	}
-	return subflow;
-}
-
-static bool __mptcp_move_skbs(struct sock *sk)
+static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta)
 {
-	struct mptcp_subflow_context *subflow;
+	struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list);
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	bool ret = false;
-
-	if (list_empty(&msk->conn_list))
-		return false;
-
-	subflow = list_first_entry(&msk->conn_list,
-				   struct mptcp_subflow_context, node);
-	for (;;) {
-		struct sock *ssk;
-		bool slowpath;
+	bool moved = false;
 
-		/*
-		 * As an optimization avoid traversing the subflows list
-		 * and ev. acquiring the subflow socket lock before baling out
-		 */
+	while (1) {
+		/* If the msk rcvbuf is full, stop; don't drop */
 		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
 			break;
 
-		subflow = __mptcp_first_ready_from(msk, subflow);
-		if (!subflow)
-			break;
+		prefetch(skb->next);
+		list_del(&skb->list);
+		*delta += skb->truesize;
 
-		ssk = mptcp_subflow_tcp_sock(subflow);
-		slowpath = lock_sock_fast(ssk);
-		ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret;
-		if (unlikely(ssk->sk_err))
-			__mptcp_error_report(sk);
-		unlock_sock_fast(ssk, slowpath);
+		moved |= __mptcp_move_skb(sk, skb);
+		if (list_empty(skbs))
+			break;
 
-		subflow = mptcp_next_subflow(msk, subflow);
+		skb = list_first_entry(skbs, struct sk_buff, list);
 	}
 
 	__mptcp_ofo_queue(msk);
-	if (ret)
+	if (moved)
 		mptcp_check_data_fin((struct sock *)msk);
-	return ret;
+	return moved;
+}
+
+static bool mptcp_move_skbs(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	bool moved = false;
+	LIST_HEAD(skbs);
+	u32 delta = 0;
+
+	mptcp_data_lock(sk);
+	while (!list_empty(&msk->backlog_list)) {
+		list_splice_init(&msk->backlog_list, &skbs);
+		mptcp_data_unlock(sk);
+		moved |= __mptcp_move_skbs(sk, &skbs, &delta);
+
+		mptcp_data_lock(sk);
+		if (!list_empty(&skbs)) {
+			list_splice(&skbs, &msk->backlog_list);
+			break;
+		}
+	}
+	WRITE_ONCE(msk->backlog_len, msk->backlog_len - delta);
+	mptcp_data_unlock(sk);
+	return moved;
 }
 
 static unsigned int mptcp_inq_hint(const struct sock *sk)
@@ -2254,7 +2249,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		copied += bytes_read;
 
-		if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk))
+		if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk))
 			continue;
 
 		/* only the MPTCP socket status is relevant here. The exit
@@ -2559,6 +2554,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		     struct mptcp_subflow_context *subflow)
 {
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *skb;
+
 	/* The first subflow can already be closed and still in the list */
 	if (subflow->close_event_done)
 		return;
@@ -2568,6 +2566,18 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 	if (sk->sk_state == TCP_ESTABLISHED)
 		mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
 
+	/* Remove any reference from the backlog to this ssk, accounting the
+	 * related skb directly to the main socket
+	 */
+	list_for_each_entry(skb, &msk->backlog_list, list) {
+		if (skb->sk != ssk)
+			continue;
+
+		atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
+		atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+		skb->sk = sk;
+	}
+
 	/* subflow aborted before reaching the fully_established status
 	 * attempt the creation of the next subflow
 	 */
@@ -3509,23 +3519,29 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 
 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
 				      BIT(MPTCP_RETRANSMIT) | \
-				      BIT(MPTCP_FLUSH_JOIN_LIST) | \
-				      BIT(MPTCP_DEQUEUE))
+				      BIT(MPTCP_FLUSH_JOIN_LIST))
 
 /* processes deferred events and flush wmem */
 static void mptcp_release_cb(struct sock *sk)
 	__must_hold(&sk->sk_lock.slock)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	u32 delta = 0;
 
 	for (;;) {
 		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
-		struct list_head join_list;
+		LIST_HEAD(join_list);
+		LIST_HEAD(skbs);
+
+		sk_forward_alloc_add(sk, msk->borrowed_mem);
+		msk->borrowed_mem = 0;
+
+		if (sk_rmem_alloc_get(sk) < sk->sk_rcvbuf)
+			list_splice_init(&msk->backlog_list, &skbs);
 
-		if (!flags)
+		if (!flags && list_empty(&skbs))
 			break;
 
-		INIT_LIST_HEAD(&join_list);
 		list_splice_init(&msk->join_list, &join_list);
 
 		/* the following actions acquire the subflow socket lock
@@ -3544,7 +3560,8 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_push_pending(sk, 0);
 		if (flags & BIT(MPTCP_RETRANSMIT))
 			__mptcp_retrans(sk);
-		if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
+		if (!list_empty(&skbs) &&
+		    __mptcp_move_skbs(sk, &skbs, &delta)) {
 			/* notify ack seq update */
 			mptcp_cleanup_rbuf(msk, 0);
 			sk->sk_data_ready(sk);
@@ -3552,7 +3569,9 @@ static void mptcp_release_cb(struct sock *sk)
 
 		cond_resched();
 		spin_lock_bh(&sk->sk_lock.slock);
+		list_splice(&skbs, &msk->backlog_list);
 	}
+	WRITE_ONCE(msk->backlog_len, msk->backlog_len - delta);
 
 	if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
 		__mptcp_clean_una_wakeup(sk);
@@ -3784,7 +3803,7 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
 			return -EINVAL;
 
 		lock_sock(sk);
-		if (__mptcp_move_skbs(sk))
+		if (mptcp_move_skbs(sk))
 			mptcp_cleanup_rbuf(msk, 0);
 		*karg = mptcp_inq_hint(sk);
 		release_sock(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a21c4955f4cfb..cfabda66e7ac4 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -124,7 +124,6 @@
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_SYNC_STATE	6
 #define MPTCP_SYNC_SNDBUF	7
-#define MPTCP_DEQUEUE		8
 
 struct mptcp_skb_cb {
 	u64 map_seq;
@@ -301,6 +300,7 @@ struct mptcp_sock {
 	u32		last_ack_recv;
 	unsigned long	timer_ival;
 	u32		token;
+	u32		borrowed_mem;
 	unsigned long	flags;
 	unsigned long	cb_flags;
 	bool		recovery;		/* closing subflow write queue reinjected */
-- 
2.51.0
Re: [PATCH v5 mptcp-next 10/10] mptcp: leverage the backlog for RX packet processing
Posted by Mat Martineau 3 months, 2 weeks ago
On Mon, 6 Oct 2025, Paolo Abeni wrote:

> When the msk socket is owned or the msk receive buffer is full,
> move the incoming skbs into a msk-level backlog list. This avoids
> traversing the joined subflows and acquiring the subflow-level
> socket lock at reception time, improving RX performance.
>
> When processing the backlog, use the fwd alloc memory borrowed from
> the incoming subflow. skbs exceeding the msk receive space are
> not dropped; instead they are kept in the backlog until the receive
> buffer is freed. Dropping packets already acked at the TCP level is
> explicitly discouraged by the RFC and would corrupt the data stream
> for fallback sockets.
>
> Special care is needed to avoid adding skbs to the backlog of a closed
> msk, and to avoid leaving dangling references in the backlog
> at subflow closing time.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v4 -> v5:
>  - consolidate ssk rcvbuf accounting in __mptcp_move_skb(), removing
>    some code duplication
>  - return early in __mptcp_add_backlog() when dropping skbs due to
>    the msk being closed. This avoids a later use-after-free
> ---
> net/mptcp/protocol.c | 137 ++++++++++++++++++++++++-------------------
> net/mptcp/protocol.h |   2 +-
> 2 files changed, 79 insertions(+), 60 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 2d5d3da67d1ac..a97a92eccc502 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c

...

> @@ -3509,23 +3519,29 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
>
> #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
> 				      BIT(MPTCP_RETRANSMIT) | \
> -				      BIT(MPTCP_FLUSH_JOIN_LIST) | \
> -				      BIT(MPTCP_DEQUEUE))
> +				      BIT(MPTCP_FLUSH_JOIN_LIST))
>
> /* processes deferred events and flush wmem */
> static void mptcp_release_cb(struct sock *sk)
> 	__must_hold(&sk->sk_lock.slock)
> {
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> +	u32 delta = 0;
>
> 	for (;;) {
> 		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
> -		struct list_head join_list;
> +		LIST_HEAD(join_list);
> +		LIST_HEAD(skbs);
> +
> +		sk_forward_alloc_add(sk, msk->borrowed_mem);
> +		msk->borrowed_mem = 0;
> +
> +		if (sk_rmem_alloc_get(sk) < sk->sk_rcvbuf)
> +			list_splice_init(&msk->backlog_list, &skbs);
>
> -		if (!flags)
> +		if (!flags && list_empty(&skbs))
> 			break;
>
> -		INIT_LIST_HEAD(&join_list);
> 		list_splice_init(&msk->join_list, &join_list);
>
> 		/* the following actions acquire the subflow socket lock
> @@ -3544,7 +3560,8 @@ static void mptcp_release_cb(struct sock *sk)
> 			__mptcp_push_pending(sk, 0);
> 		if (flags & BIT(MPTCP_RETRANSMIT))
> 			__mptcp_retrans(sk);
> -		if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
> +		if (!list_empty(&skbs) &&
> +		    __mptcp_move_skbs(sk, &skbs, &delta)) {
> 			/* notify ack seq update */
> 			mptcp_cleanup_rbuf(msk, 0);
> 			sk->sk_data_ready(sk);
> @@ -3552,7 +3569,9 @@ static void mptcp_release_cb(struct sock *sk)
>
> 		cond_resched();
> 		spin_lock_bh(&sk->sk_lock.slock);
> +		list_splice(&skbs, &msk->backlog_list);
> 	}
> +	WRITE_ONCE(msk->backlog_len, msk->backlog_len - delta);

Hi Paolo -

Given the possible multiple calls to __mptcp_move_skbs() and that the 
spinlock is released/reacquired (and the cond_resched) in the middle, 
would it make sense to update msk->backlog_len for each iteration of the 
loop so __mptcp_space() and mptcp_space() don't under-report available 
space and mptcp_cleanup_rbuf() can make incremental progress?

I know we don't want to WRITE_ONCE() more than necessary, but it seems 
like there won't typically be more than one loop iteration. In the cases 
where it does repeat the loop that means data is arriving quickly and 
reporting mptcp_space accurately will be important.

- Mat


>
> 	if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
> 		__mptcp_clean_una_wakeup(sk);
Re: [PATCH v5 mptcp-next 10/10] mptcp: leverage the backlog for RX packet processing
Posted by Paolo Abeni 3 months, 2 weeks ago
On 10/21/25 1:32 AM, Mat Martineau wrote:
> On Mon, 6 Oct 2025, Paolo Abeni wrote:
>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>> index 2d5d3da67d1ac..a97a92eccc502 100644
>> --- a/net/mptcp/protocol.c
>> +++ b/net/mptcp/protocol.c
> 
> ...
> 
>> @@ -3509,23 +3519,29 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
>>
>> #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
>> 				      BIT(MPTCP_RETRANSMIT) | \
>> -				      BIT(MPTCP_FLUSH_JOIN_LIST) | \
>> -				      BIT(MPTCP_DEQUEUE))
>> +				      BIT(MPTCP_FLUSH_JOIN_LIST))
>>
>> /* processes deferred events and flush wmem */
>> static void mptcp_release_cb(struct sock *sk)
>> 	__must_hold(&sk->sk_lock.slock)
>> {
>> 	struct mptcp_sock *msk = mptcp_sk(sk);
>> +	u32 delta = 0;
>>
>> 	for (;;) {
>> 		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
>> -		struct list_head join_list;
>> +		LIST_HEAD(join_list);
>> +		LIST_HEAD(skbs);
>> +
>> +		sk_forward_alloc_add(sk, msk->borrowed_mem);
>> +		msk->borrowed_mem = 0;
>> +
>> +		if (sk_rmem_alloc_get(sk) < sk->sk_rcvbuf)
>> +			list_splice_init(&msk->backlog_list, &skbs);
>>
>> -		if (!flags)
>> +		if (!flags && list_empty(&skbs))
>> 			break;
>>
>> -		INIT_LIST_HEAD(&join_list);
>> 		list_splice_init(&msk->join_list, &join_list);
>>
>> 		/* the following actions acquire the subflow socket lock
>> @@ -3544,7 +3560,8 @@ static void mptcp_release_cb(struct sock *sk)
>> 			__mptcp_push_pending(sk, 0);
>> 		if (flags & BIT(MPTCP_RETRANSMIT))
>> 			__mptcp_retrans(sk);
>> -		if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
>> +		if (!list_empty(&skbs) &&
>> +		    __mptcp_move_skbs(sk, &skbs, &delta)) {
>> 			/* notify ack seq update */
>> 			mptcp_cleanup_rbuf(msk, 0);
>> 			sk->sk_data_ready(sk);
>> @@ -3552,7 +3569,9 @@ static void mptcp_release_cb(struct sock *sk)
>>
>> 		cond_resched();
>> 		spin_lock_bh(&sk->sk_lock.slock);
>> +		list_splice(&skbs, &msk->backlog_list);
>> 	}
>> +	WRITE_ONCE(msk->backlog_len, msk->backlog_len - delta);
> 
> Hi Paolo -
> 
> Given the possible multiple calls to __mptcp_move_skbs() and that the 
> spinlock is released/reacquired (and the cond_resched) in the middle, 
> would it make sense to update msk->backlog_len for each iteration of the 
> loop so __mptcp_space() and mptcp_space() don't under-report available 
> space and mptcp_cleanup_rbuf() can make incremental progress?
> 
> I know we don't want to WRITE_ONCE() more than necessary, but it seems 
> like there won't typically be more than one loop iteration. In the cases 
> where it does repeat the loop that means data is arriving quickly and 
> reporting mptcp_space accurately will be important.

That WRITE_ONCE() is intentionally outside the loop, not as an
optimization but for a functional goal, similar to:

https://elixir.bootlin.com/linux/v6.17.4/source/net/core/sock.c#L3190

Without it, in exceptional situations, the loop could run for an
unbounded amount of time.
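
For reference, the relevant part of __release_sock() looks roughly like
this (condensed and quoted from memory, so take the details with a
grain of salt):

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);
		/* ... process the detached skbs ... */
		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;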

Given this is MPTCP-level, and the packets have already gone through
the TCP subflow, such a scenario is arguably even more unlikely, but I
think it's still possible and serious enough that we want to avoid it.

WRT receive buffer utilization, it should not change much: whether on
the backlog or in the receive buffer, the skb should be accounted for
its whole truesize.

Cheers,

Paolo
Re: [PATCH v5 mptcp-next 10/10] mptcp: leverage the backlog for RX packet processing
Posted by Mat Martineau 3 months, 2 weeks ago
On Tue, 21 Oct 2025, Paolo Abeni wrote:

> On 10/21/25 1:32 AM, Mat Martineau wrote:
>> On Mon, 6 Oct 2025, Paolo Abeni wrote:
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index 2d5d3da67d1ac..a97a92eccc502 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>
>> ...
>>
>>> @@ -3509,23 +3519,29 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
>>>
>>> #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
>>> 				      BIT(MPTCP_RETRANSMIT) | \
>>> -				      BIT(MPTCP_FLUSH_JOIN_LIST) | \
>>> -				      BIT(MPTCP_DEQUEUE))
>>> +				      BIT(MPTCP_FLUSH_JOIN_LIST))
>>>
>>> /* processes deferred events and flush wmem */
>>> static void mptcp_release_cb(struct sock *sk)
>>> 	__must_hold(&sk->sk_lock.slock)
>>> {
>>> 	struct mptcp_sock *msk = mptcp_sk(sk);
>>> +	u32 delta = 0;
>>>
>>> 	for (;;) {
>>> 		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
>>> -		struct list_head join_list;
>>> +		LIST_HEAD(join_list);
>>> +		LIST_HEAD(skbs);
>>> +
>>> +		sk_forward_alloc_add(sk, msk->borrowed_mem);
>>> +		msk->borrowed_mem = 0;
>>> +
>>> +		if (sk_rmem_alloc_get(sk) < sk->sk_rcvbuf)
>>> +			list_splice_init(&msk->backlog_list, &skbs);
>>>
>>> -		if (!flags)
>>> +		if (!flags && list_empty(&skbs))
>>> 			break;
>>>
>>> -		INIT_LIST_HEAD(&join_list);
>>> 		list_splice_init(&msk->join_list, &join_list);
>>>
>>> 		/* the following actions acquire the subflow socket lock
>>> @@ -3544,7 +3560,8 @@ static void mptcp_release_cb(struct sock *sk)
>>> 			__mptcp_push_pending(sk, 0);
>>> 		if (flags & BIT(MPTCP_RETRANSMIT))
>>> 			__mptcp_retrans(sk);
>>> -		if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
>>> +		if (!list_empty(&skbs) &&
>>> +		    __mptcp_move_skbs(sk, &skbs, &delta)) {
>>> 			/* notify ack seq update */
>>> 			mptcp_cleanup_rbuf(msk, 0);
>>> 			sk->sk_data_ready(sk);
>>> @@ -3552,7 +3569,9 @@ static void mptcp_release_cb(struct sock *sk)
>>>
>>> 		cond_resched();
>>> 		spin_lock_bh(&sk->sk_lock.slock);
>>> +		list_splice(&skbs, &msk->backlog_list);
>>> 	}
>>> +	WRITE_ONCE(msk->backlog_len, msk->backlog_len - delta);
>>
>> Hi Paolo -
>>
>> Given the possible multiple calls to __mptcp_move_skbs() and that the
>> spinlock is released/reacquired (and the cond_resched) in the middle,
>> would it make sense to update msk->backlog_len for each iteration of the
>> loop so __mptcp_space() and mptcp_space() don't under-report available
>> space and mptcp_cleanup_rbuf() can make incremental progress?
>>
>> I know we don't want to WRITE_ONCE() more than necessary, but it seems
>> like there won't typically be more than one loop iteration. In the cases
>> where it does repeat the loop that means data is arriving quickly and
>> reporting mptcp_space accurately will be important.
>
> That WRITE_ONCE() is intentionally out of the loop, but not as an
> optimization, but for a functional goal, similar to:
>
> https://elixir.bootlin.com/linux/v6.17.4/source/net/core/sock.c#L3190
>
> without it, in exceptional situation, the loop could run for an
> unbounded amount of time.
>
> Given this is MPTCP-level, and packets went already through the TCP
> subflow possibly such scenario is even more unlikely, but I think it's
> still possible and serious enough we want to avoid it.
>

Ah, I think I see where that could happen if the received packets get 
discarded and don't get counted against the rcv buffer limits.  "Double 
counting" the most recently moved packets using backlog_len creates the 
temporary appearance of no buffer space in that "wild producer" scenario. 
Still applies at the MPTCP level, I agree.

> WRT the receive buffer utilization, it should not change much, as either
> on the backlog or in the receiver buffer, the skb should be accounted
> for the whole truesize.

My concern is mostly that the moved skbs are accounted twice until the 
loop is finally exited, so in the non-adversarial case where packets are 
accepted there would be an impact on the behavior of the rx window during 
high speed transfers.

If the overall delta was used for the purpose of exiting the loop, the 
backlog_len could be updated on each loop iteration while still keeping 
the loop bounded.
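
Something along these lines, just as a rough, untested sketch of the
idea (names and the exact bound are illustrative, not a concrete
proposal):

	u32 moved_total = 0, delta;

	for (;;) {
		delta = 0;
		/* ... existing release_cb body, including
		 * __mptcp_move_skbs(sk, &skbs, &delta) ...
		 */

		spin_lock_bh(&sk->sk_lock.slock);
		list_splice(&skbs, &msk->backlog_list);
		/* publish the progress on every iteration ... */
		WRITE_ONCE(msk->backlog_len, msk->backlog_len - delta);

		/* ... but still bound the total work done in one run */
		moved_total += delta;
		if (moved_total > READ_ONCE(sk->sk_rcvbuf))
			break;
	}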

Taking into consideration the expected case of high-throughput behavior 
vs. an unexpected/infrequent "wild producer" scenario, does it still seem 
best to keep the above code as-is? I'll see what you decide in v6 :)

- Mat