[PATCH mptcp-next v8 4/4] mptcp: leverage the backlog for RX packet processing

Paolo Abeni posted 4 patches 3 weeks, 1 day ago
Posted by Paolo Abeni 3 weeks, 1 day ago
When the msk socket is owned or the msk receive buffer is full,
move the incoming skbs into an msk-level backlog list. This avoids
traversing the joined subflows and acquiring the subflow-level
socket lock at reception time, improving RX performance.

When processing the backlog, use the fwd alloc memory borrowed from
the incoming subflow. skbs exceeding the msk receive space are
not dropped; instead they are kept in the backlog until receive
buffer space is freed. Dropping packets already acked at the TCP
level is explicitly discouraged by the RFC and would corrupt the
data stream for fallback sockets.

Special care is needed to avoid adding skbs to the backlog of a closed
msk and to avoid leaving dangling references in the backlog
at subflow closing time.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v6 -> v7:
  - do not limit the overall backlog spooling loop; it's hard to do
    right, and the pre-backlog code did not bound the similar
    pre-existing loop either

v5 -> v6:
  - update the backlog length as soon as possible to advertise the
    correct window.
  - explicitly bound the backlog processing loop to the maximum
    backlog length

v4 -> v5:
  - consolidate ssk rcvbuf accounting in __mptcp_move_skb(), removing
    some code duplication
  - return early in __mptcp_add_backlog() when dropping skbs because
    the msk is closed. This avoids a later UaF
---
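
Not part of the patch: below is a stand-alone, simplified sketch of the RX
decision described in the commit message, for illustration only. All names
(fake_msk, fake_skb, rx_one_skb) and the byte accounting are made up; only
the "msk owned or rcvbuf full -> park in the backlog, otherwise move to the
receive queue, never drop" gating is meant to mirror the behaviour of
__mptcp_move_skbs_from_subflow() and __mptcp_add_backlog().

/* Simplified, user-space model of the new RX path; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct fake_skb {
	unsigned int truesize;
};

struct fake_msk {
	bool owned_by_user;		/* msk socket lock held by user context */
	unsigned int rmem_alloc;	/* bytes currently in the receive queue */
	unsigned int rcvbuf;		/* receive buffer limit */
	unsigned int backlog_len;	/* bytes parked in the backlog list */
};

/* Mirrors the "own_msk && rmem < rcvbuf" gate: move the skb straight into
 * the msk receive queue only when the msk is not owned and there is room;
 * otherwise park it in the backlog. It is never dropped, as the data is
 * already acked at the TCP level.
 */
static void rx_one_skb(struct fake_msk *msk, const struct fake_skb *skb)
{
	bool own_msk = !msk->owned_by_user;

	if (own_msk && msk->rmem_alloc < msk->rcvbuf)
		msk->rmem_alloc += skb->truesize;	/* direct receive */
	else
		msk->backlog_len += skb->truesize;	/* deferred to backlog */
}

int main(void)
{
	struct fake_msk msk = { .owned_by_user = false, .rcvbuf = 4096 };
	struct fake_skb skb = { .truesize = 2048 };

	rx_one_skb(&msk, &skb);	/* fits: goes to the receive queue */
	rx_one_skb(&msk, &skb);	/* still < rcvbuf: receive queue again */
	rx_one_skb(&msk, &skb);	/* rmem == rcvbuf now: parked in the backlog */

	printf("rmem=%u backlog=%u\n", msk.rmem_alloc, msk.backlog_len);
	return 0;
}
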
 net/mptcp/protocol.c | 121 ++++++++++++++++++++++++-------------------
 net/mptcp/protocol.h |   1 -
 2 files changed, 68 insertions(+), 54 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d075db13bb72..01114456dec6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -684,7 +684,7 @@ static void __mptcp_add_backlog(struct sock *sk,
 }
 
 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
-					   struct sock *ssk)
+					   struct sock *ssk, bool own_msk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct sock *sk = (struct sock *)msk;
@@ -700,9 +700,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		struct sk_buff *skb;
 		bool fin;
 
-		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
-			break;
-
 		/* try to move as much data as available */
 		map_remaining = subflow->map_data_len -
 				mptcp_subflow_get_map_offset(subflow);
@@ -730,7 +727,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 			mptcp_init_skb(ssk, skb, offset, len);
 
-			if (true) {
+			if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) {
 				mptcp_subflow_lend_fwdmem(subflow, skb);
 				ret |= __mptcp_move_skb(sk, skb);
 			} else {
@@ -854,7 +851,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	struct sock *sk = (struct sock *)msk;
 	bool moved;
 
-	moved = __mptcp_move_skbs_from_subflow(msk, ssk);
+	moved = __mptcp_move_skbs_from_subflow(msk, ssk, true);
 	__mptcp_ofo_queue(msk);
 	if (unlikely(ssk->sk_err))
 		__mptcp_subflow_error_report(sk, ssk);
@@ -887,7 +884,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 		if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
 			sk->sk_data_ready(sk);
 	} else {
-		__set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags);
+		__mptcp_move_skbs_from_subflow(msk, ssk, false);
 	}
 	mptcp_data_unlock(sk);
 }
@@ -2128,60 +2125,74 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	msk->rcvq_space.time = mstamp;
 }
 
-static struct mptcp_subflow_context *
-__mptcp_first_ready_from(struct mptcp_sock *msk,
-			 struct mptcp_subflow_context *subflow)
+static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta)
 {
-	struct mptcp_subflow_context *start_subflow = subflow;
+	struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list);
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	bool moved = false;
+
+	*delta = 0;
+	while (1) {
+		/* If the msk recvbuf is full stop, don't drop */
+		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
+			break;
+
+		prefetch(skb->next);
+		list_del(&skb->list);
+		*delta += skb->truesize;
+
+		moved |= __mptcp_move_skb(sk, skb);
+		if (list_empty(skbs))
+			break;
 
-	while (!READ_ONCE(subflow->data_avail)) {
-		subflow = mptcp_next_subflow(msk, subflow);
-		if (subflow == start_subflow)
-			return NULL;
+		skb = list_first_entry(skbs, struct sk_buff, list);
 	}
-	return subflow;
+
+	__mptcp_ofo_queue(msk);
+	if (moved)
+		mptcp_check_data_fin((struct sock *)msk);
+	return moved;
 }
 
-static bool __mptcp_move_skbs(struct sock *sk)
+static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs)
 {
-	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	bool ret = false;
 
-	if (list_empty(&msk->conn_list))
+	/* Don't spool the backlog if the rcvbuf is full. */
+	if (list_empty(&msk->backlog_list) ||
+	    sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
 		return false;
 
-	subflow = list_first_entry(&msk->conn_list,
-				   struct mptcp_subflow_context, node);
-	for (;;) {
-		struct sock *ssk;
-		bool slowpath;
+	INIT_LIST_HEAD(skbs);
+	list_splice_init(&msk->backlog_list, skbs);
+	return true;
+}
 
-		/*
-		 * As an optimization avoid traversing the subflows list
-		 * and ev. acquiring the subflow socket lock before baling out
-		 */
-		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
-			break;
+static void mptcp_backlog_spooled(struct sock *sk, u32 moved,
+				  struct list_head *skbs)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
 
-		subflow = __mptcp_first_ready_from(msk, subflow);
-		if (!subflow)
-			break;
+	WRITE_ONCE(msk->backlog_len, msk->backlog_len - moved);
+	list_splice(skbs, &msk->backlog_list);
+}
 
-		ssk = mptcp_subflow_tcp_sock(subflow);
-		slowpath = lock_sock_fast(ssk);
-		ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret;
-		if (unlikely(ssk->sk_err))
-			__mptcp_error_report(sk);
-		unlock_sock_fast(ssk, slowpath);
+static bool mptcp_move_skbs(struct sock *sk)
+{
+	struct list_head skbs;
+	bool enqueued = false;
+	u32 moved;
 
-		subflow = mptcp_next_subflow(msk, subflow);
-	}
+	mptcp_data_lock(sk);
+	while (mptcp_can_spool_backlog(sk, &skbs)) {
+		mptcp_data_unlock(sk);
+		enqueued |= __mptcp_move_skbs(sk, &skbs, &moved);
 
-	__mptcp_ofo_queue(msk);
-	if (ret)
-		mptcp_check_data_fin((struct sock *)msk);
-	return ret;
+		mptcp_data_lock(sk);
+		mptcp_backlog_spooled(sk, moved, &skbs);
+	}
+	mptcp_data_unlock(sk);
+	return enqueued;
 }
 
 static unsigned int mptcp_inq_hint(const struct sock *sk)
@@ -2247,7 +2258,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		copied += bytes_read;
 
-		if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk))
+		if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk))
 			continue;
 
 		/* only the MPTCP socket status is relevant here. The exit
@@ -3532,8 +3543,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 
 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
 				      BIT(MPTCP_RETRANSMIT) | \
-				      BIT(MPTCP_FLUSH_JOIN_LIST) | \
-				      BIT(MPTCP_DEQUEUE))
+				      BIT(MPTCP_FLUSH_JOIN_LIST))
 
 /* processes deferred events and flush wmem */
 static void mptcp_release_cb(struct sock *sk)
@@ -3543,9 +3553,12 @@ static void mptcp_release_cb(struct sock *sk)
 
 	for (;;) {
 		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
-		struct list_head join_list;
+		struct list_head join_list, skbs;
+		bool spool_bl;
+		u32 moved;
 
-		if (!flags)
+		spool_bl = mptcp_can_spool_backlog(sk, &skbs);
+		if (!flags && !spool_bl)
 			break;
 
 		INIT_LIST_HEAD(&join_list);
@@ -3567,7 +3580,7 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_push_pending(sk, 0);
 		if (flags & BIT(MPTCP_RETRANSMIT))
 			__mptcp_retrans(sk);
-		if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
+		if (spool_bl && __mptcp_move_skbs(sk, &skbs, &moved)) {
 			/* notify ack seq update */
 			mptcp_cleanup_rbuf(msk, 0);
 			sk->sk_data_ready(sk);
@@ -3575,6 +3588,8 @@ static void mptcp_release_cb(struct sock *sk)
 
 		cond_resched();
 		spin_lock_bh(&sk->sk_lock.slock);
+		if (spool_bl)
+			mptcp_backlog_spooled(sk, moved, &skbs);
 	}
 
 	if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
@@ -3807,7 +3822,7 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
 			return -EINVAL;
 
 		lock_sock(sk);
-		if (__mptcp_move_skbs(sk))
+		if (mptcp_move_skbs(sk))
 			mptcp_cleanup_rbuf(msk, 0);
 		*karg = mptcp_inq_hint(sk);
 		release_sock(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index ca43d345b65a..d0881db16b12 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -124,7 +124,6 @@
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_SYNC_STATE	6
 #define MPTCP_SYNC_SNDBUF	7
-#define MPTCP_DEQUEUE		8
 
 struct mptcp_skb_cb {
 	u64 map_seq;
-- 
2.51.0
Re: [PATCH mptcp-next v8 4/4] mptcp: leverage the backlog for RX packet processing
Posted by Matthieu Baerts 3 weeks, 1 day ago
Hi Paolo,

On 04/11/2025 16:59, Paolo Abeni wrote:
> When the msk socket is owned or the msk receive buffer is full,
> move the incoming skbs into an msk-level backlog list. This avoids
> traversing the joined subflows and acquiring the subflow-level
> socket lock at reception time, improving RX performance.
> 
> When processing the backlog, use the fwd alloc memory borrowed from
> the incoming subflow. skbs exceeding the msk receive space are
> not dropped; instead they are kept in the backlog until receive
> buffer space is freed. Dropping packets already acked at the TCP
> level is explicitly discouraged by the RFC and would corrupt the
> data stream for fallback sockets.
> 
> Special care is needed to avoid adding skbs to the backlog of a closed
> msk and to avoid leaving dangling references in the backlog
> at subflow closing time.

Thank you for the patches!

> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index d075db13bb72..01114456dec6 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -684,7 +684,7 @@ static void __mptcp_add_backlog(struct sock *sk,
>  }
>  
>  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> -					   struct sock *ssk)
> +					   struct sock *ssk, bool own_msk)
>  {
>  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
>  	struct sock *sk = (struct sock *)msk;
> @@ -700,9 +700,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  		struct sk_buff *skb;
>  		bool fin;
>  
> -		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)

I don't think that's important, but I prefer to report it just in case:

I see that you moved this condition from here ...

> -			break;
> -
>  		/* try to move as much data as available */
>  		map_remaining = subflow->map_data_len -
>  				mptcp_subflow_get_map_offset(subflow);
> @@ -730,7 +727,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  
>  			mptcp_init_skb(ssk, skb, offset, len);
>  
> -			if (true) {
> +			if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) {

... to here, but almost the opposite: '>' → '<'. Fine with '<', or
should it be '<='?

But I guess we will not often hit just the exact number.

>  				mptcp_subflow_lend_fwdmem(subflow, skb);
>  				ret |= __mptcp_move_skb(sk, skb);
>  			} else {

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.

Re: [PATCH mptcp-next v8 4/4] mptcp: leverage the backlog for RX packet processing
Posted by Paolo Abeni 3 weeks ago

On 11/5/25 11:43 AM, Matthieu Baerts wrote:
> Hi Paolo,
> 
> On 04/11/2025 16:59, Paolo Abeni wrote:
>> When the msk socket is owned or the msk receive buffer is full,
>> move the incoming skbs into an msk-level backlog list. This avoids
>> traversing the joined subflows and acquiring the subflow-level
>> socket lock at reception time, improving RX performance.
>>
>> When processing the backlog, use the fwd alloc memory borrowed from
>> the incoming subflow. skbs exceeding the msk receive space are
>> not dropped; instead they are kept in the backlog until receive
>> buffer space is freed. Dropping packets already acked at the TCP
>> level is explicitly discouraged by the RFC and would corrupt the
>> data stream for fallback sockets.
>>
>> Special care is needed to avoid adding skbs to the backlog of a closed
>> msk and to avoid leaving dangling references in the backlog
>> at subflow closing time.
> 
> Thank you for the patches!
> 
>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>> index d075db13bb72..01114456dec6 100644
>> --- a/net/mptcp/protocol.c
>> +++ b/net/mptcp/protocol.c
>> @@ -684,7 +684,7 @@ static void __mptcp_add_backlog(struct sock *sk,
>>  }
>>  
>>  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>> -					   struct sock *ssk)
>> +					   struct sock *ssk, bool own_msk)
>>  {
>>  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
>>  	struct sock *sk = (struct sock *)msk;
>> @@ -700,9 +700,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>>  		struct sk_buff *skb;
>>  		bool fin;
>>  
>> -		if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
> 
> I don't think that's important, but I prefer to report it just in case:
> 
> I see that you moved this condition from here ...
> 
>> -			break;
>> -
>>  		/* try to move as much data as available */
>>  		map_remaining = subflow->map_data_len -
>>  				mptcp_subflow_get_map_offset(subflow);
>> @@ -730,7 +727,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>>  
>>  			mptcp_init_skb(ssk, skb, offset, len);
>>  
>> -			if (true) {
>> +			if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) {
> 
> ... to here, but almost the opposite: '>' → '<'. Fine with '<', or
> should it be '<='?
> 
> But I guess we will not often hit just the exact number.

Actually the new test is more correct than the old one: if the rcvbuf is
full we should not add more data to the msk receive queue.

Arguably a better test would be one akin to tcp_can_ingest(), but that
change caused several regressions and I would rather wait a bit before
porting it to mptcp.

/P
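
For reference, a minimal stand-alone illustration (made-up numbers, not
kernel code) of the boundary case discussed above: with rmem == rcvbuf,
the old "rmem > rcvbuf" break test would still admit data into the
receive queue, while the new "rmem < rcvbuf" gate diverts it to the
backlog instead.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned int rcvbuf = 4096;
	unsigned int rmem = 4096;	/* receive buffer exactly full */

	/* old test: break only when rmem > rcvbuf, so rmem == rcvbuf
	 * still moved data into the receive queue
	 */
	bool old_moves_to_rcvqueue = !(rmem > rcvbuf);

	/* new test: move to the receive queue only while rmem < rcvbuf,
	 * so the exactly-full case goes to the backlog
	 */
	bool new_moves_to_rcvqueue = rmem < rcvbuf;

	printf("old test moves to receive queue: %d\n", old_moves_to_rcvqueue);
	printf("new test moves to receive queue: %d\n", new_moves_to_rcvqueue);
	return 0;
}
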