skbs will be left waiting in the subflow only in exceptional cases;
we want to avoid messing with the fast path by unintentionally
processing in __mptcp_move_skbs() packets that landed in the subflows
after the last check.
Use a separate flag to mark delayed skbs and only process subflows
with such a flag set. Also add new MIBs to track these exceptional events.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- rebased
---
net/mptcp/mib.c | 2 ++
net/mptcp/mib.h | 4 ++++
net/mptcp/protocol.c | 40 ++++++++++++----------------------------
net/mptcp/protocol.h | 1 +
4 files changed, 19 insertions(+), 28 deletions(-)
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 6003e47c770a7..ac5ccf81159de 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
+ SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED),
+ SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS),
};
/* mptcp_mib_alloc - allocate percpu mib counters
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 309bac6fea325..f6d0eaea463e5 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -88,6 +88,10 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */
MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */
MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */
+ MPTCP_MIB_RCVDELAYED, /* Data move from subflow is delayed due to msk
+ * receive buffer full
+ */
+ MPTCP_MIB_DELAYED_PROCESS, /* Delayed data moved in slowpath */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 201e6ac5fe631..2a025c0c4ca0c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -681,13 +681,17 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
tp = tcp_sk(ssk);
do {
int mem = own_msk ? sk_rmem_alloc_get(sk) : sk->sk_backlog.len;
+ bool over_limit = mem > READ_ONCE(sk->sk_rcvbuf);
u32 map_remaining, offset;
u32 seq = tp->copied_seq;
struct sk_buff *skb;
bool fin;
- if (mem > READ_ONCE(sk->sk_rcvbuf))
+ WRITE_ONCE(subflow->data_delayed, over_limit);
+ if (subflow->data_delayed) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVDELAYED);
break;
+ }
/* try to move as much data as available */
map_remaining = subflow->map_data_len -
@@ -2113,32 +2117,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
msk->rcvq_space.time = mstamp;
}
-static struct mptcp_subflow_context *
-__mptcp_first_ready_from(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- struct mptcp_subflow_context *start_subflow = subflow;
-
- while (!READ_ONCE(subflow->data_avail)) {
- subflow = mptcp_next_subflow(msk, subflow);
- if (subflow == start_subflow)
- return NULL;
- }
- return subflow;
-}
-
static bool __mptcp_move_skbs(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool ret = false;
- if (list_empty(&msk->conn_list))
- return false;
-
- subflow = list_first_entry(&msk->conn_list,
- struct mptcp_subflow_context, node);
- for (;;) {
+ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
bool slowpath;
@@ -2149,23 +2134,22 @@ static bool __mptcp_move_skbs(struct sock *sk)
if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
break;
- subflow = __mptcp_first_ready_from(msk, subflow);
- if (!subflow)
- break;
+ if (!subflow->data_delayed)
+ continue;
ssk = mptcp_subflow_tcp_sock(subflow);
slowpath = lock_sock_fast(ssk);
- ret = __mptcp_move_skbs_from_subflow(msk, ssk, true) || ret;
+ ret |= __mptcp_move_skbs_from_subflow(msk, ssk, true);
if (unlikely(ssk->sk_err))
__mptcp_error_report(sk);
unlock_sock_fast(ssk, slowpath);
-
- subflow = mptcp_next_subflow(msk, subflow);
}
__mptcp_ofo_queue(msk);
- if (ret)
+ if (ret) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DELAYED_PROCESS);
mptcp_check_data_fin((struct sock *)msk);
+ }
return ret;
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 7bfd4e0d21a8a..a295ce11774ea 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -560,6 +560,7 @@ struct mptcp_subflow_context {
u8 reset_transient:1;
u8 reset_reason:4;
u8 stale_count;
+ bool data_delayed;
u32 subflow_id;
--
2.51.0
On Fri, 2025-09-19 at 17:53 +0200, Paolo Abeni wrote:
> skbs will be left waiting in the subflow only in exceptional cases,
> we want to avoid messing with the fast path by unintentionally
> processing in __mptcp_move_skbs() packets landed into the subflows
> after the last check.
>
> Use a separate flag to mark delayed skbs and only process subflow
> with such flag set. Also add new mibs to track the exceptional
> events.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
LGTM!
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Thanks,
-Geliang
> ---
> v1 -> v2:
> - rebased
> ---
> net/mptcp/mib.c | 2 ++
> net/mptcp/mib.h | 4 ++++
> net/mptcp/protocol.c | 40 ++++++++++++----------------------------
> net/mptcp/protocol.h | 1 +
> 4 files changed, 19 insertions(+), 28 deletions(-)
>
> diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
> index 6003e47c770a7..ac5ccf81159de 100644
> --- a/net/mptcp/mib.c
> +++ b/net/mptcp/mib.c
> @@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
> SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
> SNMP_MIB_ITEM("SimultConnectFallback",
> MPTCP_MIB_SIMULTCONNFALLBACK),
> SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
> + SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED),
> + SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS),
> };
>
> /* mptcp_mib_alloc - allocate percpu mib counters
> diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
> index 309bac6fea325..f6d0eaea463e5 100644
> --- a/net/mptcp/mib.h
> +++ b/net/mptcp/mib.h
> @@ -88,6 +88,10 @@ enum linux_mptcp_mib_field {
> MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */
> MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */
> MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk
> status */
> + MPTCP_MIB_RCVDELAYED, /* Data move from subflow is
> delayed due to msk
> + * receive buffer full
> + */
> + MPTCP_MIB_DELAYED_PROCESS, /* Delayed data moved in
> slowpath */
> __MPTCP_MIB_MAX
> };
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 201e6ac5fe631..2a025c0c4ca0c 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -681,13 +681,17 @@ static bool
> __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> tp = tcp_sk(ssk);
> do {
> int mem = own_msk ? sk_rmem_alloc_get(sk) : sk-
> >sk_backlog.len;
> + bool over_limit = mem > READ_ONCE(sk->sk_rcvbuf);
> u32 map_remaining, offset;
> u32 seq = tp->copied_seq;
> struct sk_buff *skb;
> bool fin;
>
> - if (mem > READ_ONCE(sk->sk_rcvbuf))
> + WRITE_ONCE(subflow->data_delayed, over_limit);
> + if (subflow->data_delayed) {
> + MPTCP_INC_STATS(sock_net(sk),
> MPTCP_MIB_RCVDELAYED);
> break;
> + }
>
> /* try to move as much data as available */
> map_remaining = subflow->map_data_len -
> @@ -2113,32 +2117,13 @@ static void mptcp_rcv_space_adjust(struct
> mptcp_sock *msk, int copied)
> msk->rcvq_space.time = mstamp;
> }
>
> -static struct mptcp_subflow_context *
> -__mptcp_first_ready_from(struct mptcp_sock *msk,
> - struct mptcp_subflow_context *subflow)
> -{
> - struct mptcp_subflow_context *start_subflow = subflow;
> -
> - while (!READ_ONCE(subflow->data_avail)) {
> - subflow = mptcp_next_subflow(msk, subflow);
> - if (subflow == start_subflow)
> - return NULL;
> - }
> - return subflow;
> -}
> -
> static bool __mptcp_move_skbs(struct sock *sk)
> {
> struct mptcp_subflow_context *subflow;
> struct mptcp_sock *msk = mptcp_sk(sk);
> bool ret = false;
>
> - if (list_empty(&msk->conn_list))
> - return false;
> -
> - subflow = list_first_entry(&msk->conn_list,
> - struct mptcp_subflow_context,
> node);
> - for (;;) {
> + mptcp_for_each_subflow(msk, subflow) {
> struct sock *ssk;
> bool slowpath;
>
> @@ -2149,23 +2134,22 @@ static bool __mptcp_move_skbs(struct sock
> *sk)
> if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
> break;
>
> - subflow = __mptcp_first_ready_from(msk, subflow);
> - if (!subflow)
> - break;
> + if (!subflow->data_delayed)
> + continue;
>
> ssk = mptcp_subflow_tcp_sock(subflow);
> slowpath = lock_sock_fast(ssk);
> - ret = __mptcp_move_skbs_from_subflow(msk, ssk, true)
> || ret;
> + ret |= __mptcp_move_skbs_from_subflow(msk, ssk,
> true);
> if (unlikely(ssk->sk_err))
> __mptcp_error_report(sk);
> unlock_sock_fast(ssk, slowpath);
> -
> - subflow = mptcp_next_subflow(msk, subflow);
> }
>
> __mptcp_ofo_queue(msk);
> - if (ret)
> + if (ret) {
> + MPTCP_INC_STATS(sock_net(sk),
> MPTCP_MIB_DELAYED_PROCESS);
> mptcp_check_data_fin((struct sock *)msk);
> + }
> return ret;
> }
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 7bfd4e0d21a8a..a295ce11774ea 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -560,6 +560,7 @@ struct mptcp_subflow_context {
> u8 reset_transient:1;
> u8 reset_reason:4;
> u8 stale_count;
> + bool data_delayed;
>
> u32 subflow_id;
>
© 2016 - 2026 Red Hat, Inc.