skbs will be left waiting in the subflow only in exceptional cases,
we want to avoid messing with the fast path by unintentionally
processing in __mptcp_move_skbs() packets landed into the subflows
after the last check.
Use a separate flag to mark delayed skbs and only process subflow
with such flag set. Also add new mibs to track the exceptional events.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/mptcp/mib.c | 2 ++
net/mptcp/mib.h | 4 ++++
net/mptcp/protocol.c | 40 ++++++++++++----------------------------
net/mptcp/protocol.h | 1 +
4 files changed, 19 insertions(+), 28 deletions(-)
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index cf879c188ca26..7af9d35cde884 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
+ SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED),
+ SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 309bac6fea325..f6d0eaea463e5 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -88,6 +88,10 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */
MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */
MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */
+ MPTCP_MIB_RCVDELAYED, /* Data move from subflow is delayed due to msk
+ * receive buffer full
+ */
+ MPTCP_MIB_DELAYED_PROCESS, /* Delayed data moved in slowpath */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9c3baed948d1d..f211a939daf39 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -665,13 +665,17 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
tp = tcp_sk(ssk);
do {
int mem = own_msk ? sk_rmem_alloc_get(sk) : sk->sk_backlog.len;
+ bool over_limit = mem > READ_ONCE(sk->sk_rcvbuf);
u32 map_remaining, offset;
u32 seq = tp->copied_seq;
struct sk_buff *skb;
bool fin;
- if (mem > READ_ONCE(sk->sk_rcvbuf))
+ WRITE_ONCE(subflow->data_delayed, over_limit);
+ if (subflow->data_delayed) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVDELAYED);
break;
+ }
/* try to move as much data as available */
map_remaining = subflow->map_data_len -
@@ -2081,32 +2085,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
msk->rcvq_space.time = mstamp;
}
-static struct mptcp_subflow_context *
-__mptcp_first_ready_from(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- struct mptcp_subflow_context *start_subflow = subflow;
-
- while (!READ_ONCE(subflow->data_avail)) {
- subflow = mptcp_next_subflow(msk, subflow);
- if (subflow == start_subflow)
- return NULL;
- }
- return subflow;
-}
-
static bool __mptcp_move_skbs(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool ret = false;
- if (list_empty(&msk->conn_list))
- return false;
-
- subflow = list_first_entry(&msk->conn_list,
- struct mptcp_subflow_context, node);
- for (;;) {
+ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
bool slowpath;
@@ -2117,23 +2102,22 @@ static bool __mptcp_move_skbs(struct sock *sk)
if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
break;
- subflow = __mptcp_first_ready_from(msk, subflow);
- if (!subflow)
- break;
+ if (!subflow->data_delayed)
+ continue;
ssk = mptcp_subflow_tcp_sock(subflow);
slowpath = lock_sock_fast(ssk);
- ret = __mptcp_move_skbs_from_subflow(msk, ssk, true) || ret;
+ ret |= __mptcp_move_skbs_from_subflow(msk, ssk, true);
if (unlikely(ssk->sk_err))
__mptcp_error_report(sk);
unlock_sock_fast(ssk, slowpath);
-
- subflow = mptcp_next_subflow(msk, subflow);
}
__mptcp_ofo_queue(msk);
- if (ret)
+ if (ret) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DELAYED_PROCESS);
mptcp_check_data_fin((struct sock *)msk);
+ }
return ret;
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a6e775d6412e5..2905e4b250070 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -559,6 +559,7 @@ struct mptcp_subflow_context {
u8 reset_transient:1;
u8 reset_reason:4;
u8 stale_count;
+ bool data_delayed;
u32 subflow_id;
--
2.51.0
Hi Paolo, Thanks for this patch. On Tue, 2025-09-16 at 18:27 +0200, Paolo Abeni wrote: > skbs will be left waiting in the subflow only in exceptional cases, > we want to avoid messing with the fast path by unintentionally > processing in __mptcp_move_skbs() packets landed into the subflows > after the last check. > > Use a separate flag to mark delayed skbs and only process subflow > with such flag set. Also add new mibs to track the exceptional > events. > > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > --- > net/mptcp/mib.c | 2 ++ > net/mptcp/mib.h | 4 ++++ > net/mptcp/protocol.c | 40 ++++++++++++---------------------------- > net/mptcp/protocol.h | 1 + > 4 files changed, 19 insertions(+), 28 deletions(-) > > diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c > index cf879c188ca26..7af9d35cde884 100644 > --- a/net/mptcp/mib.c > +++ b/net/mptcp/mib.c > @@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = { > SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), > SNMP_MIB_ITEM("SimultConnectFallback", > MPTCP_MIB_SIMULTCONNFALLBACK), > SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), > + SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED), > + SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS), > SNMP_MIB_SENTINEL I guess this line was accidentally pasted, which prevents this patch from being applied. Thanks, -Geliang > }; > > diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h > index 309bac6fea325..f6d0eaea463e5 100644 > --- a/net/mptcp/mib.h > +++ b/net/mptcp/mib.h > @@ -88,6 +88,10 @@ enum linux_mptcp_mib_field { > MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */ > MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */ > MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk > status */ > + MPTCP_MIB_RCVDELAYED, /* Data move from subflow is > delayed due to msk > + * receive buffer full > + */ > + MPTCP_MIB_DELAYED_PROCESS, /* Delayed data moved in > slowpath */ > __MPTCP_MIB_MAX > }; > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > index 9c3baed948d1d..f211a939daf39 100644 > --- a/net/mptcp/protocol.c > +++ b/net/mptcp/protocol.c > @@ -665,13 +665,17 @@ static bool > __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, > tp = tcp_sk(ssk); > do { > int mem = own_msk ? sk_rmem_alloc_get(sk) : sk- > >sk_backlog.len; > + bool over_limit = mem > READ_ONCE(sk->sk_rcvbuf); > u32 map_remaining, offset; > u32 seq = tp->copied_seq; > struct sk_buff *skb; > bool fin; > > - if (mem > READ_ONCE(sk->sk_rcvbuf)) > + WRITE_ONCE(subflow->data_delayed, over_limit); > + if (subflow->data_delayed) { > + MPTCP_INC_STATS(sock_net(sk), > MPTCP_MIB_RCVDELAYED); > break; > + } > > /* try to move as much data as available */ > map_remaining = subflow->map_data_len - > @@ -2081,32 +2085,13 @@ static void mptcp_rcv_space_adjust(struct > mptcp_sock *msk, int copied) > msk->rcvq_space.time = mstamp; > } > > -static struct mptcp_subflow_context * > -__mptcp_first_ready_from(struct mptcp_sock *msk, > - struct mptcp_subflow_context *subflow) > -{ > - struct mptcp_subflow_context *start_subflow = subflow; > - > - while (!READ_ONCE(subflow->data_avail)) { > - subflow = mptcp_next_subflow(msk, subflow); > - if (subflow == start_subflow) > - return NULL; > - } > - return subflow; > -} > - > static bool __mptcp_move_skbs(struct sock *sk) > { > struct mptcp_subflow_context *subflow; > struct mptcp_sock *msk = mptcp_sk(sk); > bool ret = false; > > - if (list_empty(&msk->conn_list)) > - return false; > - > - subflow = list_first_entry(&msk->conn_list, > - struct mptcp_subflow_context, > node); > - for (;;) { > + mptcp_for_each_subflow(msk, subflow) { > struct sock *ssk; > bool slowpath; > > @@ -2117,23 +2102,22 @@ static bool __mptcp_move_skbs(struct sock > *sk) > if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) > break; > > - subflow = __mptcp_first_ready_from(msk, subflow); > - if (!subflow) > - break; > + if (!subflow->data_delayed) > + continue; > > ssk = mptcp_subflow_tcp_sock(subflow); > slowpath = lock_sock_fast(ssk); > - ret = __mptcp_move_skbs_from_subflow(msk, ssk, true) > || ret; > + ret |= __mptcp_move_skbs_from_subflow(msk, ssk, > true); > if (unlikely(ssk->sk_err)) > __mptcp_error_report(sk); > unlock_sock_fast(ssk, slowpath); > - > - subflow = mptcp_next_subflow(msk, subflow); > } > > __mptcp_ofo_queue(msk); > - if (ret) > + if (ret) { > + MPTCP_INC_STATS(sock_net(sk), > MPTCP_MIB_DELAYED_PROCESS); > mptcp_check_data_fin((struct sock *)msk); > + } > return ret; > } > > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > index a6e775d6412e5..2905e4b250070 100644 > --- a/net/mptcp/protocol.h > +++ b/net/mptcp/protocol.h > @@ -559,6 +559,7 @@ struct mptcp_subflow_context { > u8 reset_transient:1; > u8 reset_reason:4; > u8 stale_count; > + bool data_delayed; > > u32 subflow_id; >
Hi Geliang, Paolo, On 17/09/2025 05:39, Geliang Tang wrote: > Hi Paolo, > > Thanks for this patch. > > On Tue, 2025-09-16 at 18:27 +0200, Paolo Abeni wrote: >> skbs will be left waiting in the subflow only in exceptional cases, >> we want to avoid messing with the fast path by unintentionally >> processing in __mptcp_move_skbs() packets landed into the subflows >> after the last check. >> >> Use a separate flag to mark delayed skbs and only process subflow >> with such flag set. Also add new mibs to track the exceptional >> events. >> >> Signed-off-by: Paolo Abeni <pabeni@redhat.com> >> --- >> net/mptcp/mib.c | 2 ++ >> net/mptcp/mib.h | 4 ++++ >> net/mptcp/protocol.c | 40 ++++++++++++---------------------------- >> net/mptcp/protocol.h | 1 + >> 4 files changed, 19 insertions(+), 28 deletions(-) >> >> diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c >> index cf879c188ca26..7af9d35cde884 100644 >> --- a/net/mptcp/mib.c >> +++ b/net/mptcp/mib.c >> @@ -85,6 +85,8 @@ static const struct snmp_mib mptcp_snmp_list[] = { >> SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), >> SNMP_MIB_ITEM("SimultConnectFallback", >> MPTCP_MIB_SIMULTCONNFALLBACK), >> SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), >> + SNMP_MIB_ITEM("RcvDelayed", MPTCP_MIB_RCVDELAYED), >> + SNMP_MIB_ITEM("DelayedProcess", MPTCP_MIB_DELAYED_PROCESS), >> SNMP_MIB_SENTINEL > > I guess this line was accidentally pasted, which prevents this patch > from being applied. FYI, this line has been removed recently by commit 35cb2da0abaf ("mptcp: snmp: do not use SNMP_MIB_SENTINEL anymore"). I manually fixed the conflict, and sent the series to the CI. I also restarted my syzkaller instances to validate your series. Cheers, Matt -- Sponsored by the NGI0 Core fund.
© 2016 - 2025 Red Hat, Inc.