Similar to tcp, provide a new tracepoint to better understand
mptcp_rcv_space_adjust() behavior, which presents many artifacts.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
net/mptcp/protocol.c | 3 ++
2 files changed, 77 insertions(+)
diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 085b749cdd97..71fd6d33f48b 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
__entry->skb)
);
+#include <trace/events/net_probe_common.h>
+
+TRACE_EVENT(mptcp_rcvbuf_grow,
+
+ TP_PROTO(struct sock *sk, int time),
+
+ TP_ARGS(sk, time),
+
+ TP_STRUCT__entry(
+ __field(int, time)
+ __field(__u32, rtt_us)
+ __field(__u32, copied)
+ __field(__u32, inq)
+ __field(__u32, space)
+ __field(__u32, ooo_space)
+ __field(__u32, rcvbuf)
+ __field(__u32, rcv_wnd)
+ __field(__u8, scaling_ratio)
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __field(__u16, family)
+ __array(__u8, saddr, 4)
+ __array(__u8, daddr, 4)
+ __array(__u8, saddr_v6, 16)
+ __array(__u8, daddr_v6, 16)
+ __field(const void *, skaddr)
+ ),
+
+ TP_fast_assign(
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 *p32;
+
+ __entry->time = time;
+ __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
+ __entry->copied = msk->rcvq_space.copied;
+ __entry->inq = mptcp_inq_hint(sk);
+ __entry->space = msk->rcvq_space.space;
+ __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
+ MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
+ msk->ack_seq;
+
+ __entry->rcvbuf = sk->sk_rcvbuf;
+ __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
+ __entry->scaling_ratio = msk->scaling_ratio;
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+ __entry->family = sk->sk_family;
+
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = inet->inet_saddr;
+
+ p32 = (__be32 *) __entry->daddr;
+ *p32 = inet->inet_daddr;
+
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+ __entry->skaddr = sk;
+ ),
+
+ TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+ "rcv_wnd=%u "
+ "sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+ "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
+ __entry->time, __entry->rtt_us, __entry->copied,
+ __entry->inq, __entry->space, __entry->ooo_space,
+ __entry->scaling_ratio, __entry->rcvbuf,
+ __entry->rcv_wnd,
+ __entry->sport, __entry->dport,
+ __entry->saddr, __entry->daddr,
+ __entry->saddr_v6, __entry->daddr_v6,
+ __entry->skaddr)
+);
#endif /* _TRACE_MPTCP_H */
/* This part must be outside protection */
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 01114456dec6..443406bc4a54 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,6 +28,8 @@
#include "protocol.h"
#include "mib.h"
+static unsigned int mptcp_inq_hint(const struct sock *sk);
+
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -2101,6 +2103,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
+ trace_mptcp_rcvbuf_grow(sk, time);
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
--
2.51.0
On Tue, 4 Nov 2025, Paolo Abeni wrote:
> Similar to tcp, provide a new tracepoint to better understand
> mptcp_rcv_space_adjust() behavior, which presents many artifacts.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
> net/mptcp/protocol.c | 3 ++
> 2 files changed, 77 insertions(+)
>
> diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
> index 085b749cdd97..71fd6d33f48b 100644
> --- a/include/trace/events/mptcp.h
> +++ b/include/trace/events/mptcp.h
> @@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
> __entry->skb)
> );
>
> +#include <trace/events/net_probe_common.h>
> +
> +TRACE_EVENT(mptcp_rcvbuf_grow,
> +
> + TP_PROTO(struct sock *sk, int time),
> +
> + TP_ARGS(sk, time),
> +
> + TP_STRUCT__entry(
> + __field(int, time)
> + __field(__u32, rtt_us)
> + __field(__u32, copied)
> + __field(__u32, inq)
> + __field(__u32, space)
> + __field(__u32, ooo_space)
> + __field(__u32, rcvbuf)
> + __field(__u32, rcv_wnd)
> + __field(__u8, scaling_ratio)
> + __field(__u16, sport)
> + __field(__u16, dport)
> + __field(__u16, family)
> + __array(__u8, saddr, 4)
> + __array(__u8, daddr, 4)
> + __array(__u8, saddr_v6, 16)
> + __array(__u8, daddr_v6, 16)
> + __field(const void *, skaddr)
> + ),
> +
> + TP_fast_assign(
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct inet_sock *inet = inet_sk(sk);
> + __be32 *p32;
> +
> + __entry->time = time;
> + __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
> + __entry->copied = msk->rcvq_space.copied;
> + __entry->inq = mptcp_inq_hint(sk);
> + __entry->space = msk->rcvq_space.space;
> + __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
> + MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
> + msk->ack_seq;
> +
> + __entry->rcvbuf = sk->sk_rcvbuf;
> + __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
> + __entry->scaling_ratio = msk->scaling_ratio;
> + __entry->sport = ntohs(inet->inet_sport);
> + __entry->dport = ntohs(inet->inet_dport);
> + __entry->family = sk->sk_family;
Hi Paolo -
__entry->family isn't referenced in the TP_printk() below.
Other than that, the series is looking good. I still need to work on
understanding the last 2 patches, even with the commit messages & comments
the behavioral changes/consequences aren't clear to me yet.
- Mat
> +
> + p32 = (__be32 *) __entry->saddr;
> + *p32 = inet->inet_saddr;
> +
> + p32 = (__be32 *) __entry->daddr;
> + *p32 = inet->inet_daddr;
> +
> + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +
> + __entry->skaddr = sk;
> + ),
> +
> + TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
> + "rcv_wnd=%u "
> + "sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
> + "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
> + __entry->time, __entry->rtt_us, __entry->copied,
> + __entry->inq, __entry->space, __entry->ooo_space,
> + __entry->scaling_ratio, __entry->rcvbuf,
> + __entry->rcv_wnd,
> + __entry->sport, __entry->dport,
> + __entry->saddr, __entry->daddr,
> + __entry->saddr_v6, __entry->daddr_v6,
> + __entry->skaddr)
> +);
> #endif /* _TRACE_MPTCP_H */
>
> /* This part must be outside protection */
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 01114456dec6..443406bc4a54 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -28,6 +28,8 @@
> #include "protocol.h"
> #include "mib.h"
>
> +static unsigned int mptcp_inq_hint(const struct sock *sk);
> +
> #define CREATE_TRACE_POINTS
> #include <trace/events/mptcp.h>
>
> @@ -2101,6 +2103,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> if (msk->rcvq_space.copied <= msk->rcvq_space.space)
> goto new_measure;
>
> + trace_mptcp_rcvbuf_grow(sk, time);
> if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
> /* Make subflows follow along. If we do not do this, we
> * get drops at subflow level if skbs can't be moved to
> --
> 2.51.0
>
>
>
On 11/5/25 1:24 AM, Mat Martineau wrote: > On Tue, 4 Nov 2025, Paolo Abeni wrote: > >> Similar to tcp, provide a new tracepoint to better understand >> mptcp_rcv_space_adjust() behavior, which presents many artifacts. >> >> Signed-off-by: Paolo Abeni <pabeni@redhat.com> >> --- >> include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++ >> net/mptcp/protocol.c | 3 ++ >> 2 files changed, 77 insertions(+) >> >> diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h >> index 085b749cdd97..71fd6d33f48b 100644 >> --- a/include/trace/events/mptcp.h >> +++ b/include/trace/events/mptcp.h >> @@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail, >> __entry->skb) >> ); >> >> +#include <trace/events/net_probe_common.h> >> + >> +TRACE_EVENT(mptcp_rcvbuf_grow, >> + >> + TP_PROTO(struct sock *sk, int time), >> + >> + TP_ARGS(sk, time), >> + >> + TP_STRUCT__entry( >> + __field(int, time) >> + __field(__u32, rtt_us) >> + __field(__u32, copied) >> + __field(__u32, inq) >> + __field(__u32, space) >> + __field(__u32, ooo_space) >> + __field(__u32, rcvbuf) >> + __field(__u32, rcv_wnd) >> + __field(__u8, scaling_ratio) >> + __field(__u16, sport) >> + __field(__u16, dport) >> + __field(__u16, family) >> + __array(__u8, saddr, 4) >> + __array(__u8, daddr, 4) >> + __array(__u8, saddr_v6, 16) >> + __array(__u8, daddr_v6, 16) >> + __field(const void *, skaddr) >> + ), >> + >> + TP_fast_assign( >> + struct mptcp_sock *msk = mptcp_sk(sk); >> + struct inet_sock *inet = inet_sk(sk); >> + __be32 *p32; >> + >> + __entry->time = time; >> + __entry->rtt_us = msk->rcvq_space.rtt_us >> 3; >> + __entry->copied = msk->rcvq_space.copied; >> + __entry->inq = mptcp_inq_hint(sk); >> + __entry->space = msk->rcvq_space.space; >> + __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 
0 : >> + MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - >> + msk->ack_seq; >> + >> + __entry->rcvbuf = sk->sk_rcvbuf; >> + __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq; >> + __entry->scaling_ratio = msk->scaling_ratio; >> + __entry->sport = ntohs(inet->inet_sport); >> + __entry->dport = ntohs(inet->inet_dport); >> + __entry->family = sk->sk_family; > > Hi Paolo - > > __entry->family isn't referenced in the TP_printk() below. > > Other than that, the series is looking good. I still need to work on > understanding the last 2 patches, even with the commit messages & comments > the behavioral changes/consequences aren't clear to me yet. patch 7/7 is just 'inspired' by similar tcp change (commit ea33537d82921e71f852ea2ed985acc562125efe) the goal is making DRS converging faster to the right size. It should not have any downside. patch 6/7 'fixes rtt estimation'; with the current algo there are some major issues: - max() is simply wrong, as we need to react reasonably fast. If a link has extreme high latency and another is very fast, DRS will converge very slowly. This is addressed using min() - the subflow rtt is biased by mptcp; i.e. on the rx side is the time measured between an ack and the next data. If the connection is CPU bounded, and the scheduler picks a different subflow in response to an incoming ack, the rtt estimated by acked subflow on the rx side could be much more higher then the actual delay. This is addressed explicitly filtering out "too high" rtt value (i.e. double than previous sample) - the most subtle but very effective problem. When the link latency is very low (i.e. I have 2 hosts b2b connected) the first rtt sample will be much higher than the next ones (in my scenario 40K us vs 80 us, note the missing 'K'), because the first sample includes all process scheduler and the socket creation overhead. 
With the current algo I see:

mptcp_rcv_space_adjust()  // time = ~40K us
mptcp_rcv_space_init()    // msk rtt = ~40K us, ssk rtt = ~40K
mptcp_rcvbuf_grow()
mptcp_rcv_space_adjust()  // for 40K us keeps increasing `copied`
...
mptcp_rcv_space_adjust()  // time = ~40K us, copied is very high
                          // ssk rtt ~= 80us, msk rtt = ~80 us
mptcp_rcvbuf_grow()       // set sk_rcvbuf to tcp_rmem[2], because
                          // `copied` accumulated data for much more than
                          // one rtt

The root cause of the problem is that the msk rtt is updated with a period equal to the previous rtt sample, which in turn is too high at connection start. The solution is keeping the msk rtt up to date with the subflow ones, with an update every rcv wnd. In theory it could be updated at every incoming packet, but that would be useless because the subflows update the rtt every rcv wnd, and touching the msk field too often could cause performance regressions.

Side note: the msk rtt needs a periodic 'reset'/'refresh' because otherwise it could not deal with events like 'subflow with lowest rtt is closed'. Such a reset happens every <rcv wnd> * <number of subflows> incoming bytes.

Please let me know if the above helps.

Cheers,

Paolo
© 2016 - 2025 Red Hat, Inc.