Similar to tcp, provide a new tracepoint to better understand
mptcp_rcv_space_adjust() behavior, which presents many artifacts.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
net/mptcp/protocol.c | 3 ++
2 files changed, 77 insertions(+)
diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 085b749cdd97..71fd6d33f48b 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
__entry->skb)
);
+#include <trace/events/net_probe_common.h>
+
+TRACE_EVENT(mptcp_rcvbuf_grow,
+
+ TP_PROTO(struct sock *sk, int time),
+
+ TP_ARGS(sk, time),
+
+ TP_STRUCT__entry(
+ __field(int, time)
+ __field(__u32, rtt_us)
+ __field(__u32, copied)
+ __field(__u32, inq)
+ __field(__u32, space)
+ __field(__u32, ooo_space)
+ __field(__u32, rcvbuf)
+ __field(__u32, rcv_wnd)
+ __field(__u8, scaling_ratio)
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __field(__u16, family)
+ __array(__u8, saddr, 4)
+ __array(__u8, daddr, 4)
+ __array(__u8, saddr_v6, 16)
+ __array(__u8, daddr_v6, 16)
+ __field(const void *, skaddr)
+ ),
+
+ TP_fast_assign(
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 *p32;
+
+ __entry->time = time;
+ __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
+ __entry->copied = msk->rcvq_space.copied;
+ __entry->inq = mptcp_inq_hint(sk);
+ __entry->space = msk->rcvq_space.space;
+ __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
+ MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
+ msk->ack_seq;
+
+ __entry->rcvbuf = sk->sk_rcvbuf;
+ __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
+ __entry->scaling_ratio = msk->scaling_ratio;
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+ __entry->family = sk->sk_family;
+
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = inet->inet_saddr;
+
+ p32 = (__be32 *) __entry->daddr;
+ *p32 = inet->inet_daddr;
+
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+ __entry->skaddr = sk;
+ ),
+
+ TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+ "rcv_wnd=%u "
+ "sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+ "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
+ __entry->time, __entry->rtt_us, __entry->copied,
+ __entry->inq, __entry->space, __entry->ooo_space,
+ __entry->scaling_ratio, __entry->rcvbuf,
+ __entry->rcv_wnd,
+ __entry->sport, __entry->dport,
+ __entry->saddr, __entry->daddr,
+ __entry->saddr_v6, __entry->daddr_v6,
+ __entry->skaddr)
+);
#endif /* _TRACE_MPTCP_H */
/* This part must be outside protection */
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 01114456dec6..443406bc4a54 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,6 +28,8 @@
#include "protocol.h"
#include "mib.h"
+static unsigned int mptcp_inq_hint(const struct sock *sk);
+
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -2101,6 +2103,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
+ trace_mptcp_rcvbuf_grow(sk, time);
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
--
2.51.0
On Tue, 4 Nov 2025, Paolo Abeni wrote:
> Similar to tcp, provide a new tracepoint to better understand
> mptcp_rcv_space_adjust() behavior, which presents many artifacts.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++
> net/mptcp/protocol.c | 3 ++
> 2 files changed, 77 insertions(+)
>
> diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
> index 085b749cdd97..71fd6d33f48b 100644
> --- a/include/trace/events/mptcp.h
> +++ b/include/trace/events/mptcp.h
> @@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail,
> __entry->skb)
> );
>
> +#include <trace/events/net_probe_common.h>
> +
> +TRACE_EVENT(mptcp_rcvbuf_grow,
> +
> + TP_PROTO(struct sock *sk, int time),
> +
> + TP_ARGS(sk, time),
> +
> + TP_STRUCT__entry(
> + __field(int, time)
> + __field(__u32, rtt_us)
> + __field(__u32, copied)
> + __field(__u32, inq)
> + __field(__u32, space)
> + __field(__u32, ooo_space)
> + __field(__u32, rcvbuf)
> + __field(__u32, rcv_wnd)
> + __field(__u8, scaling_ratio)
> + __field(__u16, sport)
> + __field(__u16, dport)
> + __field(__u16, family)
> + __array(__u8, saddr, 4)
> + __array(__u8, daddr, 4)
> + __array(__u8, saddr_v6, 16)
> + __array(__u8, daddr_v6, 16)
> + __field(const void *, skaddr)
> + ),
> +
> + TP_fast_assign(
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + struct inet_sock *inet = inet_sk(sk);
> + __be32 *p32;
> +
> + __entry->time = time;
> + __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
> + __entry->copied = msk->rcvq_space.copied;
> + __entry->inq = mptcp_inq_hint(sk);
> + __entry->space = msk->rcvq_space.space;
> + __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 0 :
> + MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
> + msk->ack_seq;
> +
> + __entry->rcvbuf = sk->sk_rcvbuf;
> + __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
> + __entry->scaling_ratio = msk->scaling_ratio;
> + __entry->sport = ntohs(inet->inet_sport);
> + __entry->dport = ntohs(inet->inet_dport);
> + __entry->family = sk->sk_family;
Hi Paolo -
__entry->family isn't referenced in the TP_printk() below.
Other than that, the series is looking good. I still need to work on
understanding the last 2 patches, even with the commit messages & comments
the behavioral changes/consequences aren't clear to me yet.
- Mat
> +
> + p32 = (__be32 *) __entry->saddr;
> + *p32 = inet->inet_saddr;
> +
> + p32 = (__be32 *) __entry->daddr;
> + *p32 = inet->inet_daddr;
> +
> + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +
> + __entry->skaddr = sk;
> + ),
> +
> + TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
> + "rcv_wnd=%u "
> + "sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
> + "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
> + __entry->time, __entry->rtt_us, __entry->copied,
> + __entry->inq, __entry->space, __entry->ooo_space,
> + __entry->scaling_ratio, __entry->rcvbuf,
> + __entry->rcv_wnd,
> + __entry->sport, __entry->dport,
> + __entry->saddr, __entry->daddr,
> + __entry->saddr_v6, __entry->daddr_v6,
> + __entry->skaddr)
> +);
> #endif /* _TRACE_MPTCP_H */
>
> /* This part must be outside protection */
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 01114456dec6..443406bc4a54 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -28,6 +28,8 @@
> #include "protocol.h"
> #include "mib.h"
>
> +static unsigned int mptcp_inq_hint(const struct sock *sk);
> +
> #define CREATE_TRACE_POINTS
> #include <trace/events/mptcp.h>
>
> @@ -2101,6 +2103,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> if (msk->rcvq_space.copied <= msk->rcvq_space.space)
> goto new_measure;
>
> + trace_mptcp_rcvbuf_grow(sk, time);
> if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
> /* Make subflows follow along. If we do not do this, we
> * get drops at subflow level if skbs can't be moved to
> --
> 2.51.0
>
>
>
On 11/5/25 1:24 AM, Mat Martineau wrote: > On Tue, 4 Nov 2025, Paolo Abeni wrote: > >> Similar to tcp, provide a new tracepoint to better understand >> mptcp_rcv_space_adjust() behavior, which presents many artifacts. >> >> Signed-off-by: Paolo Abeni <pabeni@redhat.com> >> --- >> include/trace/events/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++ >> net/mptcp/protocol.c | 3 ++ >> 2 files changed, 77 insertions(+) >> >> diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h >> index 085b749cdd97..71fd6d33f48b 100644 >> --- a/include/trace/events/mptcp.h >> +++ b/include/trace/events/mptcp.h >> @@ -178,6 +178,80 @@ TRACE_EVENT(subflow_check_data_avail, >> __entry->skb) >> ); >> >> +#include <trace/events/net_probe_common.h> >> + >> +TRACE_EVENT(mptcp_rcvbuf_grow, >> + >> + TP_PROTO(struct sock *sk, int time), >> + >> + TP_ARGS(sk, time), >> + >> + TP_STRUCT__entry( >> + __field(int, time) >> + __field(__u32, rtt_us) >> + __field(__u32, copied) >> + __field(__u32, inq) >> + __field(__u32, space) >> + __field(__u32, ooo_space) >> + __field(__u32, rcvbuf) >> + __field(__u32, rcv_wnd) >> + __field(__u8, scaling_ratio) >> + __field(__u16, sport) >> + __field(__u16, dport) >> + __field(__u16, family) >> + __array(__u8, saddr, 4) >> + __array(__u8, daddr, 4) >> + __array(__u8, saddr_v6, 16) >> + __array(__u8, daddr_v6, 16) >> + __field(const void *, skaddr) >> + ), >> + >> + TP_fast_assign( >> + struct mptcp_sock *msk = mptcp_sk(sk); >> + struct inet_sock *inet = inet_sk(sk); >> + __be32 *p32; >> + >> + __entry->time = time; >> + __entry->rtt_us = msk->rcvq_space.rtt_us >> 3; >> + __entry->copied = msk->rcvq_space.copied; >> + __entry->inq = mptcp_inq_hint(sk); >> + __entry->space = msk->rcvq_space.space; >> + __entry->ooo_space = RB_EMPTY_ROOT(&msk->out_of_order_queue) ? 
0 : >> + MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - >> + msk->ack_seq; >> + >> + __entry->rcvbuf = sk->sk_rcvbuf; >> + __entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq; >> + __entry->scaling_ratio = msk->scaling_ratio; >> + __entry->sport = ntohs(inet->inet_sport); >> + __entry->dport = ntohs(inet->inet_dport); >> + __entry->family = sk->sk_family; > > Hi Paolo - > > __entry->family isn't referenced in the TP_printk() below. > > Other than that, the series is looking good. I still need to work on > understanding the last 2 patches, even with the commit messages & comments > the behavioral changes/consequences aren't clear to me yet. patch 7/7 is just 'inspired' by similar tcp change (commit ea33537d82921e71f852ea2ed985acc562125efe) the goal is making DRS converging faster to the right size. It should not have any downside. patch 6/7 'fixes rtt estimation'; with the current algo there are some major issues: - max() is simply wrong, as we need to react reasonably fast. If a link has extreme high latency and another is very fast, DRS will converge very slowly. This is addressed using min() - the subflow rtt is biased by mptcp; i.e. on the rx side is the time measured between an ack and the next data. If the connection is CPU bounded, and the scheduler picks a different subflow in response to an incoming ack, the rtt estimated by acked subflow on the rx side could be much more higher then the actual delay. This is addressed explicitly filtering out "too high" rtt value (i.e. double than previous sample) - the most subtle but very effective problem. When the link latency is very low (i.e. I have 2 hosts b2b connected) the first rtt sample will be much higher than the next ones (in my scenario 40K us vs 80 us, note the missing 'K'), because the first sample includes all process scheduler and the socket creation overhead. 
With the current algo I see:

mptcp_rcv_space_adjust()  // time = ~40K us
mptcp_rcv_space_init()    // msk rtt = ~40K us, ssk rtt = ~40K
mptcp_rcvbuf_grow()
mptcp_rcv_space_adjust()  // for 40K us keeps increasing `copied`
...
mptcp_rcv_space_adjust()  // time = ~40K us, copied is very high
                          // ssk rtt ~= 80us, msk rtt = ~80 us
mptcp_rcvbuf_grow()       // set sk_rcvbuf to tcp_rmem[2], because
                          // `copied` accumulated data for much more than
                          // one rtt

The root cause of the problem is that the msk rtt is updated with a period equal to the previous rtt sample, which in turn is too high at connection start. The solution is keeping the msk rtt up to date with the subflow ones, with an update every rcv wnd. In theory it could be updated at every incoming packet, but that would be useless because the subflows update the rtt every rcv wnd, and touching the msk field too often could cause performance regressions.

Side note: the msk rtt needs a periodic 'reset'/'refresh' because otherwise it could not deal with events like 'subflow with lowest rtt is closed'. Such a reset happens every <rcv wnd> * <number of subflows> incoming bytes.

Please let me know if the above helps.

Cheers,

Paolo
© 2016 - 2025 Red Hat, Inc.