[PATCH mptcp-next] mptcp: fix rcv buffer auto-tuning.

Paolo Abeni posted 1 patch 9 months, 2 weeks ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/multipath-tcp/mptcp_net-next tags/patchew/099977f9fe49d41ee0aa2eddefe5e2e09da3a3f1.1689836587.git.pabeni@redhat.com
Maintainers: Eric Dumazet <edumazet@google.com>, "David S. Miller" <davem@davemloft.net>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Matthieu Baerts <matthieu.baerts@tessares.net>, Mat Martineau <martineau@kernel.org>
There is a newer version of this series
include/net/tcp.h    | 20 +++++++++++++++-----
net/mptcp/protocol.c | 14 +++++++-------
net/mptcp/protocol.h |  8 +++++++-
net/mptcp/subflow.c  |  2 +-
4 files changed, 30 insertions(+), 14 deletions(-)
[PATCH mptcp-next] mptcp: fix rcv buffer auto-tuning.
Posted by Paolo Abeni 9 months, 2 weeks ago
The MPTCP code uses the assumption that the tcp_win_from_space() helper
does not use any TCP-specific field, and thus works correctly operating
on an MPTCP socket.
The commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke
such assumption, and as a consequence most MPTCP connections stall on
zero-window event due to auto-tuning changing the rcv buffer size quite
randomly.
Address the issue synching again the MPTCP auto-tuning code with the TCP
one. To achieve that, factor out the windows size logic in socket
independent helpers, and reuse them in mptcp_rcv_space_adjust(). The MPTCP
level scaling_ratio is selected as the minimum one from the all the subflows,
as a worst-case estimate.

Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
I still see quite frequent failures on mptcp_join tests 19 && 20:
020 multi flows, signal, bidi, link fail
      Info: Test file (size 12288 KB) for server
      syn                                 [ ok ]
      synack                              [ ok ]
      ack                                 [ ok ]
      add                                 [ ok ]
      echo                                [ ok ]
      stale                               [fail] got 0 stale[s] 0 recover[s],   expected stale in range [1..-1],  stale-recover delta 1

I *think* such failures are still caused by dfa2f0483360 ("tcp: get
rid of sysctl_tcp_adv_win_scale") because it should change (increase?) the
rcv buffer/adv win size, and such tests depends on the transferred files
being significantly larger then the adv win. Since the current breakage is
pretty bad and the above needs more investigation, sharing it now.

@Matttbe: fill free to adjust the tag whatever it fit you better!!!
---
 include/net/tcp.h    | 20 +++++++++++++++-----
 net/mptcp/protocol.c | 14 +++++++-------
 net/mptcp/protocol.h |  8 +++++++-
 net/mptcp/subflow.c  |  2 +-
 4 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2104a71c75ba..33f60088e0df 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1432,22 +1432,32 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
 			       __u32 *window_clamp, int wscale_ok,
 			       __u8 *rcv_wscale, __u32 init_rcv_wnd);
 
-static inline int tcp_win_from_space(const struct sock *sk, int space)
+static inline int __tcp_win_from_space(u8 scaling_ratio, int space)
 {
-	s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
+	s64 scaled_space = (s64)space * scaling_ratio;
 
 	return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
 }
 
-/* inverse of tcp_win_from_space() */
-static inline int tcp_space_from_win(const struct sock *sk, int win)
+static inline int tcp_win_from_space(const struct sock *sk, int space)
+{
+	return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space);
+}
+
+/* inverse of __tcp_win_from_space() */
+static inline int __tcp_space_from_win(u8 scaling_ratio, int win)
 {
 	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
 
-	do_div(val, tcp_sk(sk)->scaling_ratio);
+	do_div(val, scaling_ratio);
 	return val;
 }
 
+static inline int tcp_space_from_win(const struct sock *sk, int win)
+{
+	return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
+}
+
 static inline void tcp_scaling_ratio_init(struct sock *sk)
 {
 	/* Assume a conservative default of 1200 bytes of payload per 4K page.
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9f00a0288a0a..3ab67e0dfda9 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
 	if (err)
 		return err;
 
+	msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
 	WRITE_ONCE(msk->first, ssock->sk);
 	subflow = mptcp_subflow_ctx(ssock->sk);
 	list_add(&subflow->node, &msk->conn_list);
@@ -1906,6 +1907,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 {
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
+	u8 scaling_ratio = 255;
 	u32 time, advmss = 1;
 	u64 rtt_us, mstamp;
 
@@ -1936,9 +1938,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 
 		rtt_us = max(sf_rtt_us, rtt_us);
 		advmss = max(sf_advmss, advmss);
+		scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
 	}
 
 	msk->rcvq_space.rtt_us = rtt_us;
+	msk->scaling_ratio = scaling_ratio;
 	if (time < (rtt_us >> 3) || rtt_us == 0)
 		return;
 
@@ -1947,8 +1951,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 
 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		int rcvmem, rcvbuf;
 		u64 rcvwin, grow;
+		int rcvbuf;
 
 		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
 
@@ -1957,18 +1961,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 		do_div(grow, msk->rcvq_space.space);
 		rcvwin += (grow << 1);
 
-		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
-		while (tcp_win_from_space(sk, rcvmem) < advmss)
-			rcvmem += 128;
-
 		do_div(rcvwin, advmss);
-		rcvbuf = min_t(u64, rcvwin * rcvmem,
+		rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin),
 			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 
 		if (rcvbuf > sk->sk_rcvbuf) {
 			u32 window_clamp;
 
-			window_clamp = tcp_win_from_space(sk, rcvbuf);
+			window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf);
 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
 
 			/* Make subflows follow along.  If we do not do this, we
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 1b4457c44fe8..86cd4d9e8432 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -293,6 +293,7 @@ struct mptcp_sock {
 	bool		csum_enabled;
 	bool		allow_infinite_fallback;
 	u8		mpc_endpoint_id;
+	u8		scaling_ratio;
 	u8		recvmsg_inq:1,
 			cork:1,
 			nodelay:1,
@@ -349,9 +350,14 @@ static inline int __mptcp_rmem(const struct sock *sk)
 	return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
 }
 
+static inline int mptcp_win_from_space(const struct sock *sk, int space)
+{
+	return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
+}
+
 static inline int __mptcp_space(const struct sock *sk)
 {
-	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
+	return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
 }
 
 static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9ee3b7abbaf6..ad7080fabb2f 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
 	const struct sock *sk = subflow->conn;
 
 	*space = __mptcp_space(sk);
-	*full_space = tcp_full_space(sk);
+	*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
 }
 
 void __mptcp_error_report(struct sock *sk)
-- 
2.41.0
Re: [PATCH mptcp-next] mptcp: fix rcv buffer auto-tuning.
Posted by Matthieu Baerts 9 months, 2 weeks ago
Hi Paolo,

On 20/07/2023 09:03, Paolo Abeni wrote:
> The MPTCP code uses the assumption that the tcp_win_from_space() helper
> does not use any TCP-specific field, and thus works correctly operating
> on an MPTCP socket.
> The commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke
> such assumption, and as a consequence most MPTCP connections stall on
> zero-window event due to auto-tuning changing the rcv buffer size quite
> randomly.
> Address the issue synching again the MPTCP auto-tuning code with the TCP
> one. To achieve that, factor out the windows size logic in socket
> independent helpers, and reuse them in mptcp_rcv_space_adjust(). The MPTCP
> level scaling_ratio is selected as the minimum one from the all the subflows,
> as a worst-case estimate.

Thank you for having shared this patch!

> Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> I still see quite frequent failures on mptcp_join tests 19 && 20:
> 020 multi flows, signal, bidi, link fail
>       Info: Test file (size 12288 KB) for server
>       syn                                 [ ok ]
>       synack                              [ ok ]
>       ack                                 [ ok ]
>       add                                 [ ok ]
>       echo                                [ ok ]
>       stale                               [fail] got 0 stale[s] 0 recover[s],   expected stale in range [1..-1],  stale-recover delta 1
> 
> I *think* such failures are still caused by dfa2f0483360 ("tcp: get
> rid of sysctl_tcp_adv_win_scale") because it should change (increase?) the
> rcv buffer/adv win size, and such tests depends on the transferred files
> being significantly larger then the adv win. Since the current breakage is
> pretty bad and the above needs more investigation, sharing it now.

I agree, fine to delay the fix of this test.

> @Matttbe: fill free to adjust the tag whatever it fit you better!!!

Thank you for having mixed the two versions :)
Tags look good to me (with the Fixes one)

> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 9f00a0288a0a..3ab67e0dfda9 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
>  	if (err)
>  		return err;
>  
> +	msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
>  	WRITE_ONCE(msk->first, ssock->sk);
>  	subflow = mptcp_subflow_ctx(ssock->sk);
>  	list_add(&subflow->node, &msk->conn_list);
> @@ -1906,6 +1907,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
>  {
>  	struct mptcp_subflow_context *subflow;
>  	struct sock *sk = (struct sock *)msk;
> +	u8 scaling_ratio = 255;

Small detail: can I set it to U8_MAX when applying the patch?

(...)

> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 1b4457c44fe8..86cd4d9e8432 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -293,6 +293,7 @@ struct mptcp_sock {
>  	bool		csum_enabled;
>  	bool		allow_infinite_fallback;
>  	u8		mpc_endpoint_id;
> +	u8		scaling_ratio;

Small detail: should we declare that in "rcvq_space"?

  msk->rcvq_space.scaling_ratio

It's up to you, I'm suggesting that because it is linked to the rcv
space stuff and I see we put variables linked to that there.

(I can also do the modification when applying the patch)

Once applied on our side, do you prefer to send it directly to netdev?

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net
Re: [PATCH mptcp-next] mptcp: fix rcv buffer auto-tuning.
Posted by Paolo Abeni 9 months, 2 weeks ago
On Thu, 2023-07-20 at 10:52 +0200, Matthieu Baerts wrote:
> Hi Paolo,
> 
> On 20/07/2023 09:03, Paolo Abeni wrote:
> > The MPTCP code uses the assumption that the tcp_win_from_space() helper
> > does not use any TCP-specific field, and thus works correctly operating
> > on an MPTCP socket.
> > The commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke
> > such assumption, and as a consequence most MPTCP connections stall on
> > zero-window event due to auto-tuning changing the rcv buffer size quite
> > randomly.
> > Address the issue synching again the MPTCP auto-tuning code with the TCP
> > one. To achieve that, factor out the windows size logic in socket
> > independent helpers, and reuse them in mptcp_rcv_space_adjust(). The MPTCP
> > level scaling_ratio is selected as the minimum one from the all the subflows,
> > as a worst-case estimate.
> 
> Thank you for having shared this patch!
> 
> > Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> > Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > I still see quite frequent failures on mptcp_join tests 19 && 20:
> > 020 multi flows, signal, bidi, link fail
> >       Info: Test file (size 12288 KB) for server
> >       syn                                 [ ok ]
> >       synack                              [ ok ]
> >       ack                                 [ ok ]
> >       add                                 [ ok ]
> >       echo                                [ ok ]
> >       stale                               [fail] got 0 stale[s] 0 recover[s],   expected stale in range [1..-1],  stale-recover delta 1
> > 
> > I *think* such failures are still caused by dfa2f0483360 ("tcp: get
> > rid of sysctl_tcp_adv_win_scale") because it should change (increase?) the
> > rcv buffer/adv win size, and such tests depends on the transferred files
> > being significantly larger then the adv win. Since the current breakage is
> > pretty bad and the above needs more investigation, sharing it now.
> 
> I agree, fine to delay the fix of this test.
> 
> > @Matttbe: fill free to adjust the tag whatever it fit you better!!!
> 
> Thank you for having mixed the two versions :)
> Tags look good to me (with the Fixes one)
> 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 9f00a0288a0a..3ab67e0dfda9 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
> >  	if (err)
> >  		return err;
> >  
> > +	msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
> >  	WRITE_ONCE(msk->first, ssock->sk);
> >  	subflow = mptcp_subflow_ctx(ssock->sk);
> >  	list_add(&subflow->node, &msk->conn_list);
> > @@ -1906,6 +1907,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> >  {
> >  	struct mptcp_subflow_context *subflow;
> >  	struct sock *sk = (struct sock *)msk;
> > +	u8 scaling_ratio = 255;
> 
> Small detail: can I set it to U8_MAX when applying the patch?

Sure, thanks for the cleanup!

> (...)
> 
> > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > index 1b4457c44fe8..86cd4d9e8432 100644
> > --- a/net/mptcp/protocol.h
> > +++ b/net/mptcp/protocol.h
> > @@ -293,6 +293,7 @@ struct mptcp_sock {
> >  	bool		csum_enabled;
> >  	bool		allow_infinite_fallback;
> >  	u8		mpc_endpoint_id;
> > +	u8		scaling_ratio;
> 
> Small detail: should we declare that in "rcvq_space"?
> 
>   msk->rcvq_space.scaling_ratio
> 
> It's up to you, I'm suggesting that because it is linked to the rcv
> space stuff and I see we put variables linked to that there.
> 
> (I can also do the modification when applying the patch)
> 
> Once applied on our side, do you prefer to send it directly to netdev?

To me it's the same. Whatever you prefer.

Cheers,

Paolo
Re: [PATCH mptcp-next] mptcp: fix rcv buffer auto-tuning.
Posted by Paolo Abeni 9 months, 2 weeks ago
On Thu, 2023-07-20 at 09:03 +0200, Paolo Abeni wrote:
> The MPTCP code uses the assumption that the tcp_win_from_space() helper
> does not use any TCP-specific field, and thus works correctly operating
> on an MPTCP socket.
> The commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke
> such assumption, and as a consequence most MPTCP connections stall on
> zero-window event due to auto-tuning changing the rcv buffer size quite
> randomly.
> Address the issue synching again the MPTCP auto-tuning code with the TCP
> one. To achieve that, factor out the windows size logic in socket
> independent helpers, and reuse them in mptcp_rcv_space_adjust(). The MPTCP
> level scaling_ratio is selected as the minimum one from the all the subflows,
> as a worst-case estimate.
> 
> Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> I still see quite frequent failures on mptcp_join tests 19 && 20:
> 020 multi flows, signal, bidi, link fail
>       Info: Test file (size 12288 KB) for server
>       syn                                 [ ok ]
>       synack                              [ ok ]
>       ack                                 [ ok ]
>       add                                 [ ok ]
>       echo                                [ ok ]
>       stale                               [fail] got 0 stale[s] 0 recover[s],   expected stale in range [1..-1],  stale-recover delta 1
> 
> I *think* such failures are still caused by dfa2f0483360 ("tcp: get
> rid of sysctl_tcp_adv_win_scale") because it should change (increase?) the
> rcv buffer/adv win size, and such tests depends on the transferred files
> being significantly larger then the adv win. Since the current breakage is
> pretty bad and the above needs more investigation, sharing it now.
> 
> @Matttbe: fill free to adjust the tag whatever it fit you better!!!

At very least this needs a:

Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale")

/P