Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
acquiring the subflow socket lock, estimating whether the tcp-level
cleanup could actually send an ack.

Such an estimate is a bit rough when accounting for the receive
window change, as it considers the msk's available buffer space
instead of the announced mptcp-level window.

Let's use the announced window instead, closely mirroring the plain
TCP implementation. We need lockless access to a bunch of
additional tcp fields.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
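For reference, and not part of the patch: a minimal user-space sketch
contrasting the remaining tcp-level announced window (as computed by
tcp_receive_window()) with the mptcp-level equivalent this change
derives from msk->rcv_wnd_sent and msk->ack_seq. The struct and
function names below are invented for the sketch; only the field
names mirror the kernel ones.

#include <stdint.h>
#include <stdio.h>

/* reduced stand-ins, keeping only the fields the two helpers read */
struct tcp_win_state   { uint32_t rcv_wup, rcv_wnd, rcv_nxt; };
struct mptcp_win_state { uint64_t rcv_wnd_sent, ack_seq; };

/* space still left in the tcp-level window announced to the peer,
 * mirroring tcp_receive_window()
 */
static uint32_t tcp_window_left(const struct tcp_win_state *tp)
{
	int32_t win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	return win < 0 ? 0 : (uint32_t)win;
}

/* space still left in the mptcp-level window: announced right edge
 * (rcv_wnd_sent) minus the next expected data-level sequence
 * (ack_seq), as in the __mptcp_receive_window() helper added below
 */
static int64_t mptcp_window_left(const struct mptcp_win_state *msk)
{
	return (int64_t)(msk->rcv_wnd_sent - msk->ack_seq);
}

int main(void)
{
	struct tcp_win_state tp = { .rcv_wup = 1000, .rcv_wnd = 65535, .rcv_nxt = 21000 };
	struct mptcp_win_state msk = { .rcv_wnd_sent = 86535, .ack_seq = 21000 };

	printf("tcp window left:   %u\n", tcp_window_left(&tp));
	printf("mptcp window left: %lld\n", (long long)mptcp_window_left(&msk));
	return 0;
}

With this in place, mptcp_cleanup_rbuf() asks the subflows to clean up
their receive buffer (possibly sending an ack) when the window the msk
could announce now (__mptcp_space()) is at least twice the remaining
announced window above, mirroring the "at least twice" heuristic of
tcp_cleanup_rbuf().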
include/net/mptcp.h | 2 ++
net/ipv4/tcp.c | 3 +++
net/mptcp/options.c | 1 -
net/mptcp/protocol.c | 14 +++++++++++---
net/mptcp/protocol.h | 6 +++++-
5 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 7af7fd48acc7..4b6c66b73bf4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -136,6 +136,7 @@ static inline bool rsk_drop_req(const struct request_sock *req)
return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
}
+void mptcp_receive_window(const struct sock *ssk, int *rwnd);
void mptcp_space(const struct sock *ssk, int *space, int *full_space);
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
unsigned int *size, struct mptcp_out_options *opts);
@@ -284,6 +285,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
return true;
}
+static inline void mptcp_receive_window(const struct sock *ssk, int *rwnd) { }
static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
static inline void mptcp_seq_show(struct seq_file *seq) { }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 970e9a2cca4a..d0c0100f74f3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1606,6 +1606,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
__u32 rcv_window_now = tcp_receive_window(tp);
+ if (sk_is_mptcp(sk))
+ mptcp_receive_window(sk, &rcv_window_now);
+
/* Optimize, __tcp_select_window() is not cheap. */
if (2*rcv_window_now <= tp->window_clamp) {
__u32 new_window = __tcp_select_window(sk);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 30d289044e71..563ef8fe5a85 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -604,7 +604,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
}
opts->ext_copy.use_ack = 1;
opts->suboptions = OPTION_MPTCP_DSS;
- WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5af3d591a20b..17e2dbe43639 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -535,6 +535,14 @@ static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
unlock_sock_fast(ssk, slow);
}
+void mptcp_receive_window(const struct sock *ssk, int *rwnd)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ const struct sock *sk = subflow->conn;
+
+ *rwnd = __mptcp_receive_window(mptcp_sk(sk));
+}
+
static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
{
const struct inet_connection_sock *icsk = inet_csk(ssk);
@@ -550,13 +558,13 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
- int old_space = READ_ONCE(msk->old_wspace);
+ int cur_window = __mptcp_receive_window(msk);
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- int space = __mptcp_space(sk);
+ int new_window = __mptcp_space(sk);
bool cleanup, rx_empty;
- cleanup = (space > 0) && (space >= (old_space << 1));
+ cleanup = new_window > 0 && new_window >= (cur_window << 1);
rx_empty = !__mptcp_rmem(sk);
mptcp_for_each_subflow(msk, subflow) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 1bc9f6e77ddd..1e603f28f1db 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -260,7 +260,6 @@ struct mptcp_sock {
int rmem_fwd_alloc;
struct sock *last_snd;
int snd_burst;
- int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
* recovery related fields are under data_lock
* protection
@@ -336,6 +335,11 @@ static inline int __mptcp_rmem(const struct sock *sk)
return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
}
+static inline int __mptcp_receive_window(const struct mptcp_sock *msk)
+{
+ return atomic64_read(&msk->rcv_wnd_sent) - READ_ONCE(msk->ack_seq);
+}
+
static inline int __mptcp_space(const struct sock *sk)
{
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
--
2.35.3
On Fri, 29 Jul 2022, Paolo Abeni wrote:
> Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
> acquiring the subflow socket lock, estimating whether the tcp-level
> cleanup could actually send an ack.
>
> Such an estimate is a bit rough when accounting for the receive
> window change, as it considers the msk's available buffer space
> instead of the announced mptcp-level window.
>
> Let's use the announced window instead, closely mirroring the plain
> TCP implementation. We need lockless access to a bunch of
> additional tcp fields.
Hi Paolo -
The code changes look good. For this last sentence above ("...lockless
access to a bunch of ... tcp fields"), I only see access to msk->rcv_wnd_sent
and msk->ack_seq, and nothing obvious in the later commits. Could you
clarify?
- Mat
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> include/net/mptcp.h | 2 ++
> net/ipv4/tcp.c | 3 +++
> net/mptcp/options.c | 1 -
> net/mptcp/protocol.c | 14 +++++++++++---
> net/mptcp/protocol.h | 6 +++++-
> 5 files changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index 7af7fd48acc7..4b6c66b73bf4 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -136,6 +136,7 @@ static inline bool rsk_drop_req(const struct request_sock *req)
> return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
> }
>
> +void mptcp_receive_window(const struct sock *ssk, int *rwnd);
> void mptcp_space(const struct sock *ssk, int *space, int *full_space);
> bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
> unsigned int *size, struct mptcp_out_options *opts);
> @@ -284,6 +285,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
> return true;
> }
>
> +static inline void mptcp_receive_window(const struct sock *ssk, int *rwnd) { }
> static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
> static inline void mptcp_seq_show(struct seq_file *seq) { }
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 970e9a2cca4a..d0c0100f74f3 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1606,6 +1606,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
> if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
> __u32 rcv_window_now = tcp_receive_window(tp);
>
> + if (sk_is_mptcp(sk))
> + mptcp_receive_window(sk, &rcv_window_now);
> +
> /* Optimize, __tcp_select_window() is not cheap. */
> if (2*rcv_window_now <= tp->window_clamp) {
> __u32 new_window = __tcp_select_window(sk);
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 30d289044e71..563ef8fe5a85 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -604,7 +604,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> }
> opts->ext_copy.use_ack = 1;
> opts->suboptions = OPTION_MPTCP_DSS;
> - WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
>
> /* Add kind/length/subtype/flag overhead if mapping is not populated */
> if (dss_size == 0)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 5af3d591a20b..17e2dbe43639 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -535,6 +535,14 @@ static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
> unlock_sock_fast(ssk, slow);
> }
>
> +void mptcp_receive_window(const struct sock *ssk, int *rwnd)
> +{
> + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> + const struct sock *sk = subflow->conn;
> +
> + *rwnd = __mptcp_receive_window(mptcp_sk(sk));
> +}
> +
> static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
> {
> const struct inet_connection_sock *icsk = inet_csk(ssk);
> @@ -550,13 +558,13 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
>
> static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
> {
> - int old_space = READ_ONCE(msk->old_wspace);
> + int cur_window = __mptcp_receive_window(msk);
> struct mptcp_subflow_context *subflow;
> struct sock *sk = (struct sock *)msk;
> - int space = __mptcp_space(sk);
> + int new_window = __mptcp_space(sk);
> bool cleanup, rx_empty;
>
> - cleanup = (space > 0) && (space >= (old_space << 1));
> + cleanup = new_window > 0 && new_window >= (cur_window << 1);
> rx_empty = !__mptcp_rmem(sk);
>
> mptcp_for_each_subflow(msk, subflow) {
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 1bc9f6e77ddd..1e603f28f1db 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -260,7 +260,6 @@ struct mptcp_sock {
> int rmem_fwd_alloc;
> struct sock *last_snd;
> int snd_burst;
> - int old_wspace;
> u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
> * recovery related fields are under data_lock
> * protection
> @@ -336,6 +335,11 @@ static inline int __mptcp_rmem(const struct sock *sk)
> return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
> }
>
> +static inline int __mptcp_receive_window(const struct mptcp_sock *msk)
> +{
> + return atomic64_read(&msk->rcv_wnd_sent) - READ_ONCE(msk->ack_seq);
> +}
> +
> static inline int __mptcp_space(const struct sock *sk)
> {
> return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
> --
> 2.35.3
>
>
>
--
Mat Martineau
Intel
On Fri, 2022-07-29 at 16:01 -0700, Mat Martineau wrote:
> On Fri, 29 Jul 2022, Paolo Abeni wrote:
>
> > Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
> > acquiring the subflow socket lock, estimating whether the tcp-level
> > cleanup could actually send an ack.
> >
> > Such an estimate is a bit rough when accounting for the receive
> > window change, as it considers the msk's available buffer space
> > instead of the announced mptcp-level window.
> >
> > Let's use the announced window instead, closely mirroring the plain
> > TCP implementation. We need lockless access to a bunch of
> > additional tcp fields.
>
> Hi Paolo -
>
> The code changes look good. For this last sentence above ("...lockless
> access to a bunch of ... tcp fields"), I only see access to msk->rcv_wnd_sent
> and msk->ack_seq, and nothing obvious in the later commits. Could you
> clarify?
You are right, that comment is inaccurate. It is left over from a
previous internal version. I'll drop it in the next iteration.
Thanks!
Paolo