Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
acquiring the subflow socket lock, estimating whether the tcp-level
cleanup could actually send an ack.

Such an estimate is a bit rough when accounting for the receive
window change, as it considers the msk's available buffer space
instead of the announced mptcp-level window.

Let's use the announced window instead, closely mirroring the plain
TCP implementation. We need lockless access to a bunch of
additional tcp fields.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
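For reference, and not part of the patch: a minimal user-space sketch
contrasting the remaining tcp-level announced window (as computed by
tcp_receive_window()) with the mptcp-level equivalent this change
derives from msk->rcv_wnd_sent and msk->ack_seq. The struct and
function names below are invented for the sketch; only the field
names mirror the kernel ones.

#include <stdint.h>
#include <stdio.h>

/* reduced stand-ins, keeping only the fields the two helpers read */
struct tcp_win_state   { uint32_t rcv_wup, rcv_wnd, rcv_nxt; };
struct mptcp_win_state { uint64_t rcv_wnd_sent, ack_seq; };

/* space still left in the tcp-level window announced to the peer,
 * mirroring tcp_receive_window()
 */
static uint32_t tcp_window_left(const struct tcp_win_state *tp)
{
	int32_t win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	return win < 0 ? 0 : (uint32_t)win;
}

/* space still left in the mptcp-level window: announced right edge
 * (rcv_wnd_sent) minus the next expected data-level sequence
 * (ack_seq), as in the __mptcp_receive_window() helper added below
 */
static int64_t mptcp_window_left(const struct mptcp_win_state *msk)
{
	return (int64_t)(msk->rcv_wnd_sent - msk->ack_seq);
}

int main(void)
{
	struct tcp_win_state tp = { .rcv_wup = 1000, .rcv_wnd = 65535, .rcv_nxt = 21000 };
	struct mptcp_win_state msk = { .rcv_wnd_sent = 86535, .ack_seq = 21000 };

	printf("tcp window left:   %u\n", tcp_window_left(&tp));
	printf("mptcp window left: %lld\n", (long long)mptcp_window_left(&msk));
	return 0;
}

With this in place, mptcp_cleanup_rbuf() asks the subflows to clean up
their receive buffer (possibly sending an ack) when the window the msk
could announce now (__mptcp_space()) is at least twice the remaining
announced window above, mirroring the "at least twice" heuristic of
tcp_cleanup_rbuf().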
include/net/mptcp.h | 2 ++
net/ipv4/tcp.c | 3 +++
net/mptcp/options.c | 1 -
net/mptcp/protocol.c | 14 +++++++++++---
net/mptcp/protocol.h | 6 +++++-
5 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 7af7fd48acc7..4b6c66b73bf4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -136,6 +136,7 @@ static inline bool rsk_drop_req(const struct request_sock *req)
return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
}
+void mptcp_receive_window(const struct sock *ssk, int *rwnd);
void mptcp_space(const struct sock *ssk, int *space, int *full_space);
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
unsigned int *size, struct mptcp_out_options *opts);
@@ -284,6 +285,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
return true;
}
+static inline void mptcp_receive_window(const struct sock *ssk, int *rwnd) { }
static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
static inline void mptcp_seq_show(struct seq_file *seq) { }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 970e9a2cca4a..d0c0100f74f3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1606,6 +1606,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
__u32 rcv_window_now = tcp_receive_window(tp);
+ if (sk_is_mptcp(sk))
+ mptcp_receive_window(sk, &rcv_window_now);
+
/* Optimize, __tcp_select_window() is not cheap. */
if (2*rcv_window_now <= tp->window_clamp) {
__u32 new_window = __tcp_select_window(sk);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 30d289044e71..563ef8fe5a85 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -604,7 +604,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
}
opts->ext_copy.use_ack = 1;
opts->suboptions = OPTION_MPTCP_DSS;
- WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5af3d591a20b..17e2dbe43639 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -535,6 +535,14 @@ static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
unlock_sock_fast(ssk, slow);
}
+void mptcp_receive_window(const struct sock *ssk, int *rwnd)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ const struct sock *sk = subflow->conn;
+
+ *rwnd = __mptcp_receive_window(mptcp_sk(sk));
+}
+
static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
{
const struct inet_connection_sock *icsk = inet_csk(ssk);
@@ -550,13 +558,13 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
- int old_space = READ_ONCE(msk->old_wspace);
+ int cur_window = __mptcp_receive_window(msk);
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- int space = __mptcp_space(sk);
+ int new_window = __mptcp_space(sk);
bool cleanup, rx_empty;
- cleanup = (space > 0) && (space >= (old_space << 1));
+ cleanup = new_window > 0 && new_window >= (cur_window << 1);
rx_empty = !__mptcp_rmem(sk);
mptcp_for_each_subflow(msk, subflow) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 1bc9f6e77ddd..1e603f28f1db 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -260,7 +260,6 @@ struct mptcp_sock {
int rmem_fwd_alloc;
struct sock *last_snd;
int snd_burst;
- int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
* recovery related fields are under data_lock
* protection
@@ -336,6 +335,11 @@ static inline int __mptcp_rmem(const struct sock *sk)
return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
}
+static inline int __mptcp_receive_window(const struct mptcp_sock *msk)
+{
+ return atomic64_read(&msk->rcv_wnd_sent) - READ_ONCE(msk->ack_seq);
+}
+
static inline int __mptcp_space(const struct sock *sk)
{
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
--
2.35.3
On Fri, 29 Jul 2022, Paolo Abeni wrote:
> Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
> acquiring the subflow socket lock, estimating whether the tcp-level
> cleanup could actually send an ack.
>
> Such an estimate is a bit rough when accounting for the receive
> window change, as it considers the msk's available buffer space
> instead of the announced mptcp-level window.
>
> Let's use the announced window instead, closely mirroring the plain
> TCP implementation. We need lockless access to a bunch of
> additional tcp fields.
Hi Paolo -
The code changes look good. For this last sentence above ("...lockless
access to a bunch of ... tcp fields"), I only see access to msk->rcv_wnd_sent
and msk->ack_seq, and nothing obvious in the later commits. Could you
clarify?
- Mat
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> include/net/mptcp.h | 2 ++
> net/ipv4/tcp.c | 3 +++
> net/mptcp/options.c | 1 -
> net/mptcp/protocol.c | 14 +++++++++++---
> net/mptcp/protocol.h | 6 +++++-
> 5 files changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index 7af7fd48acc7..4b6c66b73bf4 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -136,6 +136,7 @@ static inline bool rsk_drop_req(const struct request_sock *req)
> return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
> }
>
> +void mptcp_receive_window(const struct sock *ssk, int *rwnd);
> void mptcp_space(const struct sock *ssk, int *space, int *full_space);
> bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
> unsigned int *size, struct mptcp_out_options *opts);
> @@ -284,6 +285,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
> return true;
> }
>
> +static inline void mptcp_receive_window(const struct sock *ssk, int *rwnd) { }
> static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
> static inline void mptcp_seq_show(struct seq_file *seq) { }
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 970e9a2cca4a..d0c0100f74f3 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1606,6 +1606,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
> if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
> __u32 rcv_window_now = tcp_receive_window(tp);
>
> + if (sk_is_mptcp(sk))
> + mptcp_receive_window(sk, &rcv_window_now);
> +
> /* Optimize, __tcp_select_window() is not cheap. */
> if (2*rcv_window_now <= tp->window_clamp) {
> __u32 new_window = __tcp_select_window(sk);
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 30d289044e71..563ef8fe5a85 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -604,7 +604,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> }
> opts->ext_copy.use_ack = 1;
> opts->suboptions = OPTION_MPTCP_DSS;
> - WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
>
> /* Add kind/length/subtype/flag overhead if mapping is not populated */
> if (dss_size == 0)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 5af3d591a20b..17e2dbe43639 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -535,6 +535,14 @@ static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
> unlock_sock_fast(ssk, slow);
> }
>
> +void mptcp_receive_window(const struct sock *ssk, int *rwnd)
> +{
> + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> + const struct sock *sk = subflow->conn;
> +
> + *rwnd = __mptcp_receive_window(mptcp_sk(sk));
> +}
> +
> static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
> {
> const struct inet_connection_sock *icsk = inet_csk(ssk);
> @@ -550,13 +558,13 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
>
> static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
> {
> - int old_space = READ_ONCE(msk->old_wspace);
> + int cur_window = __mptcp_receive_window(msk);
> struct mptcp_subflow_context *subflow;
> struct sock *sk = (struct sock *)msk;
> - int space = __mptcp_space(sk);
> + int new_window = __mptcp_space(sk);
> bool cleanup, rx_empty;
>
> - cleanup = (space > 0) && (space >= (old_space << 1));
> + cleanup = new_window > 0 && new_window >= (cur_window << 1);
> rx_empty = !__mptcp_rmem(sk);
>
> mptcp_for_each_subflow(msk, subflow) {
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 1bc9f6e77ddd..1e603f28f1db 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -260,7 +260,6 @@ struct mptcp_sock {
> int rmem_fwd_alloc;
> struct sock *last_snd;
> int snd_burst;
> - int old_wspace;
> u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
> * recovery related fields are under data_lock
> * protection
> @@ -336,6 +335,11 @@ static inline int __mptcp_rmem(const struct sock *sk)
> return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
> }
>
> +static inline int __mptcp_receive_window(const struct mptcp_sock *msk)
> +{
> + return atomic64_read(&msk->rcv_wnd_sent) - READ_ONCE(msk->ack_seq);
> +}
> +
> static inline int __mptcp_space(const struct sock *sk)
> {
> return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
> --
> 2.35.3
>
>
>
--
Mat Martineau
Intel
On Fri, 2022-07-29 at 16:01 -0700, Mat Martineau wrote:
> On Fri, 29 Jul 2022, Paolo Abeni wrote:
>
> > Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
> > acquiring the subflow socket lock, estimating whether the tcp-level
> > cleanup could actually send an ack.
> >
> > Such an estimate is a bit rough when accounting for the receive
> > window change, as it considers the msk's available buffer space
> > instead of the announced mptcp-level window.
> >
> > Let's use the announced window instead, closely mirroring the plain
> > TCP implementation. We need lockless access to a bunch of
> > additional tcp fields.
>
> Hi Paolo -
>
> The code changes look good. For this last sentence above ("...lockless
> access to a bunch of ... tcp fields"), I only see access to msk->rcv_wnd_sent
> and msk->ack_seq, and nothing obvious in the later commits. Could you
> clarify?
You are right, that comment is inaccurate. It is left over from a
previous internal version. I'll drop it in the next iteration.
Thanks!
Paolo