On high speed links, the MPTCP-level receive buffer auto-tuning
happens at a much higher frequency than the TCP-level one. That in
turn can cause excessive/unneeded receive buffer increases.
On such links, the initial rtt_us value is considerably higher
than the actual delay, but the current mptcp_rcv_space_adjust() logic
prevents msk->rcvq_space.rtt_us from decreasing.
Address the issue with a more accurate RTT estimation strategy: the
MPTCP-level RTT is set to the minimum RTT among all the subflows
feeding data into the MPTCP receive buffer, refreshed over a
receive-window-based interval.
Use some care to avoid updating msk and ssk level fields too often.
Fixes: a6b118febbab ("mptcp: add receive buffer auto-tuning")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- do not use explicit reset flags - make a rcv-win-based decision instead
- discard 0 rtt_us samples from subflows
- discard samples taken with a non-empty rx queue
- discard "too high" samples; see the code comments for the rationale
---
include/trace/events/mptcp.h | 2 +-
net/mptcp/protocol.c | 74 ++++++++++++++++++++++--------------
net/mptcp/protocol.h | 7 +++-
3 files changed, 53 insertions(+), 30 deletions(-)
diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 71fd6d33f48b..999133000cb8 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -212,7 +212,7 @@ TRACE_EVENT(mptcp_rcvbuf_grow,
__be32 *p32;
__entry->time = time;
- __entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
+ __entry->rtt_us = msk->rcv_rtt_est.rtt_us >> 3;
__entry->copied = msk->rcvq_space.copied;
__entry->inq = mptcp_inq_hint(sk);
__entry->space = msk->rcvq_space.space;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c85ad7ef29b0..414cca078541 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -868,6 +868,42 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return moved;
}
+static void mptcp_rcv_rtt_update(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ const struct tcp_sock *tp = tcp_sk(subflow->tcp_sock);
+ u32 rtt_us = tp->rcv_rtt_est.rtt_us;
+ u8 sr = tp->scaling_ratio;
+
+ /* MPTCP can react to incoming acks pushing data on different subflows,
+ * causing apparent high RTT: ignore large samples; also do the update
+ * only on RTT changes
+ */
+ if (tp->rcv_rtt_est.seq == subflow->prev_rtt_seq ||
+ (subflow->prev_rtt_us && (rtt_us >> 1) > subflow->prev_rtt_us))
+ return;
+
+ /* Similar to plain TCP, only consider samples with empty RX queue. */
+ if (!rtt_us || mptcp_data_avail(msk))
+ return;
+
+ /* Refresh the RTT after a full win per subflow */
+ subflow->prev_rtt_us = rtt_us;
+ subflow->prev_rtt_seq = tp->rcv_rtt_est.seq;
+ if (after(subflow->map_seq, msk->rcv_rtt_est.seq)) {
+ msk->rcv_rtt_est.seq = subflow->map_seq +
+ tp->rcv_wnd * msk->pm.extra_subflows;
+ msk->rcv_rtt_est.rtt_us = rtt_us;
+ msk->scaling_ratio = sr;
+ return;
+ }
+
+ if (rtt_us < msk->rcv_rtt_est.rtt_us)
+ msk->rcv_rtt_est.rtt_us = rtt_us;
+ if (sr < msk->scaling_ratio)
+ msk->scaling_ratio = sr;
+}
+
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -881,6 +917,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
return;
mptcp_data_lock(sk);
+ mptcp_rcv_rtt_update(msk, subflow);
if (!sock_owned_by_user(sk)) {
/* Wake-up the reader only for in-sequence data */
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
@@ -2058,6 +2095,7 @@ static void mptcp_rcv_space_init(struct mptcp_sock *msk)
msk->rcvspace_init = 1;
mptcp_data_lock(sk);
+ msk->rcv_rtt_est.seq = atomic64_read(&msk->rcv_wnd_sent);
__mptcp_sync_rcvspace(sk);
/* Paranoid check: at least one subflow pushed data to the msk. */
@@ -2076,9 +2114,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- u8 scaling_ratio = U8_MAX;
- u32 time, advmss = 1;
- u64 rtt_us, mstamp;
+ u32 rtt_us, time;
+ u64 mstamp;
msk_owned_by_me(msk);
@@ -2093,29 +2130,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
mstamp = mptcp_stamp();
time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
- rtt_us = msk->rcvq_space.rtt_us;
- if (rtt_us && time < (rtt_us >> 3))
- return;
-
- rtt_us = 0;
- mptcp_for_each_subflow(msk, subflow) {
- const struct tcp_sock *tp;
- u64 sf_rtt_us;
- u32 sf_advmss;
-
- tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
-
- sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
- sf_advmss = READ_ONCE(tp->advmss);
-
- rtt_us = max(sf_rtt_us, rtt_us);
- advmss = max(sf_advmss, advmss);
- scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
- }
-
- msk->rcvq_space.rtt_us = rtt_us;
- msk->scaling_ratio = scaling_ratio;
- if (time < (rtt_us >> 3) || rtt_us == 0)
+ rtt_us = READ_ONCE(msk->rcv_rtt_est.rtt_us);
+ if (rtt_us == U32_MAX || time < (rtt_us >> 3))
return;
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
@@ -2941,7 +2957,8 @@ static void __mptcp_init_sock(struct sock *sk)
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
msk->backlog_len = 0;
msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
+ msk->rcv_rtt_est.rtt_us = U32_MAX;
+ msk->scaling_ratio = U8_MAX;
WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -3385,7 +3402,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
msk->bytes_retrans = 0;
msk->rcvspace_init = 0;
msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
+ msk->scaling_ratio = U8_MAX;
+ msk->rcv_rtt_est.rtt_us = U32_MAX;
/* for fallback's sake */
WRITE_ONCE(msk->ack_seq, 0);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index adc0851bad69..051f21b06d33 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -340,11 +340,14 @@ struct mptcp_sock {
*/
struct mptcp_pm_data pm;
struct mptcp_sched_ops *sched;
+ struct {
+ u32 rtt_us; /* Minimum RTT of subflows */
+ u64 seq; /* Refresh RTT after this seq */
+ } rcv_rtt_est;
struct {
int space; /* bytes copied in last measurement window */
int copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
- u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
u8 scaling_ratio;
bool allow_subflows;
@@ -523,6 +526,8 @@ struct mptcp_subflow_context {
u32 map_data_len;
__wsum map_data_csum;
u32 map_csum_len;
+ u32 prev_rtt_us;
+ u32 prev_rtt_seq;
u32 request_mptcp : 1, /* send MP_CAPABLE */
request_join : 1, /* send MP_JOIN */
request_bkup : 1,
--
2.51.0
© 2016 - 2025 Red Hat, Inc.