As per RFC, the offered MPTCP-level window should never shrink.
While we currently track the right edge, we don't enforce the
above constraint on the wire.
Additionally, concurrent xmit on different subflows can end-up in
erroneous right edge update.
Address the above explicitly updating the announced window and
protecting the update with an additional atomic operation (sic)
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
RFC -> v1:
- rebased on previous patch
---
net/mptcp/options.c | 52 +++++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 7 deletions(-)
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 2570911735ab..9c76a171af1e 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1224,20 +1224,58 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true;
}
-static void mptcp_set_rwin(const struct tcp_sock *tp)
+static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
{
const struct sock *ssk = (const struct sock *)tp;
- const struct mptcp_subflow_context *subflow;
+ struct mptcp_subflow_context *subflow;
+ u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
struct mptcp_sock *msk;
- u64 ack_seq;
+ u32 new_win;
+ u64 win;
subflow = mptcp_subflow_ctx(ssk);
msk = mptcp_sk(subflow->conn);
- ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+ ack_seq = READ_ONCE(msk->ack_seq);
+ rcv_wnd_new = ack_seq + tp->rcv_wnd;
+
+ rcv_wnd_old = READ_ONCE(msk->rcv_wnd_sent);
+ if (after64(rcv_wnd_new, rcv_wnd_old)) {
+ u64 rcv_wnd;
+
+ for (;;) {
+ rcv_wnd = cmpxchg64(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
+
+ if (rcv_wnd == rcv_wnd_old)
+ break;
+ if (before64(rcv_wnd_new, rcv_wnd))
+ goto raise_win;
+ rcv_wnd_old = rcv_wnd;
+ };
+ return;
+ }
+
+ if (rcv_wnd_new != rcv_wnd_old) {
+raise_win:
+ win = rcv_wnd_old - ack_seq;
+ tp->rcv_wnd = min_t(u64, win, U32_MAX);
+ new_win = tp->rcv_wnd;
- if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
- WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+ /* Make sure we do not exceed the maximum possible
+ * scaled window.
+ */
+ if (unlikely(th->syn))
+ new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
+ if (!tp->rx_opt.rcv_wscale &&
+ sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
+ new_win = min(new_win, MAX_TCP_WINDOW);
+ else
+ new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rx_opt.rcv_wscale;
+ th->window = htons(new_win);
+ }
}
u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
@@ -1554,7 +1592,7 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
}
if (tp)
- mptcp_set_rwin(tp);
+ mptcp_set_rwin(tp, th);
}
__be32 mptcp_get_reset_option(const struct sk_buff *skb)
--
2.35.1
On Fri, 15 Apr 2022, Paolo Abeni wrote:
> As per RFC, the offered MPTCP-level window should never shrink.
> While we currently track the right edge, we don't enforce the
> above constraint on the wire.
> Additionally, concurrent xmit on different subflows can end-up in
> erroneous right edge update.
> Address the above explicitly updating the announced window and
> protecting the update with an additional atomic operation (sic)
By "(sic)", are you saying the new atomic operation is consistent with the
use of an atomic write in the previously written code?
I'm accustomed to "sic" meaning "there's a misspelling in the previous few
words that have been preserved from some copied text" when I encounter it
in English text, so I figure the Latin word may have slightly different
meaning here!
But, aside from the commit message, the code seems to build & test out
fine - so just the one code question in patch 3.
- Mat
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> RFC -> v1:
> - rebased on previous patch
> ---
> net/mptcp/options.c | 52 +++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 45 insertions(+), 7 deletions(-)
>
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 2570911735ab..9c76a171af1e 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -1224,20 +1224,58 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
> return true;
> }
>
> -static void mptcp_set_rwin(const struct tcp_sock *tp)
> +static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
> {
> const struct sock *ssk = (const struct sock *)tp;
> - const struct mptcp_subflow_context *subflow;
> + struct mptcp_subflow_context *subflow;
> + u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
> struct mptcp_sock *msk;
> - u64 ack_seq;
> + u32 new_win;
> + u64 win;
>
> subflow = mptcp_subflow_ctx(ssk);
> msk = mptcp_sk(subflow->conn);
>
> - ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
> + ack_seq = READ_ONCE(msk->ack_seq);
> + rcv_wnd_new = ack_seq + tp->rcv_wnd;
> +
> + rcv_wnd_old = READ_ONCE(msk->rcv_wnd_sent);
> + if (after64(rcv_wnd_new, rcv_wnd_old)) {
> + u64 rcv_wnd;
> +
> + for (;;) {
> + rcv_wnd = cmpxchg64(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
> +
> + if (rcv_wnd == rcv_wnd_old)
> + break;
> + if (before64(rcv_wnd_new, rcv_wnd))
> + goto raise_win;
> + rcv_wnd_old = rcv_wnd;
> + };
> + return;
> + }
> +
> + if (rcv_wnd_new != rcv_wnd_old) {
> +raise_win:
> + win = rcv_wnd_old - ack_seq;
> + tp->rcv_wnd = min_t(u64, win, U32_MAX);
> + new_win = tp->rcv_wnd;
>
> - if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
> - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> + /* Make sure we do not exceed the maximum possible
> + * scaled window.
> + */
> + if (unlikely(th->syn))
> + new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
> + if (!tp->rx_opt.rcv_wscale &&
> + sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
> + new_win = min(new_win, MAX_TCP_WINDOW);
> + else
> + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
> +
> + /* RFC1323 scaling applied */
> + new_win >>= tp->rx_opt.rcv_wscale;
> + th->window = htons(new_win);
> + }
> }
>
> u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
> @@ -1554,7 +1592,7 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
> }
>
> if (tp)
> - mptcp_set_rwin(tp);
> + mptcp_set_rwin(tp, th);
> }
>
> __be32 mptcp_get_reset_option(const struct sk_buff *skb)
> --
> 2.35.1
>
>
>
--
Mat Martineau
Intel
On Mon, 2022-04-18 at 17:29 -0700, Mat Martineau wrote:
> On Fri, 15 Apr 2022, Paolo Abeni wrote:
>
> > As per RFC, the offered MPTCP-level window should never shrink.
> > While we currently track the right edge, we don't enforce the
> > above constraint on the wire.
> > Additionally, concurrent xmit on different subflows can end-up in
> > erroneous right edge update.
> > Address the above explicitly updating the announced window and
> > protecting the update with an additional atomic operation (sic)
>
> By "(sic)", are you saying the new atomic operation is consistent with the
> use of an atomic write in the previously written code?
>
> I'm accustomed to "sic" meaning "there's a misspelling in the previous few
> words that have been preserved from some copied text" when I encounter it
> in English text, so I figure the Latin word may have slightly different
> meaning here!
TL;DR: I guess we can drop '(sic)' from the commit message.
"sic" is the short form of "Sic et simpliciter", literally meaning
"it's exactly like that", often used as you noted to declare that an
apparent typo is actually intended to be there.
Here the above usage is extented to generally remark we do not like
whatever stated proviously, even if it's exactly like what we just
wrote: "Unfortunally it's like that".
The latter usage is somewhat e neologism due to the assonance with a
comics book onomatopoeia ("sigh")
:)
/P
© 2016 - 2026 Red Hat, Inc.