net/mptcp/protocol.c | 110 ++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 64 deletions(-)
Eric Dumazet suggests:
> The fact that mptcp_is_tcpsk() was able to write over sock->ops was a
> bit strange to me.
> mptcp_is_tcpsk() should answer a question, with a read-only argument.
Re-factor the code to avoid overwriting sock->ops inside that function. Also,
change the helper name to reflect the semantics and to disambiguate from
its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and
mptcp_accept() into a single function, where fallback / non-fallback are
separated into a single sk_is_mptcp() conditional.
Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
net/mptcp/protocol.c | 110 ++++++++++++++++++-------------------------
1 file changed, 46 insertions(+), 64 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6956fde61c68d..31822a4fb16e8 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -55,28 +55,16 @@ u64 mptcp_wnd_end(const struct mptcp_sock *msk)
return READ_ONCE(msk->wnd_end);
}
-static bool mptcp_is_tcpsk(struct sock *sk)
+static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
{
- struct socket *sock = sk->sk_socket;
-
- if (unlikely(sk->sk_prot == &tcp_prot)) {
- /* we are being invoked after mptcp_accept() has
- * accepted a non-mp-capable flow: sk is a tcp_sk,
- * not an mptcp one.
- *
- * Hand the socket over to tcp so all further socket ops
- * bypass mptcp.
- */
- WRITE_ONCE(sock->ops, &inet_stream_ops);
- return true;
+ if (sk->sk_prot == &tcp_prot)
+ return &inet_stream_ops;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
- WRITE_ONCE(sock->ops, &inet6_stream_ops);
- return true;
+ else if (sk->sk_prot == &tcpv6_prot)
+ return &inet6_stream_ops;
#endif
- }
-
- return false;
+ WARN_ON_ONCE(1);
+ return sk->sk_socket->ops;
}
static int __mptcp_socket_create(struct mptcp_sock *msk)
@@ -3258,44 +3246,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}
-static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
- bool kern)
-{
- struct sock *newsk;
-
- pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
- newsk = inet_csk_accept(ssk, flags, err, kern);
- if (!newsk)
- return NULL;
-
- pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
- if (sk_is_mptcp(newsk)) {
- struct mptcp_subflow_context *subflow;
- struct sock *new_mptcp_sock;
-
- subflow = mptcp_subflow_ctx(newsk);
- new_mptcp_sock = subflow->conn;
-
- /* is_mptcp should be false if subflow->conn is missing, see
- * subflow_syn_recv_sock()
- */
- if (WARN_ON_ONCE(!new_mptcp_sock)) {
- tcp_sk(newsk)->is_mptcp = 0;
- goto out;
- }
-
- newsk = new_mptcp_sock;
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
- } else {
- MPTCP_INC_STATS(sock_net(ssk),
- MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
- }
-
-out:
- newsk->sk_kern_sock = kern;
- return newsk;
-}
-
void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
{
struct mptcp_subflow_context *subflow, *tmp;
@@ -3739,7 +3689,6 @@ static struct proto mptcp_prot = {
.connect = mptcp_connect,
.disconnect = mptcp_disconnect,
.close = mptcp_close,
- .accept = mptcp_accept,
.setsockopt = mptcp_setsockopt,
.getsockopt = mptcp_getsockopt,
.shutdown = mptcp_shutdown,
@@ -3849,18 +3798,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssk)
return -EINVAL;
- newsk = mptcp_accept(ssk, flags, &err, kern);
+ pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
+ newsk = inet_csk_accept(ssk, flags, &err, kern);
if (!newsk)
return err;
- lock_sock(newsk);
-
- __inet_accept(sock, newsock, newsk);
- if (!mptcp_is_tcpsk(newsock->sk)) {
- struct mptcp_sock *msk = mptcp_sk(newsk);
+ pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
+ if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
+ struct sock *new_mptcp_sock;
+
+ subflow = mptcp_subflow_ctx(newsk);
+ new_mptcp_sock = subflow->conn;
+
+ /* is_mptcp should be false if subflow->conn is missing, see
+ * subflow_syn_recv_sock()
+ */
+ if (WARN_ON_ONCE(!new_mptcp_sock)) {
+ tcp_sk(newsk)->is_mptcp = 0;
+ goto out;
+ }
+
+ newsk = new_mptcp_sock;
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+
+ newsk->sk_kern_sock = kern;
+ lock_sock(newsk);
+ __inet_accept(sock, newsock, newsk);
set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+ msk = mptcp_sk(newsk);
msk->in_accept_queue = 0;
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
@@ -3882,6 +3849,21 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (unlikely(list_is_singular(&msk->conn_list)))
inet_sk_state_store(newsk, TCP_CLOSE);
}
+ } else {
+ MPTCP_INC_STATS(sock_net(ssk),
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+out:
+ newsk->sk_kern_sock = kern;
+ lock_sock(newsk);
+ __inet_accept(sock, newsock, newsk);
+ /* we are being invoked after accepting a non-mp-capable
+ * flow: sk is a tcp_sk, not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ WRITE_ONCE(newsock->sk->sk_socket->ops,
+ mptcp_fallback_tcp_ops(newsock->sk));
}
release_sock(newsk);
--
2.27.0
Hi Davide,
Thank you for your modifications, that's great!
Our CI did some validations and here is its report:
- KVM Validation: normal (except selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/5343194284556288
- Summary: https://api.cirrus-ci.com/v1/artifact/task/5343194284556288/summary/summary.txt
- KVM Validation: debug (except selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/5061719307845632
- Summary: https://api.cirrus-ci.com/v1/artifact/task/5061719307845632/summary/summary.txt
- KVM Validation: debug (only selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/6187619214688256
- Summary: https://api.cirrus-ci.com/v1/artifact/task/6187619214688256/summary/summary.txt
- KVM Validation: normal (only selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/6469094191398912
- Summary: https://api.cirrus-ci.com/v1/artifact/task/6469094191398912/summary/summary.txt
Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/d8d6f517221b
If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:
$ cd [kernel source code]
$ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
--pull always mptcp/mptcp-upstream-virtme-docker:latest \
auto-debug
For more details:
https://github.com/multipath-tcp/mptcp-upstream-virtme-docker
Please note that, despite all the efforts already made to have a
stable test suite when executed on a public CI like this one, it is possible that some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)
Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (NGI0 Core)
Hi,
On Tue, 2023-12-05 at 10:12 +0100, Davide Caratti wrote:
> Eric Dumazet suggests:
>
> > The fact that mptcp_is_tcpsk() was able to write over sock->ops was a
> > bit strange to me.
> > mptcp_is_tcpsk() should answer a question, with a read-only argument.
>
> re-factor code to avoid overwriting sock_ops inside that function. Also,
> change the helper name to reflect the semantics and to disambiguate from
> its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and
> mptcp_accept() into a single function, where fallback / non-fallback are
> separated into a single sk_is_mptcp() conditional.
> Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432
> Suggested-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Nice! This avoids a few conditionals in the fast path!
> ---
> net/mptcp/protocol.c | 110 ++++++++++++++++++-------------------------
> 1 file changed, 46 insertions(+), 64 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 6956fde61c68d..31822a4fb16e8 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -55,28 +55,16 @@ u64 mptcp_wnd_end(const struct mptcp_sock *msk)
> return READ_ONCE(msk->wnd_end);
> }
>
> -static bool mptcp_is_tcpsk(struct sock *sk)
> +static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
> {
> - struct socket *sock = sk->sk_socket;
> -
> - if (unlikely(sk->sk_prot == &tcp_prot)) {
> - /* we are being invoked after mptcp_accept() has
> - * accepted a non-mp-capable flow: sk is a tcp_sk,
> - * not an mptcp one.
> - *
> - * Hand the socket over to tcp so all further socket ops
> - * bypass mptcp.
> - */
> - WRITE_ONCE(sock->ops, &inet_stream_ops);
> - return true;
> + if (sk->sk_prot == &tcp_prot)
> + return &inet_stream_ops;
> #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
> - WRITE_ONCE(sock->ops, &inet6_stream_ops);
> - return true;
> + else if (sk->sk_prot == &tcpv6_prot)
> + return &inet6_stream_ops;
> #endif
> - }
> -
> - return false;
> + WARN_ON_ONCE(1);
> + return sk->sk_socket->ops;
Minor nit: you can avoid an additional conditional by changing the order
of the above tests:
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (sk->sk_prot == &tcpv6_prot)
return &inet6_stream_ops;
#endif
WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
return &inet_stream_ops;
Returning &inet_stream_ops for non-TCPv4 sockets would likely lead to
later problems, but that WARN is there only to protect against unknown or
unforeseen serious bugs. Anything that happens after such a WARN is really
doomed, and even the 'return sk->sk_socket->ops;' present in the current
form of this patch will cause trouble later.
> }
>
> static int __mptcp_socket_create(struct mptcp_sock *msk)
> @@ -3258,44 +3246,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
> WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
> }
>
> -static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
> - bool kern)
> -{
> - struct sock *newsk;
> -
> - pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
> - newsk = inet_csk_accept(ssk, flags, err, kern);
> - if (!newsk)
> - return NULL;
> -
> - pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
> - if (sk_is_mptcp(newsk)) {
> - struct mptcp_subflow_context *subflow;
> - struct sock *new_mptcp_sock;
> -
> - subflow = mptcp_subflow_ctx(newsk);
> - new_mptcp_sock = subflow->conn;
> -
> - /* is_mptcp should be false if subflow->conn is missing, see
> - * subflow_syn_recv_sock()
> - */
> - if (WARN_ON_ONCE(!new_mptcp_sock)) {
> - tcp_sk(newsk)->is_mptcp = 0;
> - goto out;
> - }
> -
> - newsk = new_mptcp_sock;
> - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
> - } else {
> - MPTCP_INC_STATS(sock_net(ssk),
> - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
> - }
> -
> -out:
> - newsk->sk_kern_sock = kern;
> - return newsk;
> -}
> -
> void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
> {
> struct mptcp_subflow_context *subflow, *tmp;
> @@ -3739,7 +3689,6 @@ static struct proto mptcp_prot = {
> .connect = mptcp_connect,
> .disconnect = mptcp_disconnect,
> .close = mptcp_close,
> - .accept = mptcp_accept,
> .setsockopt = mptcp_setsockopt,
> .getsockopt = mptcp_getsockopt,
> .shutdown = mptcp_shutdown,
> @@ -3849,18 +3798,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
> if (!ssk)
> return -EINVAL;
>
> - newsk = mptcp_accept(ssk, flags, &err, kern);
> + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
> + newsk = inet_csk_accept(ssk, flags, &err, kern);
> if (!newsk)
> return err;
>
> - lock_sock(newsk);
> -
> - __inet_accept(sock, newsock, newsk);
> - if (!mptcp_is_tcpsk(newsock->sk)) {
> - struct mptcp_sock *msk = mptcp_sk(newsk);
> + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
> + if (sk_is_mptcp(newsk)) {
> struct mptcp_subflow_context *subflow;
> + struct sock *new_mptcp_sock;
> +
> + subflow = mptcp_subflow_ctx(newsk);
> + new_mptcp_sock = subflow->conn;
> +
> + /* is_mptcp should be false if subflow->conn is missing, see
> + * subflow_syn_recv_sock()
> + */
> + if (WARN_ON_ONCE(!new_mptcp_sock)) {
> + tcp_sk(newsk)->is_mptcp = 0;
> + goto out;
very minor nit: the label 'out' above could possibly be renamed to
something more descriptive, e.g. 'fallback'.
> + }
> +
> + newsk = new_mptcp_sock;
> + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
> +
> + newsk->sk_kern_sock = kern;
> + lock_sock(newsk);
> + __inet_accept(sock, newsock, newsk);
>
> set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
> + msk = mptcp_sk(newsk);
> msk->in_accept_queue = 0;
>
> /* set ssk->sk_socket of accept()ed flows to mptcp socket.
> @@ -3882,6 +3849,21 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
> if (unlikely(list_is_singular(&msk->conn_list)))
> inet_sk_state_store(newsk, TCP_CLOSE);
> }
> + } else {
> + MPTCP_INC_STATS(sock_net(ssk),
> + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
> +out:
> + newsk->sk_kern_sock = kern;
> + lock_sock(newsk);
> + __inet_accept(sock, newsock, newsk);
> + /* we are being invoked after accepting a non-mp-capable
> + * flow: sk is a tcp_sk, not an mptcp one.
> + *
> + * Hand the socket over to tcp so all further socket ops
> + * bypass mptcp.
> + */
> + WRITE_ONCE(newsock->sk->sk_socket->ops,
> + mptcp_fallback_tcp_ops(newsock->sk));
> }
> release_sock(newsk);
>
Neither note is intended to block the patch; feel free to ignore them.
In any case:
Acked-by: Paolo Abeni <pabeni@redhat.com>
hi Paolo, thanks for reviewing!
On Tue, Dec 5, 2023 at 6:10 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> On Tue, 2023-12-05 at 10:12 +0100, Davide Caratti wrote:
> > Eric Dumazet suggests:
[...]
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 6956fde61c68d..31822a4fb16e8 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -55,28 +55,16 @@ u64 mptcp_wnd_end(const struct mptcp_sock *msk)
> > return READ_ONCE(msk->wnd_end);
> > }
> >
> > -static bool mptcp_is_tcpsk(struct sock *sk)
> > +static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
> > {
> > - struct socket *sock = sk->sk_socket;
> > -
> > - if (unlikely(sk->sk_prot == &tcp_prot)) {
> > - /* we are being invoked after mptcp_accept() has
> > - * accepted a non-mp-capable flow: sk is a tcp_sk,
> > - * not an mptcp one.
> > - *
> > - * Hand the socket over to tcp so all further socket ops
> > - * bypass mptcp.
> > - */
> > - WRITE_ONCE(sock->ops, &inet_stream_ops);
> > - return true;
> > + if (sk->sk_prot == &tcp_prot)
> > + return &inet_stream_ops;
> > #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> > - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
> > - WRITE_ONCE(sock->ops, &inet6_stream_ops);
> > - return true;
> > + else if (sk->sk_prot == &tcpv6_prot)
> > + return &inet6_stream_ops;
> > #endif
> > - }
> > -
> > - return false;
> > + WARN_ON_ONCE(1);
> > + return sk->sk_socket->ops;
>
> minor nit: you can avoid an additional conditional changing the order
> of the above tests:
it's not avoided, it's just evaluated inside the WARN_ON_ONCE() :-)
> #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> if (sk->sk_prot == &tcpv6_prot)
> return &inet6_stream_ops;
> #endif
>
> WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
> return &inet_stream_ops;
>
> Returning &inet_stream_ops for non tcpv4 sockets would likely lead to
> later problems, but that warn is there only to protect from unknown/not
> foreseen serious bugs. Anything that happens after such warn is really
> doomed and even the 'return sk->sk_socket->ops;' present in the current
> form of this patch will cause trouble later.
I cowardly decided to let the helper return the current value of ->ops
along with the WARN_ON_ONCE(1) line, with the goal of analyzing the splat on a
live system, in the hope of not having to do it on a crashed kernel. But
then it didn't happen in 5 iterations of kselftests + packetdrill,
so... we can probably shorten the function as you suggested, i.e.:
WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
return &inet_stream_ops;
[...]
> > @@ -3849,18 +3798,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
> > if (!ssk)
> > return -EINVAL;
> >
> > - newsk = mptcp_accept(ssk, flags, &err, kern);
> > + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
> > + newsk = inet_csk_accept(ssk, flags, &err, kern);
> > if (!newsk)
> > return err;
> >
> > - lock_sock(newsk);
> > -
> > - __inet_accept(sock, newsock, newsk);
> > - if (!mptcp_is_tcpsk(newsock->sk)) {
> > - struct mptcp_sock *msk = mptcp_sk(newsk);
> > + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
> > + if (sk_is_mptcp(newsk)) {
> > struct mptcp_subflow_context *subflow;
> > + struct sock *new_mptcp_sock;
> > +
> > + subflow = mptcp_subflow_ctx(newsk);
> > + new_mptcp_sock = subflow->conn;
> > +
> > + /* is_mptcp should be false if subflow->conn is missing, see
> > + * subflow_syn_recv_sock()
> > + */
> > + if (WARN_ON_ONCE(!new_mptcp_sock)) {
> > + tcp_sk(newsk)->is_mptcp = 0;
> > + goto out;
>
> very minor nit: the label 'out' above could possibly be renamed to
> something more descriptive, e.g. 'fallback'.
Right, I'll send a v3 in the next few hours.
> In any case:
>
> Acked-by: Paolo Abeni <pabeni@redhat.com>
thanks a lot!
--
davide
Hi Davide,
Thank you for your modifications, that's great!
Our CI did some validations and here is its report:
- KVM Validation: normal (except selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/6556569052119040
- Summary: https://api.cirrus-ci.com/v1/artifact/task/6556569052119040/summary/summary.txt
- KVM Validation: debug (only selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/5289931656921088
- Summary: https://api.cirrus-ci.com/v1/artifact/task/5289931656921088/summary/summary.txt
- KVM Validation: debug (except selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/5852881610342400
- Summary: https://api.cirrus-ci.com/v1/artifact/task/5852881610342400/summary/summary.txt
- KVM Validation: normal (only selftest_mptcp_join):
- Success! ✅:
- Task: https://cirrus-ci.com/task/4726981703499776
- Summary: https://api.cirrus-ci.com/v1/artifact/task/4726981703499776/summary/summary.txt
Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/68ead4ff9b66
If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:
$ cd [kernel source code]
$ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
--pull always mptcp/mptcp-upstream-virtme-docker:latest \
auto-debug
For more details:
https://github.com/multipath-tcp/mptcp-upstream-virtme-docker
Please note that, despite all the efforts already made to have a
stable test suite when executed on a public CI like this one, it is possible that some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)
Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (NGI0 Core)
© 2016 - 2026 Red Hat, Inc.