[PATCH mptcp-next v2] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()

Davide Caratti posted 1 patch 5 months, 1 week ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/multipath-tcp/mptcp_net-next tags/patchew/784668db3e21351c91f4f312720059c6fa5cc15d.1702290281.git.dcaratti@redhat.com
Maintainers: Matthieu Baerts <matttbe@kernel.org>, Mat Martineau <martineau@kernel.org>, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>
net/mptcp/protocol.c | 108 ++++++++++++++++++-------------------------
1 file changed, 44 insertions(+), 64 deletions(-)
[PATCH mptcp-next v2] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
Posted by Davide Caratti 5 months, 1 week ago
Eric Dumazet suggests:

 > The fact that mptcp_is_tcpsk() was able to write over sock->ops was a
 > bit strange to me.
 > mptcp_is_tcpsk() should answer a question, with a read-only argument.

Refactor the code to avoid overwriting sock_ops inside that function. Also,
change the helper name to reflect the semantics and to disambiguate from
its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and
mptcp_accept() into a single function, where fallback / non-fallback are
separated by a single sk_is_mptcp() conditional.

Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/mptcp/protocol.c | 108 ++++++++++++++++++-------------------------
 1 file changed, 44 insertions(+), 64 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4fc038d29623..b7cb60e6f0c3 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -55,28 +55,14 @@ u64 mptcp_wnd_end(const struct mptcp_sock *msk)
 	return READ_ONCE(msk->wnd_end);
 }
 
-static bool mptcp_is_tcpsk(struct sock *sk)
+static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
 {
-	struct socket *sock = sk->sk_socket;
-
-	if (unlikely(sk->sk_prot == &tcp_prot)) {
-		/* we are being invoked after mptcp_accept() has
-		 * accepted a non-mp-capable flow: sk is a tcp_sk,
-		 * not an mptcp one.
-		 *
-		 * Hand the socket over to tcp so all further socket ops
-		 * bypass mptcp.
-		 */
-		WRITE_ONCE(sock->ops, &inet_stream_ops);
-		return true;
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
-	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
-		WRITE_ONCE(sock->ops, &inet6_stream_ops);
-		return true;
+	if (sk->sk_prot == &tcpv6_prot)
+		return &inet6_stream_ops;
 #endif
-	}
-
-	return false;
+	WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
+	return &inet_stream_ops;
 }
 
 static int __mptcp_socket_create(struct mptcp_sock *msk)
@@ -3258,44 +3244,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
 	WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
 }
 
-static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
-				 bool kern)
-{
-	struct sock *newsk;
-
-	pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
-	newsk = inet_csk_accept(ssk, flags, err, kern);
-	if (!newsk)
-		return NULL;
-
-	pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
-	if (sk_is_mptcp(newsk)) {
-		struct mptcp_subflow_context *subflow;
-		struct sock *new_mptcp_sock;
-
-		subflow = mptcp_subflow_ctx(newsk);
-		new_mptcp_sock = subflow->conn;
-
-		/* is_mptcp should be false if subflow->conn is missing, see
-		 * subflow_syn_recv_sock()
-		 */
-		if (WARN_ON_ONCE(!new_mptcp_sock)) {
-			tcp_sk(newsk)->is_mptcp = 0;
-			goto out;
-		}
-
-		newsk = new_mptcp_sock;
-		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
-	} else {
-		MPTCP_INC_STATS(sock_net(ssk),
-				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
-	}
-
-out:
-	newsk->sk_kern_sock = kern;
-	return newsk;
-}
-
 void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
 {
 	struct mptcp_subflow_context *subflow, *tmp;
@@ -3738,7 +3686,6 @@ static struct proto mptcp_prot = {
 	.connect	= mptcp_connect,
 	.disconnect	= mptcp_disconnect,
 	.close		= mptcp_close,
-	.accept		= mptcp_accept,
 	.setsockopt	= mptcp_setsockopt,
 	.getsockopt	= mptcp_getsockopt,
 	.shutdown	= mptcp_shutdown,
@@ -3848,18 +3795,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 	if (!ssk)
 		return -EINVAL;
 
-	newsk = mptcp_accept(ssk, flags, &err, kern);
+	pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
+	newsk = inet_csk_accept(ssk, flags, &err, kern);
 	if (!newsk)
 		return err;
 
-	lock_sock(newsk);
-
-	__inet_accept(sock, newsock, newsk);
-	if (!mptcp_is_tcpsk(newsock->sk)) {
-		struct mptcp_sock *msk = mptcp_sk(newsk);
+	pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
+	if (sk_is_mptcp(newsk)) {
 		struct mptcp_subflow_context *subflow;
+		struct sock *new_mptcp_sock;
+
+		subflow = mptcp_subflow_ctx(newsk);
+		new_mptcp_sock = subflow->conn;
+
+		/* is_mptcp should be false if subflow->conn is missing, see
+		 * subflow_syn_recv_sock()
+		 */
+		if (WARN_ON_ONCE(!new_mptcp_sock)) {
+			tcp_sk(newsk)->is_mptcp = 0;
+			goto tcpfallback;
+		}
+
+		newsk = new_mptcp_sock;
+		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+
+		newsk->sk_kern_sock = kern;
+		lock_sock(newsk);
+		__inet_accept(sock, newsock, newsk);
 
 		set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+		msk = mptcp_sk(newsk);
 		msk->in_accept_queue = 0;
 
 		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
@@ -3881,6 +3846,21 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 			if (unlikely(list_is_singular(&msk->conn_list)))
 				inet_sk_state_store(newsk, TCP_CLOSE);
 		}
+	} else {
+		MPTCP_INC_STATS(sock_net(ssk),
+				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+tcpfallback:
+		newsk->sk_kern_sock = kern;
+		lock_sock(newsk);
+		__inet_accept(sock, newsock, newsk);
+		/* we are being invoked after accepting a non-mp-capable
+		 * flow: sk is a tcp_sk, not an mptcp one.
+		 *
+		 * Hand the socket over to tcp so all further socket ops
+		 * bypass mptcp.
+		 */
+		WRITE_ONCE(newsock->sk->sk_socket->ops,
+			   mptcp_fallback_tcp_ops(newsock->sk));
 	}
 	release_sock(newsk);
 
-- 
2.41.0
Re: [PATCH mptcp-next v2] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
Posted by Matthieu Baerts 5 months ago
Hi Davide, Paolo,

On 11/12/2023 11:29, Davide Caratti wrote:
> Eric Dumazet suggests:
> 
>  > The fact that mptcp_is_tcpsk() was able to write over sock->ops was a
>  > bit strange to me.
>  > mptcp_is_tcpsk() should answer a question, with a read-only argument.
> 
> re-factor code to avoid overwriting sock_ops inside that function. Also,
> change the helper name to reflect the semantics and to disambiguate from
> its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and
> mptcp_accept() into a single function, where fallback / non-fallback are
> separated into a single sk_is_mptcp() conditional.

Thank you for the patch and the review!

Now in our tree (feat. for net-next):

New patches for t/upstream:
- 65304cb7d742: mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
- Results: a1a139446105..bfda5dfb6a94 (export)

Tests are now in progress:

https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20231213T110841

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.
Re: [PATCH mptcp-next v2] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
Posted by Paolo Abeni 5 months ago
On Mon, 2023-12-11 at 11:29 +0100, Davide Caratti wrote:
> Eric Dumazet suggests:
> 
>  > The fact that mptcp_is_tcpsk() was able to write over sock->ops was a
>  > bit strange to me.
>  > mptcp_is_tcpsk() should answer a question, with a read-only argument.
> 
> re-factor code to avoid overwriting sock_ops inside that function. Also,
> change the helper name to reflect the semantics and to disambiguate from
> its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and
> mptcp_accept() into a single function, where fallback / non-fallback are
> separated into a single sk_is_mptcp() conditional.
> 
> Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432
> Suggested-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
>  net/mptcp/protocol.c | 108 ++++++++++++++++++-------------------------
>  1 file changed, 44 insertions(+), 64 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 4fc038d29623..b7cb60e6f0c3 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -55,28 +55,14 @@ u64 mptcp_wnd_end(const struct mptcp_sock *msk)
>  	return READ_ONCE(msk->wnd_end);
>  }
>  
> -static bool mptcp_is_tcpsk(struct sock *sk)
> +static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
>  {
> -	struct socket *sock = sk->sk_socket;
> -
> -	if (unlikely(sk->sk_prot == &tcp_prot)) {
> -		/* we are being invoked after mptcp_accept() has
> -		 * accepted a non-mp-capable flow: sk is a tcp_sk,
> -		 * not an mptcp one.
> -		 *
> -		 * Hand the socket over to tcp so all further socket ops
> -		 * bypass mptcp.
> -		 */
> -		WRITE_ONCE(sock->ops, &inet_stream_ops);
> -		return true;
>  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> -	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
> -		WRITE_ONCE(sock->ops, &inet6_stream_ops);
> -		return true;
> +	if (sk->sk_prot == &tcpv6_prot)
> +		return &inet6_stream_ops;
>  #endif
> -	}
> -
> -	return false;
> +	WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
> +	return &inet_stream_ops;
>  }
>  
>  static int __mptcp_socket_create(struct mptcp_sock *msk)
> @@ -3258,44 +3244,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
>  	WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
>  }
>  
> -static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
> -				 bool kern)
> -{
> -	struct sock *newsk;
> -
> -	pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
> -	newsk = inet_csk_accept(ssk, flags, err, kern);
> -	if (!newsk)
> -		return NULL;
> -
> -	pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
> -	if (sk_is_mptcp(newsk)) {
> -		struct mptcp_subflow_context *subflow;
> -		struct sock *new_mptcp_sock;
> -
> -		subflow = mptcp_subflow_ctx(newsk);
> -		new_mptcp_sock = subflow->conn;
> -
> -		/* is_mptcp should be false if subflow->conn is missing, see
> -		 * subflow_syn_recv_sock()
> -		 */
> -		if (WARN_ON_ONCE(!new_mptcp_sock)) {
> -			tcp_sk(newsk)->is_mptcp = 0;
> -			goto out;
> -		}
> -
> -		newsk = new_mptcp_sock;
> -		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
> -	} else {
> -		MPTCP_INC_STATS(sock_net(ssk),
> -				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
> -	}
> -
> -out:
> -	newsk->sk_kern_sock = kern;
> -	return newsk;
> -}
> -
>  void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
>  {
>  	struct mptcp_subflow_context *subflow, *tmp;
> @@ -3738,7 +3686,6 @@ static struct proto mptcp_prot = {
>  	.connect	= mptcp_connect,
>  	.disconnect	= mptcp_disconnect,
>  	.close		= mptcp_close,
> -	.accept		= mptcp_accept,
>  	.setsockopt	= mptcp_setsockopt,
>  	.getsockopt	= mptcp_getsockopt,
>  	.shutdown	= mptcp_shutdown,
> @@ -3848,18 +3795,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
>  	if (!ssk)
>  		return -EINVAL;
>  
> -	newsk = mptcp_accept(ssk, flags, &err, kern);
> +	pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
> +	newsk = inet_csk_accept(ssk, flags, &err, kern);
>  	if (!newsk)
>  		return err;
>  
> -	lock_sock(newsk);
> -
> -	__inet_accept(sock, newsock, newsk);
> -	if (!mptcp_is_tcpsk(newsock->sk)) {
> -		struct mptcp_sock *msk = mptcp_sk(newsk);
> +	pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
> +	if (sk_is_mptcp(newsk)) {
>  		struct mptcp_subflow_context *subflow;
> +		struct sock *new_mptcp_sock;
> +
> +		subflow = mptcp_subflow_ctx(newsk);
> +		new_mptcp_sock = subflow->conn;
> +
> +		/* is_mptcp should be false if subflow->conn is missing, see
> +		 * subflow_syn_recv_sock()
> +		 */
> +		if (WARN_ON_ONCE(!new_mptcp_sock)) {
> +			tcp_sk(newsk)->is_mptcp = 0;
> +			goto tcpfallback;
> +		}
> +
> +		newsk = new_mptcp_sock;
> +		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
> +
> +		newsk->sk_kern_sock = kern;
> +		lock_sock(newsk);
> +		__inet_accept(sock, newsock, newsk);
>  
>  		set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
> +		msk = mptcp_sk(newsk);
>  		msk->in_accept_queue = 0;
>  
>  		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
> @@ -3881,6 +3846,21 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
>  			if (unlikely(list_is_singular(&msk->conn_list)))
>  				inet_sk_state_store(newsk, TCP_CLOSE);
>  		}
> +	} else {
> +		MPTCP_INC_STATS(sock_net(ssk),
> +				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
> +tcpfallback:
> +		newsk->sk_kern_sock = kern;
> +		lock_sock(newsk);
> +		__inet_accept(sock, newsock, newsk);
> +		/* we are being invoked after accepting a non-mp-capable
> +		 * flow: sk is a tcp_sk, not an mptcp one.
> +		 *
> +		 * Hand the socket over to tcp so all further socket ops
> +		 * bypass mptcp.
> +		 */
> +		WRITE_ONCE(newsock->sk->sk_socket->ops,
> +			   mptcp_fallback_tcp_ops(newsock->sk));
>  	}
>  	release_sock(newsk);
>  
LGTM! thanks Davide!

Acked-by: Paolo Abeni <pabeni@redhat.com>

BTW, the CI failures look real, but unrelated?
Re: [PATCH mptcp-next v2] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
Posted by Paolo Abeni 5 months ago
On Tue, 2023-12-12 at 17:56 +0100, Paolo Abeni wrote:
> btw the CI failures look real, but unrelated ?!?

They are triggered by a recent upstream commit:

commit 3d501dd326fb1c73f1b8206d4c6e1d7b15c07e27
Author: Eric Dumazet <edumazet@google.com>
Date:   Tue Dec 5 16:18:41 2023 +0000

    tcp: do not accept ACK of bytes we never sent

but the root cause is a packetdrill script bug: the ACK field was not set
correctly on some injected packets. I'll send a patch.

Cheers,

Paolo