From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 034876AAE; Mon, 23 Oct 2023 20:45:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="jHnAtImy" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7020CC433CA; Mon, 23 Oct 2023 20:45:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093933; bh=o1HCQ1dOhPEC+TSq7N6CCYqvYtbLIm8iE9Kh+2gyczg=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=jHnAtImyKwZTBYcT/jMHG3/vyR55ck0Z5QT/qR1LdU2/mwDiWkyZf/zi8CemwqFEu FbHa9V9R532FnCNpiW9DYkEzDyuCV7UAtElj+4vds3d6zgAevAhRPdL6TexEL6cGHa Eo6eMAWlwOVMmk/xy6M8rqcvg5NQSoS0U9W+6bSRVqyHqdYLMOzIyj03pkKNpMIlTZ d/7SljtxgIrfDC5QcgRIe36WpCZGKkZYUwEKK0ZigIDpNwNLsGIgJVuP1Lvqd/KOy7 xTvs5WSJ5JC2vuUzuACPDNIQ1l9i1hXN3JzOT3io9y7UJuboQBgjpfcB5IPVKo3GcA 6p1G4XH1g2N6w== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:34 -0700 Subject: [PATCH net-next 1/9] mptcp: add a new sysctl for make after break timeout Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-1-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. 
Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni The MPTCP protocol allows sockets with no alive subflows to stay in ESTABLISHED status for a user-defined timeout, to allow for later subflows creation. Currently such timeout is constant - TCP_TIMEWAIT_LEN. Let the user-space configure it via a newly added sysctl, to better cope with busy servers and simplify (make them faster) the relevant pktdrill tests. Note that the new knob does not apply to orphaned MPTCP sockets waiting for the data_fin handshake completion: they always wait TCP_TIMEWAIT_LEN. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- Documentation/networking/mptcp-sysctl.rst | 11 +++++++++++ net/mptcp/ctrl.c | 16 ++++++++++++++++ net/mptcp/protocol.c | 6 +++--- net/mptcp/protocol.h | 1 + 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/netw= orking/mptcp-sysctl.rst index 15f1919d640c..69975ce25a02 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -25,6 +25,17 @@ add_addr_timeout - INTEGER (seconds) =20 Default: 120 =20 +close_timeout - INTEGER (seconds) + Set the make-after-break timeout: in absence of any close or + shutdown syscall, MPTCP sockets will maintain the status + unchanged for such time, after the last subflow removal, before + moving to TCP_CLOSE. + + The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace + sysctl. + + Default: 60 + checksum_enabled - BOOLEAN Control whether DSS checksum can be enabled. 
=20 diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index e72b518c5d02..13fe0748dde8 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -27,6 +27,7 @@ struct mptcp_pernet { #endif =20 unsigned int add_addr_timeout; + unsigned int close_timeout; unsigned int stale_loss_cnt; u8 mptcp_enabled; u8 checksum_enabled; @@ -65,6 +66,13 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net) return mptcp_get_pernet(net)->stale_loss_cnt; } =20 +unsigned int mptcp_close_timeout(const struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + return TCP_TIMEWAIT_LEN; + return mptcp_get_pernet(sock_net(sk))->close_timeout; +} + int mptcp_get_pm_type(const struct net *net) { return mptcp_get_pernet(net)->pm_type; @@ -79,6 +87,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet= *pernet) { pernet->mptcp_enabled =3D 1; pernet->add_addr_timeout =3D TCP_RTO_MAX; + pernet->close_timeout =3D TCP_TIMEWAIT_LEN; pernet->checksum_enabled =3D 0; pernet->allow_join_initial_addr_port =3D 1; pernet->stale_loss_cnt =3D 4; @@ -141,6 +150,12 @@ static struct ctl_table mptcp_sysctl_table[] =3D { .mode =3D 0644, .proc_handler =3D proc_dostring, }, + { + .procname =3D "close_timeout", + .maxlen =3D sizeof(unsigned int), + .mode =3D 0644, + .proc_handler =3D proc_dointvec_jiffies, + }, {} }; =20 @@ -163,6 +178,7 @@ static int mptcp_pernet_new_table(struct net *net, stru= ct mptcp_pernet *pernet) table[4].data =3D &pernet->stale_loss_cnt; table[5].data =3D &pernet->pm_type; table[6].data =3D &pernet->scheduler; + table[7].data =3D &pernet->close_timeout; =20 hdr =3D register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 886ab689a8ae..a21f8ed26343 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2391,8 +2391,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct= sock *ssk, if (msk->in_accept_queue && msk->first =3D=3D ssk && (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, 
SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ - mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1)); sock_set_flag(sk, SOCK_DEAD); + mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; @@ -2516,7 +2516,7 @@ static bool mptcp_close_tout_expired(const struct soc= k *sk) return false; =20 return time_after32(tcp_jiffies32, - inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN); + inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk)); } =20 static void mptcp_check_fastclose(struct mptcp_sock *msk) @@ -2659,7 +2659,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, u= nsigned long fail_tout) return; =20 close_timeout =3D inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32= + jiffies + - TCP_TIMEWAIT_LEN; + mptcp_close_timeout(sk); =20 /* the close timeout takes precedence on the fail one, and here at least = one of * them is active diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3612545fa62e..02556921bc6c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -613,6 +613,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct ne= t *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); +unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 40C1C8481; Mon, 23 Oct 2023 
20:45:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="sQW4jUKa" Received: by smtp.kernel.org (Postfix) with ESMTPSA id A2B27C43395; Mon, 23 Oct 2023 20:45:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093933; bh=GZ2U3nahnSpjVZfHanU6T5qe2gYwnIdXa4rzFxk9ipg=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=sQW4jUKaqI9xAi29mP87H8Rqh6kB3ho89hXKFOSzGn6Gs6LRqFPTXnp0AYjR3WpIC 83z7QdKD6ZGhVU4Bl1OU8DLooMn/166nbkzER9mIorska+9tDypyxLkNJBDnodXXbC Nant98w/XD9z4RKSJvp4HNTsCFZNfh0tf1TUMiGKxHWHqtb3Zf3k3cvSsu5BBf9lYG EhUB/J1lISk6A1d9QhgWD1T2DoUn9OqG4ptY3Wd/PjCYPoYi92gfMYN7fR9SZMoKw1 /U+Gbm7p7WaPdEWsdMLGHJ1fgi5/iYnFSQ+L+mH0rxh9R876DIJzMicPG12T/TqjjT pQNBcyjHIgm0Q== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:35 -0700 Subject: [PATCH net-next 2/9] mptcp: properly account fastopen data Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-2-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni Currently the socket level counter aggregating the received data does not take into account the data received via fastopen. Address the issue by updating the counter as required. 
Fixes: 38967f424b5b ("mptcp: track some aggregate data counters") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/fastopen.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index bceaab8dd8e4..74698582a285 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -52,6 +52,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptc= p_subflow_context *subf =20 mptcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); + mptcp_sk(sk)->bytes_received +=3D skb->len; =20 sk->sk_data_ready(sk); =20 --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 663E410A3A; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="lHJ9eZlB" Received: by smtp.kernel.org (Postfix) with ESMTPSA id D6182C433A9; Mon, 23 Oct 2023 20:45:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093934; bh=ysUP4nvCD160glmMwIKJetbxE7xcYUI9k6oScnJoQ1M=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=lHJ9eZlBKWIvla4Px1gzPDWPouuF80eOqUP1OA3Sp781IXhnG0FbNdzZBCFaoj05M O0nImTKtBb4dRtrVcq+uNLTjJK2tajE7QtvqeXzFHc1+3g+IkrlZ5IJV6mTAawzeP3 Xx+tf4ODIka5Xo9jbD+S8ayIAH9SGetJFn2rsebdVe9MZiDiTgg910hRVneUrra23g 4ZulH9MqOQva5hKBV5N4DgBgdTaKWd+fXm7Y274bNgK7ZPvqral1AsB7vlzGf6ffgF FONhbGCXNGN8cfAgkOkhtOudhULW9vMwjKYeVipRnXqGC8m+8SJ7nBal7BIspqYbJa k6tz1Ik5mYMug== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:36 -0700 Subject: [PATCH net-next 3/9] mptcp: use plain bool instead of custom binary enum Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: 
MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-3-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni The 'data_avail' subflow field is already used as a plain boolean, drop the custom binary enum type and switch to bool. No functional change intended. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/protocol.h | 7 +------ net/mptcp/subflow.c | 12 ++++++------ 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 02556921bc6c..4c9e7ade160d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -432,11 +432,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk) return (struct mptcp_subflow_request_sock *)rsk; } =20 -enum mptcp_data_avail { - MPTCP_SUBFLOW_NODATA, - MPTCP_SUBFLOW_DATA_AVAIL, -}; - struct mptcp_delegated_action { struct napi_struct napi; struct list_head head; @@ -492,7 +487,7 @@ struct mptcp_subflow_context { valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ __unused : 9; - enum mptcp_data_avail data_avail; + bool data_avail; bool scheduled; u32 remote_nonce; u64 thmac; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9c1f8d1d63d2..dbc7a52b322f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1237,7 +1237,7 @@ static bool subflow_check_data_avail(struct sock *ssk) struct sk_buff *skb; =20 if (!skb_peek(&ssk->sk_receive_queue)) - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + WRITE_ONCE(subflow->data_avail, false); if (subflow->data_avail) 
return true; =20 @@ -1271,7 +1271,7 @@ static bool subflow_check_data_avail(struct sock *ssk) continue; } =20 - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + WRITE_ONCE(subflow->data_avail, true); break; } return true; @@ -1293,7 +1293,7 @@ static bool subflow_check_data_avail(struct sock *ssk) goto reset; } mptcp_subflow_fail(msk, ssk); - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + WRITE_ONCE(subflow->data_avail, true); return true; } =20 @@ -1310,7 +1310,7 @@ static bool subflow_check_data_avail(struct sock *ssk) while ((skb =3D skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + WRITE_ONCE(subflow->data_avail, false); return false; } =20 @@ -1322,7 +1322,7 @@ static bool subflow_check_data_avail(struct sock *ssk) subflow->map_seq =3D READ_ONCE(msk->ack_seq); subflow->map_data_len =3D skb->len; subflow->map_subflow_seq =3D tcp_sk(ssk)->copied_seq - subflow->ssn_offse= t; - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + WRITE_ONCE(subflow->data_avail, true); return true; } =20 @@ -1334,7 +1334,7 @@ bool mptcp_subflow_data_available(struct sock *sk) if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >=3D subflow->map_data_len) { subflow->map_valid =3D 0; - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + WRITE_ONCE(subflow->data_avail, false); =20 pr_debug("Done with mapping: seq=3D%u data_len=3D%u", subflow->map_subflow_seq, --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id ABB221C6BD; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org 
header.b="oe+NX9Pq" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 16A60C433AB; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093934; bh=U+qfHVKzSdXpb5Nt3mGZCOI198lAElT+RKzcKjn/PiY=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=oe+NX9Pqh48cHlP94ZQURZ6ZFJY+eg6/vpMO1JiIW+OcOZbq97x0xz9jlo14Au0HJ 6peL/N3oX1U0H61PYnv/+Xmmt5N74v0NQuthBMYTMPFRNmVshW7HZRMQl5UeIv92Sq BbiHXYZsnGd5KLNUcHR9wS5Os9PrClNUBpnLWZ5KXHq9Wa9qnlIAuevey0xsK7q6bE KsZzdiuHQpU2GKYjxIEHjTXnSgUZOFmwQezaXFOxYLgJRh9v6T1nhw+s7qpwa8Spse oMAZjfXr5BCp/xUZLMIDQfKtHJt7UlUS7oLxUVwnAO9NAoQmUOX0Tl2HSW921LtxzO zrHspQP60CAbQ== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:37 -0700 Subject: [PATCH net-next 4/9] tcp: define initial scaling factor value as a macro Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-4-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni So that other users could access it. Notably MPTCP will use it in the next patch. No functional change intended. 
Acked-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- include/net/tcp.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 39b731c900dd..993b7fcd4e46 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1489,13 +1489,15 @@ static inline int tcp_space_from_win(const struct s= ock *sk, int win) return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } =20 +/* Assume a conservative default of 1200 bytes of payload per 4K page. + * This may be adjusted later in tcp_measure_rcv_mss(). + */ +#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \ + SKB_TRUESIZE(4096)) + static inline void tcp_scaling_ratio_init(struct sock *sk) { - /* Assume a conservative default of 1200 bytes of payload per 4K page. - * This may be adjusted later in tcp_measure_rcv_mss(). - */ - tcp_sk(sk)->scaling_ratio =3D (1200 << TCP_RMEM_TO_WIN_SCALE) / - SKB_TRUESIZE(4096); + tcp_sk(sk)->scaling_ratio =3D TCP_DEFAULT_SCALING_RATIO; } =20 /* Note: caller must be prepared to deal with negative returns */ --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 880F812B78; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="F8cxjnQG" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4A5EEC433CA; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093934; bh=eOi8adMX++/v0fFQfsUELjT9lSsfKTGvSjwKfKyUgCA=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=F8cxjnQGAcNmU25kBBr5UtHt9dpsfoMlpmwYelN0KLGRmOW6+TgszRIn0iHMD9aDV 
Psf+tHd/gjPML1H6nDR8Vn1tDp7qy+rGsoOJGA4LecBiPo3gi54ebLSATiOAS0bxcN dquRh29eivsdVT5++dFXnWbeTERYFAXk7+MvJ44uYpcDnlrDaQCd21ywPcbBb26Ckm 5Za9asYLiDq4YxYa8NeRMO5+0epnm0QpN4kDZ0FouCid3vtmXEdr4nRoOV7Z9HoNHl 93LSS7CvmR25ngpX+QcVg2QNBhn3ya68F7cWFgtCRNXyw3QyK50vrqGvnzi3uMLKG2 TFVtCzY2ejbiQ== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:38 -0700 Subject: [PATCH net-next 5/9] mptcp: give rcvlowat some love Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-5-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni The MPTCP protocol allows setting sk_rcvlowat, but the value there is currently ignored. Additionally, the default subflows sk_rcvlowat basically disables per subflow delayed ack: the MPTCP protocol moves the incoming data from the subflows into the msk socket as soon as the TCP stack invokes the subflow data_ready callback. Later, when __tcp_ack_snd_check() takes action, the subflow-level copied_seq matches rcv_nxt, and that mandates an immediate ack. Let the mptcp receive path be aware of such threshold, explicitly tracking the amount of data available to be read and checking vs sk_rcvlowat in mptcp_poll() and before waking-up readers. Additionally implement the set_rcvlowat() callback, to properly handle the rcvbuf auto-tuning on sk_rcvlowat changes. Finally to properly handle delayed ack, force the subflow level threshold to 0 and instead explicitly ask for an immediate ack when the msk level threshold is not reached. 
Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/protocol.c | 24 +++++++++++------------- net/mptcp/protocol.h | 20 ++++++++++++++++++++ net/mptcp/sockopt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ net/mptcp/subflow.c | 12 ++++++++++-- 4 files changed, 83 insertions(+), 15 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a21f8ed26343..7036e30c449f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -863,9 +863,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) =20 /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk)) + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); - mptcp_data_unlock(sk); } =20 @@ -1922,6 +1921,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *ms= k, if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset +=3D count; MPTCP_SKB_CB(skb)->map_seq +=3D count; + msk->bytes_consumed +=3D count; } break; } @@ -1932,6 +1932,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *ms= k, WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); + msk->bytes_consumed +=3D count; } =20 if (copied >=3D len) @@ -2755,6 +2756,7 @@ static void __mptcp_init_sock(struct sock *sk) msk->rmem_fwd_alloc =3D 0; WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival =3D TCP_RTO_MIN; + msk->scaling_ratio =3D TCP_DEFAULT_SCALING_RATIO; =20 WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss =3D mptcp_sync_mss; @@ -2964,16 +2966,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk) __mptcp_destroy_sock(sk); } =20 -static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +static __poll_t mptcp_check_readable(struct sock *sk) { - /* Concurrent splices from sk_receive_queue into receive_queue will - * always show at least one non-empty queue when checked in this order. 
- */ - if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && - skb_queue_empty_lockless(&msk->receive_queue)) - return 0; - - return EPOLLIN | EPOLLRDNORM; + return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0; } =20 static void mptcp_check_listen_stop(struct sock *sk) @@ -3011,7 +3006,7 @@ bool __mptcp_close(struct sock *sk, long timeout) goto cleanup; } =20 - if (mptcp_check_readable(msk) || timeout < 0) { + if (mptcp_data_avail(msk) || timeout < 0) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ @@ -3138,6 +3133,7 @@ static int mptcp_disconnect(struct sock *sk, int flag= s) msk->snd_data_fin_enable =3D false; msk->rcv_fastclose =3D false; msk->use_64bit_ack =3D false; + msk->bytes_consumed =3D 0; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); @@ -3909,7 +3905,7 @@ static __poll_t mptcp_poll(struct file *file, struct = socket *sock, mask |=3D EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; =20 if (state !=3D TCP_SYN_SENT && state !=3D TCP_SYN_RECV) { - mask |=3D mptcp_check_readable(msk); + mask |=3D mptcp_check_readable(sk); if (shutdown & SEND_SHUTDOWN) mask |=3D EPOLLOUT | EPOLLWRNORM; else @@ -3947,6 +3943,7 @@ static const struct proto_ops mptcp_stream_ops =3D { .sendmsg =3D inet_sendmsg, .recvmsg =3D inet_recvmsg, .mmap =3D sock_no_mmap, + .set_rcvlowat =3D mptcp_set_rcvlowat, }; =20 static struct inet_protosw mptcp_protosw =3D { @@ -4048,6 +4045,7 @@ static const struct proto_ops mptcp_v6_stream_ops =3D= { #ifdef CONFIG_COMPAT .compat_ioctl =3D inet6_compat_ioctl, #endif + .set_rcvlowat =3D mptcp_set_rcvlowat, }; =20 static struct proto mptcp_v6_prot; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4c9e7ade160d..620a82cd4c10 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -267,6 +267,7 @@ struct mptcp_sock { atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; u64 
bytes_retrans; + u64 bytes_consumed; int rmem_fwd_alloc; int snd_burst; int old_wspace; @@ -657,6 +658,24 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_so= ck *msk); int mptcp_sched_get_send(struct mptcp_sock *msk); int mptcp_sched_get_retrans(struct mptcp_sock *msk); =20 +static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed); +} + +static inline bool mptcp_epollin_ready(const struct sock *sk) +{ + /* mptcp doesn't have to deal with small skbs in the receive queue, + * at it can always coalesce them + */ + return (mptcp_data_avail(mptcp_sk(sk)) >=3D sk->sk_rcvlowat) || + (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) || + READ_ONCE(tcp_memory_pressure); +} + +int mptcp_set_rcvlowat(struct sock *sk, int val); + static inline bool __tcp_can_send(const struct sock *ssk) { /* only send if our side has not closed yet */ @@ -731,6 +750,7 @@ static inline bool mptcp_is_fully_established(struct so= ck *sk) return inet_sk_state_load(sk) =3D=3D TCP_ESTABLISHED && READ_ONCE(mptcp_sk(sk)->fully_established); } + void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 59bd5e114392..d15891e23f45 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1472,9 +1472,51 @@ void mptcp_sockopt_sync_locked(struct mptcp_sock *ms= k, struct sock *ssk) =20 msk_owned_by_me(msk); =20 + ssk->sk_rcvlowat =3D 0; + if (READ_ONCE(subflow->setsockopt_seq) !=3D msk->setsockopt_seq) { sync_socket_options(msk, ssk); =20 subflow->setsockopt_seq =3D msk->setsockopt_seq; } } + +/* unfortunately this is different enough from the tcp version so + * that we can't factor it out + */ +int mptcp_set_rcvlowat(struct sock *sk, int val) +{ + struct mptcp_subflow_context *subflow; + int 
space, cap; + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + cap =3D sk->sk_rcvbuf >> 1; + else + cap =3D READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + val =3D min(val, cap); + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); + + /* Check if we need to signal EPOLLIN right now */ + if (mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + space =3D __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val); + if (space <=3D sk->sk_rcvbuf) + return 0; + + /* propagate the rcvbuf changes to all the subflows */ + WRITE_ONCE(sk->sk_rcvbuf, space); + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + struct sock *ssk =3D mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow =3D lock_sock_fast(ssk); + WRITE_ONCE(ssk->sk_rcvbuf, space); + tcp_sk(ssk)->window_clamp =3D val; + unlock_sock_fast(ssk, slow); + } + return 0; +} diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index dbc7a52b322f..080b16426222 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1405,10 +1405,18 @@ static void subflow_data_ready(struct sock *sk) WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); =20 - if (mptcp_subflow_data_available(sk)) + if (mptcp_subflow_data_available(sk)) { mptcp_data_ready(parent, sk); - else if (unlikely(sk->sk_err)) + + /* subflow-level lowat test are not relevant. 
+ * respect the msk-level threshold eventually mandating an immediate ack + */ + if (mptcp_data_avail(msk) < parent->sk_rcvlowat && + (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack= .rcv_mss) + inet_csk(sk)->icsk_ack.pending |=3D ICSK_ACK_NOW; + } else if (unlikely(sk->sk_err)) { subflow_error_report(sk); + } } =20 static void subflow_write_space(struct sock *ssk) --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B0BA71D699; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="MuZf/ooW" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7E656C433C8; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093934; bh=8wxbLJ82zryvI+nAd9SdZ5WA2SwnODFY54TmSNRVFHc=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=MuZf/ooWATbZqGzxeqHWTqnZGEh5pVW2FMB4GGNcZCWP9hyrg5Ee4gw5OrMUnAd8d VCB9qZuS33YFxgSaxCRGeG+RnE7zj8ysuDRmSV1v4/NY7cQGFMUHh8vkR4IJzC67Ns pYgdCduvSRZk4zvB/KGkwWQPExOT5TPQzao2tkz3U0d30Qp+0lZLj0RFP+XhBlUffR b/bbUK2nbQYv1TosseAMJwdRfDT3Fm5Ab7UEy+RxHNFht4kPMAE3G39sgQlvte5dkr oTSfW65swyjV4VnrGxx27d4k28uz/YWUEMeIzrdPIIqFFXy598wQ6gOcVaRF9Z64QZ TkIOsOxZPJG0g== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:39 -0700 Subject: [PATCH net-next 6/9] mptcp: use copy_from_iter helpers on transmit Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-6-9dc60939d371@kernel.org> References: 
<20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni The perf traces show a high cost for the MPTCP transmit path memcpy. It turns out that the helper currently in use carries quite a bit of unneeded overhead, e.g. to map/unmap the memory pages. Moving to the 'copy_from_iter' variant removes such overhead and additionally gains the no-cache support. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/protocol.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7036e30c449f..5489f024dd7e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1760,6 +1760,18 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, s= truct msghdr *msg, return ret; } =20 +static int do_copy_data_nocache(struct sock *sk, int copy, + struct iov_iter *from, char *to) +{ + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { + if (!copy_from_iter_full_nocache(to, copy, from)) + return -EFAULT; + } else if (!copy_from_iter_full(to, copy, from)) { + return -EFAULT; + } + return 0; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk =3D mptcp_sk(sk); @@ -1833,11 +1845,10 @@ static int mptcp_sendmsg(struct sock *sk, struct ms= ghdr *msg, size_t len) if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; =20 - if (copy_page_from_iter(dfrag->page, offset, psize, - &msg->msg_iter) !=3D psize) { - ret =3D -EFAULT; + ret =3D do_copy_data_nocache(sk, psize, &msg->msg_iter, + page_address(dfrag->page) + offset); + if (ret) goto do_error; - } =20 /* data successfully copied into the write queue */ sk_forward_alloc_add(sk, -total_ts); --=20 
2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 555F921367; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="I1wWymDL" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B0DC5C433BF; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093934; bh=nyM3qAj+dX1T3E9H0rifWU8fe8+YDlZJN+C5xNnbe3k=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=I1wWymDL3fU0T+6vNwsjqY0DkcksJ9N4aeDmHX/AhfE48rRuu5PqJPgvdrTejr0tJ FUduwY8o4JXaqn/lQCiJcHmrftQmaWeRs56F3/Q/AbcaDRVaoqe1vhl4mDfE+drBkS Rqga52dP/JCnkpsJSdoGSrqY4m30iU4vbt23Pp9lfDY8zJY2O6MEG7/e5fJIKWw36K Fmsr9Y19FDES/dp8aWwjuLRK8g/sQOhpLAfJPy5b36gAwGrCYNPXjr2c2kB6kQ2/q6 cWm5pEt3zA9TuQy/E6+S8HV9M4pBy0ey9HJ+yXmK0o5qUfhWUHfbfS5M94By44F/Iq ui/CP9Li3iJIQ== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:40 -0700 Subject: [PATCH net-next 7/9] mptcp: consolidate sockopt synchronization Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-7-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni Move the socket option synchronization for active subflows at subflow creation time. 
This allows removing the now unused unlocked variant of that helper. While at it, clean up the mptcp_subflow_create_socket() error path a bit. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/protocol.c | 2 -- net/mptcp/sockopt.c | 22 ---------------------- net/mptcp/subflow.c | 18 +++++++++--------- 3 files changed, 9 insertions(+), 33 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5489f024dd7e..e44a3da12b96 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -121,8 +121,6 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) ret =3D __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); - - mptcp_sockopt_sync(msk, msk->first); } =20 return msk->first; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d15891e23f45..abf0645cb65d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1444,28 +1444,6 @@ static void sync_socket_options(struct mptcp_sock *m= sk, struct sock *ssk) inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); } =20 -static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) -{ - bool slow =3D lock_sock_fast(ssk); - - sync_socket_options(msk, ssk); - - unlock_sock_fast(ssk, slow); -} - -void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) -{ - struct mptcp_subflow_context *subflow =3D mptcp_subflow_ctx(ssk); - - msk_owned_by_me(msk); - - if (READ_ONCE(subflow->setsockopt_seq) !=3D msk->setsockopt_seq) { - __mptcp_sockopt_sync(msk, ssk); - - subflow->setsockopt_seq =3D msk->setsockopt_seq; - } -} - void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow =3D mptcp_subflow_ctx(ssk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 080b16426222..df208666fd19 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1533,8 +1533,6 @@ int __mptcp_subflow_connect(struct sock *sk, const st= ruct mptcp_addr_info *loc, if 
(addr.ss_family =3D=3D AF_INET6) addrlen =3D sizeof(struct sockaddr_in6); #endif - mptcp_sockopt_sync(msk, ssk); - ssk->sk_bound_dev_if =3D ifindex; err =3D kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) @@ -1645,7 +1643,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsi= gned short family, =20 err =3D security_mptcp_add_subflow(sk, sf->sk); if (err) - goto release_ssk; + goto err_free; =20 /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); @@ -1659,15 +1657,12 @@ int mptcp_subflow_create_socket(struct sock *sk, un= signed short family, get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); err =3D tcp_set_ulp(sf->sk, "mptcp"); + if (err) + goto err_free; =20 -release_ssk: + mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); =20 - if (err) { - sock_release(sf); - return err; - } - /* the newly created socket really belongs to the owning MPTCP master * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. 
Adjust inode references, so that the @@ -1687,6 +1682,11 @@ int mptcp_subflow_create_socket(struct sock *sk, uns= igned short family, mptcp_subflow_ops_override(sf->sk); =20 return 0; + +err_free: + release_sock(sf->sk); + sock_release(sf); + return err; } =20 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 77289219E2; Mon, 23 Oct 2023 20:45:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Do62zeQm" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E41FDC433BC; Mon, 23 Oct 2023 20:45:34 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093935; bh=w1fB4DHVnccpvzxFWt0SdPjP8cvBQUJXUy4NYZuGi0w=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=Do62zeQmroTZ91ppg2Lki/Efs1Ru+VqT4G/fqxGibUw6zQpQwzAboC51DT/cp71ZU b7ErTNX+DtssNYZUFGC5xvv3LXcbvnh/9L6xS+GbLxx1sCAn2JKx32E8cIghPoHCYD 1Nx60DzBtGEPn5D2j6ptJmaUrGQK4I88XRL6ag6p42bARW4gmObRH42+IyfSfw2gPV 6LXvLAA05vw1Psj1X3DeXQkvi3cvcuujv2deeLO5gTgJYBu56NhFZnkuzbHeRwkkwG FM8DUmmemR40rfBAyfWuShdkPlzP3D7QhiznxA7A1vD6llQ+7XxDH3pJnvpg4cqmel Zpfs7SyFBUHow== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:41 -0700 Subject: [PATCH net-next 8/9] mptcp: ignore notsent_lowat setting at the subflow level Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-8-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: 
<20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni Any latency-related tuning taking effect at the subflow level does not really affect user-space, as only the main MPTCP socket is relevant. However, any limiting setting may foul up the MPTCP scheduler, not being able to fully use the subflow-level cwnd, leading to very poor bandwidth usage. Enforce notsent_lowat to be a no-op on every subflow. Note that TCP_NOTSENT_LOWAT is currently not supported, and properly dealing with that will require more invasive changes. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/sockopt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index abf0645cb65d..72858d7d8974 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1452,6 +1452,12 @@ void mptcp_sockopt_sync_locked(struct mptcp_sock *ms= k, struct sock *ssk) =20 ssk->sk_rcvlowat =3D 0; =20 + /* subflows must ignore any latency-related settings: will not affect + * the user-space - only the msk is relevant - but will foul the + * mptcp scheduler + */ + tcp_sk(ssk)->notsent_lowat =3D UINT_MAX; + if (READ_ONCE(subflow->setsockopt_seq) !=3D msk->setsockopt_seq) { sync_socket_options(msk, ssk); =20 --=20 2.41.0 From nobody Sat May 11 15:02:39 2024 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 702A2219E0; Mon, 23 Oct 2023 20:45:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="htCJgBvM" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 
24521C116B1; Mon, 23 Oct 2023 20:45:35 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1698093935; bh=4n1Y2BDxRyW/MG048kYjDc3InJ3vd1rXw6LB824uI+E=; h=From:Date:Subject:References:In-Reply-To:To:Cc:From; b=htCJgBvMjipaNJj8ae+cOFW6SBQs0V8rcU6TpmBzcYQhNPM6wT/adQnHqNDUXHTE2 xn8FE2yAC2LIRZkFpmPdA0XvSqguPo651BuSgDzmwf2qcQgnEzC7VcMnMtLPCtnSZw caNB25bXw9y76faty+z4MSxQ0jdqF5ye2oo5UPmMW/ju/W2U5DoXTJAtyWoKXxAZfb S2OuuTvpsCsZn2wmNChjLWwvQi2gTrFmBhGRc+btT7ofNb2ipma+yT+ddq6hPxO0lu B81MBlamNNaFsnia9AhWBQRIWrKBpHs4zuWEDn00MT9J4dtJlSphZSc2nOuuVGETMB RFNwi7qM0qSjA== From: Mat Martineau Date: Mon, 23 Oct 2023 13:44:42 -0700 Subject: [PATCH net-next 9/9] mptcp: refactor sndbuf auto-tuning Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20231023-send-net-next-20231023-2-v1-9-9dc60939d371@kernel.org> References: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> In-Reply-To: <20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org> To: Matthieu Baerts , "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni Cc: netdev@vger.kernel.org, mptcp@lists.linux.dev, Mat Martineau X-Mailer: b4 0.12.4 From: Paolo Abeni The MPTCP protocol accounts for the data enqueued on all the subflows to the main socket send buffer, while the send buffer auto-tuning algorithm sets the main socket send buffer size as the max size among the subflows. That causes bad performance when at least one subflow is sndbuf limited, e.g. due to very high latency, as the MPTCP scheduler can't even fill that buffer. Change the send-buffer auto-tuning algorithm to compute the main socket send buffer size as the sum of all the subflows' buffer sizes. 
Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau --- net/mptcp/protocol.c | 18 ++++++++++++++++-- net/mptcp/protocol.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++-= ---- net/mptcp/sockopt.c | 5 ++++- net/mptcp/subflow.c | 3 +-- 4 files changed, 70 insertions(+), 10 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e44a3da12b96..1dacc072dcca 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -890,6 +890,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk,= struct sock *ssk) mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); mptcp_stop_tout_timer(sk); + __mptcp_propagate_sndbuf(sk, ssk); return true; } =20 @@ -1076,15 +1077,16 @@ static void mptcp_enter_memory_pressure(struct sock= *sk) struct mptcp_sock *msk =3D mptcp_sk(sk); bool first =3D true; =20 - sk_stream_moderate_sndbuf(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk =3D mptcp_subflow_tcp_sock(subflow); =20 if (first) tcp_enter_memory_pressure(ssk); sk_stream_moderate_sndbuf(ssk); + first =3D false; } + __mptcp_sync_sndbuf(sk); } =20 /* ensure we get enough memory for the frag hdr, beyond some minimal amoun= t of @@ -2458,6 +2460,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct= sock *ssk, WRITE_ONCE(msk->first, NULL); =20 out: + __mptcp_sync_sndbuf(sk); if (need_push) __mptcp_push_pending(sk, 0); =20 @@ -3224,7 +3227,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *s= k, * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); - mptcp_propagate_sndbuf(nsk, ssk); + __mptcp_propagate_sndbuf(nsk, ssk); =20 mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(nsk); @@ -3402,6 +3405,8 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); + if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) + __mptcp_sync_sndbuf(sk); } =20 __mptcp_update_rmem(sk); @@ -3446,6 
+3451,14 @@ void mptcp_subflow_process_delegated(struct sock *ss= k, long status) __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } + if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_sync_sndbuf(sk); + else + __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); + mptcp_data_unlock(sk); + } if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); } @@ -3530,6 +3543,7 @@ bool mptcp_finish_join(struct sock *ssk) /* active subflow, already present inside the conn_list */ if (!list_empty(&subflow->node)) { mptcp_subflow_joined(msk, ssk); + mptcp_propagate_sndbuf(parent, ssk); return true; } =20 diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 620a82cd4c10..296d01965943 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -123,6 +123,7 @@ #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_CONNECTED 6 +#define MPTCP_SYNC_SNDBUF 7 =20 struct mptcp_skb_cb { u64 map_seq; @@ -443,6 +444,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_de= legated_actions); #define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_ACK 2 +#define MPTCP_DELEGATE_SNDBUF 3 =20 #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ @@ -516,6 +518,9 @@ struct mptcp_subflow_context { =20 u32 setsockopt_seq; u32 stale_rcv_tstamp; + int cached_sndbuf; /* sndbuf size when last synced with the msk s= ndbuf, + * protected by the msk socket lock + */ =20 struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ @@ -778,13 +783,52 @@ static inline bool mptcp_data_fin_enabled(const struc= t mptcp_sock *msk) READ_ONCE(msk->write_seq) =3D=3D READ_ONCE(msk->snd_nxt); } =20 -static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ss= k) +static inline void __mptcp_sync_sndbuf(struct sock *sk) { - if ((sk->sk_userlocks & 
SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <=3D READ_ONC= E(sk->sk_sndbuf)) - return false; + struct mptcp_subflow_context *subflow; + int ssk_sndbuf, new_sndbuf; + + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return; + + new_sndbuf =3D sock_net(sk)->ipv4.sysctl_tcp_wmem[0]; + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + ssk_sndbuf =3D READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf); + + subflow->cached_sndbuf =3D ssk_sndbuf; + new_sndbuf +=3D ssk_sndbuf; + } + + /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */ + WRITE_ONCE(sk->sk_sndbuf, new_sndbuf); +} + +/* The caller holds both the msk socket and the subflow socket locks, + * possibly under BH + */ +static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *= ssk) +{ + struct mptcp_subflow_context *subflow =3D mptcp_subflow_ctx(ssk); + + if (READ_ONCE(ssk->sk_sndbuf) !=3D subflow->cached_sndbuf) + __mptcp_sync_sndbuf(sk); +} + +/* the caller holds only the subflow socket lock, either in process or + * BH context. Additionally this can be called under the msk data lock, + * so we can't acquire such lock here: let the delegated action acquire + * the needed locks in a suitable order. 
+ */ +static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ss= k) +{ + struct mptcp_subflow_context *subflow =3D mptcp_subflow_ctx(ssk); + + if (likely(READ_ONCE(ssk->sk_sndbuf) =3D=3D subflow->cached_sndbuf)) + return; =20 - WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); - return true; + local_bh_disable(); + mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF); + local_bh_enable(); } =20 static inline void mptcp_write_space(struct sock *sk) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 72858d7d8974..574e221bb765 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -95,6 +95,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_soc= k *msk, int optname, in case SO_SNDBUFFORCE: ssk->sk_userlocks |=3D SOCK_SNDBUF_LOCK; WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + mptcp_subflow_ctx(ssk)->cached_sndbuf =3D sk->sk_sndbuf; break; case SO_RCVBUF: case SO_RCVBUFFORCE: @@ -1415,8 +1416,10 @@ static void sync_socket_options(struct mptcp_sock *m= sk, struct sock *ssk) =20 if (sk->sk_userlocks & tx_rx_locks) { ssk->sk_userlocks |=3D sk->sk_userlocks & tx_rx_locks; - if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) { WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + mptcp_subflow_ctx(ssk)->cached_sndbuf =3D sk->sk_sndbuf; + } if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index df208666fd19..2b43577f952e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -421,6 +421,7 @@ static bool subflow_use_different_dport(struct mptcp_so= ck *msk, const struct soc =20 void __mptcp_set_connected(struct sock *sk) { + __mptcp_propagate_sndbuf(sk, mptcp_sk(sk)->first); if (sk->sk_state =3D=3D TCP_SYN_SENT) { inet_sk_state_store(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); @@ -472,7 +473,6 @@ static void subflow_finish_connect(struct sock *sk, con= st struct sk_buff *skb) return; =20 msk =3D mptcp_sk(parent); - 
mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq =3D 1; subflow->conn_finished =3D 1; subflow->ssn_offset =3D TCP_SKB_CB(skb)->seq; @@ -1736,7 +1736,6 @@ static void subflow_state_change(struct sock *sk) =20 msk =3D mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { - mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); mptcp_rcv_space_init(msk, sk); pr_fallback(msk); --=20 2.41.0