[PATCH mptcp-next v7 2/5] mptcp: redundant subflows push pending
Posted by Geliang Tang 3 years, 2 months ago
This patch adds redundant subflow support to __mptcp_push_pending():
use the mptcp_sched_get_send() wrapper there instead of
mptcp_subflow_get_send().

Check each subflow's scheduled flag to see which subflow or subflows
the scheduler picked, and use them to send data.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/mptcp/protocol.c | 96 +++++++++++++++++++++++++-------------------
 1 file changed, 55 insertions(+), 41 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 043ac3f222ed..4f5e7f7fa6aa 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1551,58 +1551,72 @@ void mptcp_check_and_set_pending(struct sock *sk)
 
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
-	struct sock *prev_ssk = NULL, *ssk = NULL;
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct mptcp_sendmsg_info info = {
-				.flags = flags,
-	};
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_data_frag *dfrag;
 	int len, copied = 0;
+	int err = 0;
 
-	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
-		len = dfrag->data_len - dfrag->already_sent;
-		while (len > 0) {
-			int ret = 0;
-
-			prev_ssk = ssk;
-			ssk = mptcp_subflow_get_send(msk);
-
-			/* First check. If the ssk has changed since
-			 * the last round, release prev_ssk
-			 */
-			if (ssk != prev_ssk && prev_ssk)
-				mptcp_push_release(prev_ssk, &info);
-			if (!ssk)
-				goto out;
+	mptcp_sched_get_send(msk, &err);
+	if (err)
+		goto out;
 
-			/* Need to lock the new subflow only if different
-			 * from the previous one, otherwise we are still
-			 * helding the relevant lock
-			 */
-			if (ssk != prev_ssk)
-				lock_sock(ssk);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled)) {
+			struct sock *prev_ssk = NULL, *ssk = NULL;
+			struct mptcp_sendmsg_info info = {
+				.flags = flags,
+			};
+
+			while ((dfrag = mptcp_send_head(sk))) {
+				info.sent = dfrag->already_sent;
+				info.limit = dfrag->data_len;
+				len = dfrag->data_len - dfrag->already_sent;
+				while (len > 0) {
+					int ret = 0;
+
+					prev_ssk = ssk;
+					ssk = mptcp_subflow_tcp_sock(subflow);
+
+					/* First check. If the ssk has changed since
+					 * the last round, release prev_ssk
+					 */
+					if (ssk != prev_ssk && prev_ssk)
+						mptcp_push_release(prev_ssk, &info);
+					if (!ssk)
+						goto out;
+
+					/* Need to lock the new subflow only if different
+					 * from the previous one, otherwise we are still
+					 * helding the relevant lock
+					 */
+					if (ssk != prev_ssk)
+						lock_sock(ssk);
+
+					ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+					if (ret <= 0) {
+						mptcp_push_release(ssk, &info);
+						goto out;
+					}
+
+					info.sent += ret;
+					copied += ret;
+					len -= ret;
+
+					mptcp_update_post_push(msk, dfrag, ret);
+				}
+				WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+			}
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-			if (ret <= 0) {
+			/* at this point we held the socket lock for the last subflow we used */
+			if (ssk) {
 				mptcp_push_release(ssk, &info);
-				goto out;
+				msk->last_snd = ssk;
+				mptcp_subflow_set_scheduled(subflow, false);
 			}
-
-			info.sent += ret;
-			copied += ret;
-			len -= ret;
-
-			mptcp_update_post_push(msk, dfrag, ret);
 		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 	}
 
-	/* at this point we held the socket lock for the last subflow we used */
-	if (ssk)
-		mptcp_push_release(ssk, &info);
-
 out:
 	/* ensure the rtx timer is running */
 	if (!mptcp_timer_pending(sk))
-- 
2.35.3


Re: [PATCH mptcp-next v7 2/5] mptcp: redundant subflows push pending
Posted by Mat Martineau 3 years, 2 months ago
On Tue, 21 Jun 2022, Geliang Tang wrote:

> This patch adds redundant subflow support to __mptcp_push_pending():
> use the mptcp_sched_get_send() wrapper there instead of
> mptcp_subflow_get_send().
>
> Check each subflow's scheduled flag to see which subflow or subflows
> the scheduler picked, and use them to send data.
>
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> ---
> net/mptcp/protocol.c | 96 +++++++++++++++++++++++++-------------------
> 1 file changed, 55 insertions(+), 41 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 043ac3f222ed..4f5e7f7fa6aa 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -1551,58 +1551,72 @@ void mptcp_check_and_set_pending(struct sock *sk)
>
> void __mptcp_push_pending(struct sock *sk, unsigned int flags)
> {
> -	struct sock *prev_ssk = NULL, *ssk = NULL;
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> -	struct mptcp_sendmsg_info info = {
> -				.flags = flags,
> -	};
> +	struct mptcp_subflow_context *subflow;
> 	struct mptcp_data_frag *dfrag;
> 	int len, copied = 0;
> +	int err = 0;
>
> -	while ((dfrag = mptcp_send_head(sk))) {
> -		info.sent = dfrag->already_sent;
> -		info.limit = dfrag->data_len;
> -		len = dfrag->data_len - dfrag->already_sent;
> -		while (len > 0) {
> -			int ret = 0;
> -
> -			prev_ssk = ssk;
> -			ssk = mptcp_subflow_get_send(msk);
> -
> -			/* First check. If the ssk has changed since
> -			 * the last round, release prev_ssk
> -			 */
> -			if (ssk != prev_ssk && prev_ssk)
> -				mptcp_push_release(prev_ssk, &info);
> -			if (!ssk)
> -				goto out;
> +	mptcp_sched_get_send(msk, &err);
> +	if (err)
> +		goto out;
>
> -			/* Need to lock the new subflow only if different
> -			 * from the previous one, otherwise we are still
> -			 * helding the relevant lock
> -			 */
> -			if (ssk != prev_ssk)
> -				lock_sock(ssk);
> +	mptcp_for_each_subflow(msk, subflow) {
> +		if (READ_ONCE(subflow->scheduled)) {
> +			struct sock *prev_ssk = NULL, *ssk = NULL;
> +			struct mptcp_sendmsg_info info = {
> +				.flags = flags,
> +			};
> +
> +			while ((dfrag = mptcp_send_head(sk))) {

Hi Geliang -

Thanks for updating for v7. The changes you've made are definitely in the 
direction I was asking for, following the model of:

scheduler()
for_each_subflow():
    while(dfrag=...):
      send as much data on one subflow with one lock/unlock

I think there are some more modifications to make in order to send the 
same data on each subflow.
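
Expanded into C, that model is roughly the following; this is an
untested sketch, reusing the helper names from this patch:

	mptcp_sched_get_send(msk, &err);
	if (err)
		goto out;

	mptcp_for_each_subflow(msk, subflow) {
		struct mptcp_sendmsg_info info = {
			.flags = flags,
		};
		struct sock *ssk;

		if (!READ_ONCE(subflow->scheduled))
			continue;

		/* one lock/unlock per scheduled subflow */
		ssk = mptcp_subflow_tcp_sock(subflow);
		lock_sock(ssk);

		/* push as much pending data as possible while the
		 * subflow lock is held
		 */
		while ((dfrag = mptcp_send_head(sk))) {
			/* mptcp_sendmsg_frag() loop, as in the patch */
		}

		mptcp_push_release(ssk, &info);
		mptcp_subflow_set_scheduled(subflow, false);
	}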

> +				info.sent = dfrag->already_sent;
> +				info.limit = dfrag->data_len;
> +				len = dfrag->data_len - dfrag->already_sent;
> +				while (len > 0) {
> +					int ret = 0;
> +
> +					prev_ssk = ssk;
> +					ssk = mptcp_subflow_tcp_sock(subflow);
> +
> +					/* First check. If the ssk has changed since
> +					 * the last round, release prev_ssk
> +					 */
> +					if (ssk != prev_ssk && prev_ssk)
> +						mptcp_push_release(prev_ssk, &info);
> +					if (!ssk)
> +						goto out;
> +
> +					/* Need to lock the new subflow only if different
> +					 * from the previous one, otherwise we are still
> +					 * helding the relevant lock
> +					 */
> +					if (ssk != prev_ssk)
> +						lock_sock(ssk);
> +
> +					ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
> +					if (ret <= 0) {
> +						mptcp_push_release(ssk, &info);
> +						goto out;
> +					}
> +
> +					info.sent += ret;
> +					copied += ret;
> +					len -= ret;
> +
> +					mptcp_update_post_push(msk, dfrag, ret);
> +				}
> +				WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));

When msk->first_pending is updated here, it changes the value returned by 
mptcp_send_head() in the "while ((dfrag = mptcp_send_head(sk)))" loop for 
the next subflow.

So, when there are redundant subflows scheduled, the next subflow will 
send new data instead of redundant data. This happens because 
msk->first_pending is advanced here and dfrag->already_sent is updated 
in mptcp_update_post_push().
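
For reference, if I'm reading net/mptcp/protocol.h right,
mptcp_send_head() is just a read of msk->first_pending, so the next
subflow's dfrag loop starts at the advanced position:

	static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
	{
		const struct mptcp_sock *msk = mptcp_sk(sk);

		return READ_ONCE(msk->first_pending);
	}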

I think the fix is to keep the loop structure the same, but modify the 
"while ((dfrag = mptcp_send_head(sk)))" loop (rough sketch after the 
list):

* Every subflow starts sending from the same position in the same 
first dfrag

* Track the max sent data across all the subflows (similar to the 'max' 
tracking done in v6)

* Update dfrag->already_sent (for all sent dfrags) and msk->first_pending 
after all subflows have been attempted. This is currently done by 
mptcp_update_post_push() after each dfrag, but I think it can be delayed until 
all subflows have sent. It looks like the update to msk->snd_nxt can also 
be delayed until after all subflows have sent.

* Now that I've looked more closely at how mptcp_update_post_push() 
handles msk->snd_burst, I think msk->snd_burst should be set to 0 in 
mptcp_sched_data_init().
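
Roughly, something like this; untested, and push_one_subflow() is just 
a made-up placeholder for the per-subflow dfrag loop:

	u64 sent_max = 0;

	mptcp_for_each_subflow(msk, subflow) {
		u64 sent;

		if (!READ_ONCE(subflow->scheduled))
			continue;

		/* hypothetical helper: runs the dfrag loop starting
		 * again from the first pending dfrag, without touching
		 * dfrag->already_sent, msk->first_pending or
		 * msk->snd_nxt
		 */
		sent = push_one_subflow(sk, subflow, flags);
		sent_max = max(sent_max, sent);
	}

	/* apply the deferred updates once, based on the max data sent
	 * on any subflow: dfrag->already_sent for all sent dfrags,
	 * msk->first_pending and msk->snd_nxt
	 */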


This could send different-length mappings on each subflow; I need to 
double-check the RFC to see whether that's allowed.


> +			}
>
> -			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
> -			if (ret <= 0) {
> +			/* at this point we held the socket lock for the last subflow we used */
> +			if (ssk) {
> 				mptcp_push_release(ssk, &info);
> -				goto out;
> +				msk->last_snd = ssk;
> +				mptcp_subflow_set_scheduled(subflow, false);
> 			}
> -
> -			info.sent += ret;
> -			copied += ret;
> -			len -= ret;
> -
> -			mptcp_update_post_push(msk, dfrag, ret);
> 		}
> -		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
> 	}
>
> -	/* at this point we held the socket lock for the last subflow we used */
> -	if (ssk)
> -		mptcp_push_release(ssk, &info);
> -
> out:
> 	/* ensure the rtx timer is running */
> 	if (!mptcp_timer_pending(sk))
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel