[PATCH v2 mptcp-next 3/4] mptcp: do not block subflows creation on errors

Paolo Abeni posted 4 patches 4 years, 2 months ago
Maintainers: Jakub Kicinski <kuba@kernel.org>, Mat Martineau <mathew.j.martineau@linux.intel.com>, Shuah Khan <shuah@kernel.org>, "David S. Miller" <davem@davemloft.net>, Matthieu Baerts <matthieu.baerts@tessares.net>
There is a newer version of this series
[PATCH v2 mptcp-next 3/4] mptcp: do not block subflows creation on errors
Posted by Paolo Abeni 4 years, 2 months ago
If the MPTCP configuration allows for multiple subflows
creation, and the first additional subflows never reach
the fully established status - e.g. due to packets drop or
reset - the in kernel path manager do not move to the
next subflow.

This patch introduces a new PM helper to cope with MPJ
subflow creation failure and delay and hook it where appropriate.

Such helper triggers additional subflow creation, as needed
and updates the PM subflow counter, if the current one is
closing.

Note that we don't need an additional timer to catch timeout
and/or long delay in connection completion: we just need to
measure the time elapsed since the subflow creation every
time we emit an MPJ sub-option.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v2 -> v3:
 - fix compile warning (CI)

v1 -> v2:
 - explicitly hook on subflow close instead of error notification
 - fix checkpatch issue (Mat)
 - introduce and use a new pm helper to cope with subflow close
 - update pm subflow counters on close
---
 net/mptcp/options.c  | 21 ++++++++++++++++++---
 net/mptcp/pm.c       | 27 +++++++++++++++++++++++++--
 net/mptcp/protocol.c |  6 ++++++
 net/mptcp/protocol.h |  4 +++-
 net/mptcp/subflow.c  |  5 ++++-
 5 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 7a6a39b71633..00a8697addcd 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1257,10 +1257,10 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 			 struct mptcp_out_options *opts)
 {
-	if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
-		const struct sock *ssk = (const struct sock *)tp;
-		struct mptcp_subflow_context *subflow;
+	const struct sock *ssk = (const struct sock *)tp;
+	struct mptcp_subflow_context *subflow;
 
+	if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
 		subflow = mptcp_subflow_ctx(ssk);
 		subflow->send_mp_fail = 0;
 
@@ -1382,6 +1382,21 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 		/* MPC is additionally mutually exclusive with MP_PRIO */
 		goto mp_capable_done;
 	} else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
+		if (ssk) {
+			/* we are still in the MPJ handshake and "a lot" of time passed
+			 * e.g. due to syn retranmissions. We can attempt next
+			 * subflow creation
+			 */
+			subflow = mptcp_subflow_ctx(ssk);
+			if (subflow->start_stamp &&
+			    unlikely(after(tcp_jiffies32, subflow->start_stamp + HZ / 10))) {
+				mptcp_pm_subflow_check_next(mptcp_sk(subflow->conn), ssk, subflow);
+
+				/* avoid triggering the PM multiple times due to timeout */
+				subflow->start_stamp = 0;
+			}
+		}
+
 		if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
 			*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
 					      TCPOLEN_MPTCP_MPJ_SYN,
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 1ff856dd92c4..df8a596cd3ff 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -172,9 +172,32 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk)
 	spin_unlock_bh(&pm->lock);
 }
 
-void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+				 const struct mptcp_subflow_context *subflow)
 {
-	pr_debug("msk=%p", msk);
+	struct mptcp_pm_data *pm = &msk->pm;
+	bool closed;
+
+	closed = ssk->sk_state == TCP_CLOSE;
+	if (subflow->fully_established && !closed)
+		return;
+
+	spin_lock_bh(&pm->lock);
+	if (closed) {
+		pm->local_addr_used--;
+		pm->subflows--;
+		/* do not enable the pm worker: we don't want to pick again
+		 * the just closed subflow
+		 */
+	}
+
+	/* Even if this subflow is not really established, tell the PM to try
+	 * to pick the next one, if possible.
+	 */
+	if (pm->work_pending)
+		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+	spin_unlock_bh(&pm->lock);
 }
 
 void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ef2125798e64..2ee2fe97c553 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2351,6 +2351,12 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 {
 	if (sk->sk_state == TCP_ESTABLISHED)
 		mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
+
+	/* subflow aborted before reaching the fully_established status
+	 * attempt the creation of the next subflow
+	 */
+	mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+
 	__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
 }
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d384f285b6c1..4da1ef964b88 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -433,6 +433,7 @@ struct mptcp_subflow_context {
 		stale : 1;	    /* unable to snd/rcv data, do not use for xmit */
 	enum mptcp_data_avail data_avail;
 	u32	remote_nonce;
+	u32	start_stamp;
 	u64	thmac;
 	u32	local_nonce;
 	u32	remote_token;
@@ -733,7 +734,8 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk,
 bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
 void mptcp_pm_connection_closed(struct mptcp_sock *msk);
 void mptcp_pm_subflow_established(struct mptcp_sock *msk);
-void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+				 const struct mptcp_subflow_context *subflow);
 void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
 				const struct mptcp_addr_info *addr);
 void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 0f90bd61de01..76556743e952 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1273,7 +1273,8 @@ void __mptcp_error_report(struct sock *sk)
 
 static void subflow_error_report(struct sock *ssk)
 {
-	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	struct sock *sk = subflow->conn;
 
 	mptcp_data_lock(sk);
 	if (!sock_owned_by_user(sk))
@@ -1444,6 +1445,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
 	subflow->remote_id = remote_id;
 	subflow->request_join = 1;
 	subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+	subflow->start_stamp = tcp_jiffies32;
 	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
 
 	mptcp_add_pending_subflow(msk, subflow);
@@ -1461,6 +1463,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
 	spin_lock_bh(&msk->join_list_lock);
 	list_del(&subflow->node);
 	spin_unlock_bh(&msk->join_list_lock);
+	mptcp_pm_subflow_check_next(msk, ssk, subflow);
 	sock_put(mptcp_subflow_tcp_sock(subflow));
 
 failed:
-- 
2.33.1


Re: [PATCH v2 mptcp-next 3/4] mptcp: do not block subflows creation on errors
Posted by Matthieu Baerts 4 years, 2 months ago
Hi Paolo,

On 26/11/2021 13:19, Paolo Abeni wrote:
> If the MPTCP configuration allows for multiple subflows
> creation, and the first additional subflows never reach
> the fully established status - e.g. due to packets drop or
> reset - the in kernel path manager do not move to the
> next subflow.
> 
> This patch introduces a new PM helper to cope with MPJ
> subflow creation failure and delay and hook it where appropriate.
> 
> Such helper triggers additional subflow creation, as needed
> and updates the PM subflow counter, if the current one is
> closing.
> 
> Note that we don't need an additional timer to catch timeout
> and/or long delay in connection completion: we just need to
> measure the time elapsed since the subflow creation every
> time we emit an MPJ sub-option.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v2 -> v3:
>  - fix compile warning (CI)
> 
> v1 -> v2:
>  - explicitly hook on subflow close instead of error notification
>  - fix checkpatch issue (Mat)

(+t :-P )

>  - introduce and use a new pm helper to cope with subflow close
>  - update pm subflow counters on close

Thank you for the new version!

> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 7a6a39b71633..00a8697addcd 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -1382,6 +1382,21 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
>  		/* MPC is additionally mutually exclusive with MP_PRIO */
>  		goto mp_capable_done;
>  	} else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
> +		if (ssk) {
> +			/* we are still in the MPJ handshake and "a lot" of time passed
> +			 * e.g. due to syn retranmissions. We can attempt next
> +			 * subflow creation
> +			 */
> +			subflow = mptcp_subflow_ctx(ssk);
> +			if (subflow->start_stamp &&
> +			    unlikely(after(tcp_jiffies32, subflow->start_stamp + HZ / 10))) {

As discussed at the last meeting, we will reach this code path when
retransmitting the SYN, one second after having sent the initial one. In
this case, we might not need to keep 'start_stamp'.

But we can improve that in a new version after.

> +				mptcp_pm_subflow_check_next(mptcp_sk(subflow->conn), ssk, subflow);
> +
> +				/* avoid triggering the PM multiple times due to timeout */
> +				subflow->start_stamp = 0;
> +			}
> +		}
> +
>  		if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
>  			*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
>  					      TCPOLEN_MPTCP_MPJ_SYN,
> diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
> index 1ff856dd92c4..df8a596cd3ff 100644
> --- a/net/mptcp/pm.c
> +++ b/net/mptcp/pm.c
> @@ -172,9 +172,32 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk)
>  	spin_unlock_bh(&pm->lock);
>  }
>  
> -void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
> +void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
> +				 const struct mptcp_subflow_context *subflow)

Detail: do we need to pass the 'msk'? Can we not get it from 'subflow'?

>  {
> -	pr_debug("msk=%p", msk);
> +	struct mptcp_pm_data *pm = &msk->pm;
> +	bool closed;
> +
> +	closed = ssk->sk_state == TCP_CLOSE;
> +	if (subflow->fully_established && !closed)
> +		return;
> +
> +	spin_lock_bh(&pm->lock);
> +	if (closed) {
> +		pm->local_addr_used--;
> +		pm->subflows--;
> +		/* do not enable the pm worker: we don't want to pick again
> +		 * the just closed subflow
> +		 */
> +	}
> +
> +	/* Even if this subflow is not really established, tell the PM to try
> +	 * to pick the next one, if possible.
> +	 */
> +	if (pm->work_pending)

Does it not need to be "protected" with a READ_ONCE?

> +		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
> +
> +	spin_unlock_bh(&pm->lock);
>  }
>  
>  void mptcp_pm_add_addr_received(struct mptcp_sock *msk,

(not related to this line)

Did you see that the "fastclose" packetdrill tests are failing? Is it
due to these modifications?

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net

Re: [PATCH v2 mptcp-next 3/4] mptcp: do not block subflows creation on errors
Posted by Paolo Abeni 4 years, 2 months ago
On Fri, 2021-11-26 at 18:53 +0100, Matthieu Baerts wrote:
> Hi Paolo,
> 
> On 26/11/2021 13:19, Paolo Abeni wrote:
> > If the MPTCP configuration allows for multiple subflows
> > creation, and the first additional subflows never reach
> > the fully established status - e.g. due to packets drop or
> > reset - the in kernel path manager do not move to the
> > next subflow.
> > 
> > This patch introduces a new PM helper to cope with MPJ
> > subflow creation failure and delay and hook it where appropriate.
> > 
> > Such helper triggers additional subflow creation, as needed
> > and updates the PM subflow counter, if the current one is
> > closing.
> > 
> > Note that we don't need an additional timer to catch timeout
> > and/or long delay in connection completion: we just need to
> > measure the time elapsed since the subflow creation every
> > time we emit an MPJ sub-option.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > v2 -> v3:
> >  - fix compile warning (CI)
> > 
> > v1 -> v2:
> >  - explicitly hook on subflow close instead of error notification
> >  - fix checkpatch issue (Mat)
> 
> (+t :-P )
> 
> >  - introduce and use a new pm helper to cope with subflow close
> >  - update pm subflow counters on close
> 
> Thank you for the new version!
> 
> > diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> > index 7a6a39b71633..00a8697addcd 100644
> > --- a/net/mptcp/options.c
> > +++ b/net/mptcp/options.c
> > @@ -1382,6 +1382,21 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
> >  		/* MPC is additionally mutually exclusive with MP_PRIO */
> >  		goto mp_capable_done;
> >  	} else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
> > +		if (ssk) {
> > +			/* we are still in the MPJ handshake and "a lot" of time passed
> > +			 * e.g. due to syn retranmissions. We can attempt next
> > +			 * subflow creation
> > +			 */
> > +			subflow = mptcp_subflow_ctx(ssk);
> > +			if (subflow->start_stamp &&
> > +			    unlikely(after(tcp_jiffies32, subflow->start_stamp + HZ / 10))) {
> 
> As discussed at the last meeting, we will reach this code path when
> retransmitting the SYN, one second after having sent the initial one. In
> this case, we might not need to keep 'start_stamp'.
> 
> But we can improve that in a new version after.
> 
> > +				mptcp_pm_subflow_check_next(mptcp_sk(subflow->conn), ssk, subflow);
> > +
> > +				/* avoid triggering the PM multiple times due to timeout */
> > +				subflow->start_stamp = 0;
> > +			}
> > +		}
> > +
> >  		if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
> >  			*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
> >  					      TCPOLEN_MPTCP_MPJ_SYN,
> > diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
> > index 1ff856dd92c4..df8a596cd3ff 100644
> > --- a/net/mptcp/pm.c
> > +++ b/net/mptcp/pm.c
> > @@ -172,9 +172,32 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk)
> >  	spin_unlock_bh(&pm->lock);
> >  }
> >  
> > -void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
> > +void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
> > +				 const struct mptcp_subflow_context *subflow)
> 
> Detail: do we need to pass the 'msk'? Can we not get it from 'subflow'?

The caler has the msk handy, passing it to the helper keeps the args
list similar to other releated helpers and avoid some more dup code.

> >  {
> > -	pr_debug("msk=%p", msk);
> > +	struct mptcp_pm_data *pm = &msk->pm;
> > +	bool closed;
> > +
> > +	closed = ssk->sk_state == TCP_CLOSE;
> > +	if (subflow->fully_established && !closed)
> > +		return;
> > +
> > +	spin_lock_bh(&pm->lock);
> > +	if (closed) {
> > +		pm->local_addr_used--;
> > +		pm->subflows--;
> > +		/* do not enable the pm worker: we don't want to pick again
> > +		 * the just closed subflow
> > +		 */
> > +	}
> > +
> > +	/* Even if this subflow is not really established, tell the PM to try
> > +	 * to pick the next one, if possible.
> > +	 */
> > +	if (pm->work_pending)
> 
> Does it not need to be "protected" with a READ_ONCE?

AFAIK not needed: this is the only read issued on such field in this
scope and and is done under the relevant lock (all write operation
occours under the pm lock).

> > +		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
> > +
> > +	spin_unlock_bh(&pm->lock);
> >  }
> >  
> >  void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
> 
> (not related to this line)
> 
> Did you see that the "fastclose" packetdrill tests are failing? Is it
> due to these modifications?

uhm... I haven't check them. I'll have a look soon.

Thanks!

Paolo