[PATCH mptcp-net v2 2/5] mptcp: plug races between subflow fail and subflow creation

Posted by Paolo Abeni 2 months, 2 weeks ago
We have races similar to the one addressed by the previous patch,
between subflow failure and additional subflow creation; they are
just harder to trigger.

The solution is similar. Use a separate flag to track the condition
'the socket state prevents any additional subflow creation',
protected by the fallback lock.

Socket fallback makes the condition true, and so does receiving or
sending an MP_FAIL option.

The field 'allow_infinite_fallback' is now always touched under the
relevant lock, so we can drop the ONCE annotations on write.

Fixes: 478d770008b0 ("mptcp: send out MP_FAIL when data checksum fails")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/pm.c       |  8 +++++++-
 net/mptcp/protocol.c | 11 ++++++-----
 net/mptcp/protocol.h |  8 +++++++-
 net/mptcp/subflow.c  | 19 ++++++++++++++-----
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 25ce2d5135bc..687dbb59d084 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -765,8 +765,14 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
 
 	pr_debug("fail_seq=%llu\n", fail_seq);
 
-	if (!READ_ONCE(msk->allow_infinite_fallback))
+	/* After accepting the fail, we can't create any other subflows */
+	spin_lock_bh(&msk->fallback_lock);
+	if (!msk->allow_infinite_fallback) {
+		spin_unlock_bh(&msk->fallback_lock);
 		return;
+	}
+	msk->allow_subflows = false;
+	spin_unlock_bh(&msk->fallback_lock);
 
 	if (!subflow->fail_tout) {
 		pr_debug("send MP_FAIL response and infinite map\n");
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 38bb11865b50..71b258aebcd9 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -791,7 +791,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
 {
 	mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
-	WRITE_ONCE(msk->allow_infinite_fallback, false);
+	msk->allow_infinite_fallback = false;
 	mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
 }
 
@@ -803,7 +803,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
 		return false;
 
 	spin_lock_bh(&msk->fallback_lock);
-	if (__mptcp_check_fallback(msk)) {
+	if (!msk->allow_subflows) {
 		spin_unlock_bh(&msk->fallback_lock);
 		return false;
 	}
@@ -2625,7 +2625,7 @@ static void __mptcp_retrans(struct sock *sk)
 				len = max(copied, len);
 				tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
 					 info.size_goal);
-				WRITE_ONCE(msk->allow_infinite_fallback, false);
+				msk->allow_infinite_fallback = false;
 			}
 			spin_unlock_bh(&msk->fallback_lock);
 
@@ -2753,7 +2753,8 @@ static void __mptcp_init_sock(struct sock *sk)
 	WRITE_ONCE(msk->first, NULL);
 	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
 	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
-	WRITE_ONCE(msk->allow_infinite_fallback, true);
+	msk->allow_infinite_fallback = true;
+	msk->allow_subflows = true;
 	msk->recovery = false;
 	msk->subflow_id = 1;
 	msk->last_data_sent = tcp_jiffies32;
@@ -3549,7 +3550,7 @@ bool mptcp_finish_join(struct sock *ssk)
 	/* active subflow, already present inside the conn_list */
 	if (!list_empty(&subflow->node)) {
 		spin_lock_bh(&msk->fallback_lock);
-		if (__mptcp_check_fallback(msk)) {
+		if (!msk->allow_subflows) {
 			spin_unlock_bh(&msk->fallback_lock);
 			return false;
 		}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 52c9175c7e49..34c0863f1eae 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -348,11 +348,16 @@ struct mptcp_sock {
 		u64	rtt_us; /* last maximum rtt of subflows */
 	} rcvq_space;
 	u8		scaling_ratio;
+	bool		allow_subflows;
 
 	u32		subflow_id;
 	u32		setsockopt_seq;
 	char		ca_name[TCP_CA_NAME_MAX];
-	spinlock_t	fallback_lock;	/* protects fallback && allow_infinite_fallback */
+
+	spinlock_t	fallback_lock;	/* protects fallback,
+					 * allow_infinite_fallback and
+					 * allow_subflows
+					 */
 };
 
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -1227,6 +1232,7 @@ static inline bool __mptcp_do_fallback(struct mptcp_sock *msk)
 		return false;
 	}
 
+	msk->allow_subflows = false;
 	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
 	spin_unlock_bh(&msk->fallback_lock);
 	return true;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 433c0b25dbc1..11c5b042c2e1 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1302,20 +1302,29 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss
 		mptcp_schedule_work(sk);
 }
 
-static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	unsigned long fail_tout;
 
+	/* we are really failing, prevent any later subflow join */
+	spin_lock_bh(&msk->fallback_lock);
+	if (!msk->allow_infinite_fallback) {
+		spin_unlock_bh(&msk->fallback_lock);
+		return false;
+	}
+	msk->allow_subflows = false;
+	spin_unlock_bh(&msk->fallback_lock);
+
 	/* graceful failure can happen only on the MPC subflow */
 	if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
-		return;
+		return false;
 
 	/* since the close timeout take precedence on the fail one,
 	 * no need to start the latter when the first is already set
 	 */
 	if (sock_flag((struct sock *)msk, SOCK_DEAD))
-		return;
+		return true;
 
 	/* we don't need extreme accuracy here, use a zero fail_tout as special
 	 * value meaning no fail timeout at all;
@@ -1327,6 +1336,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
 	tcp_send_ack(ssk);
 
 	mptcp_reset_tout_timer(msk, subflow->fail_tout);
+	return true;
 }
 
 static bool subflow_check_data_avail(struct sock *ssk)
@@ -1387,12 +1397,11 @@ static bool subflow_check_data_avail(struct sock *ssk)
 		    (subflow->mp_join || subflow->valid_csum_seen)) {
 			subflow->send_mp_fail = 1;
 
-			if (!READ_ONCE(msk->allow_infinite_fallback)) {
+			if (!mptcp_subflow_fail(msk, ssk)) {
 				subflow->reset_transient = 0;
 				subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
 				goto reset;
 			}
-			mptcp_subflow_fail(msk, ssk);
 			WRITE_ONCE(subflow->data_avail, true);
 			return true;
 		}
-- 
2.49.0
Re: [PATCH mptcp-net v2 2/5] mptcp: plug races between subflow fail and subflow creation
Posted by Matthieu Baerts 2 months, 1 week ago
Hi Paolo,

On 05/07/2025 09:24, Paolo Abeni wrote:
> We have races similar to the one addressed by the previous patch,
> between subflow failure and additional subflow creation; they are
> just harder to trigger.
> 
> The solution is similar. Use a separate flag to track the condition
> 'the socket state prevents any additional subflow creation',
> protected by the fallback lock.
> 
> Socket fallback makes the condition true, and so does receiving or
> sending an MP_FAIL option.

Just to be sure, can we not rely on the MPTCP_FALLBACK_DONE flag for the
two cases?

In other words, do we not already have something set when
sending/receiving an MP_FAIL instead of adding a new field? It feels
like we already store this info, no?

Can we not continue to use __mptcp_check_fallback(msk) and
mptcp_check_infinite_map()?

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.
Re: [PATCH mptcp-net v2 2/5] mptcp: plug races between subflow fail and subflow creation
Posted by Paolo Abeni 2 months, 1 week ago
On 7/8/25 5:42 PM, Matthieu Baerts wrote:
> On 05/07/2025 09:24, Paolo Abeni wrote:
>> We have races similar to the one addressed by the previous patch,
>> between subflow failure and additional subflow creation; they are
>> just harder to trigger.
>>
>> The solution is similar. Use a separate flag to track the condition
>> 'the socket state prevents any additional subflow creation',
>> protected by the fallback lock.
>>
>> Socket fallback makes the condition true, and so does receiving or
>> sending an MP_FAIL option.
> 
> Just to be sure, can we not rely on the MPTCP_FALLBACK_DONE flag for the
> two cases?
> 
> In other words, do we not already have something set when
> sending/receiving an MP_FAIL instead of adding a new field? It feels
> like we already store this info, no?

MP_FAIL is not an alias for the new field: the latter must be set
when e.g. an MP_FAIL has been received or sent, or in case of
fallback.

We could check all the above conditions in a helper, but I found it
simpler to add a new field.

> Can we not continue to use __mptcp_check_fallback(msk) and
> mptcp_check_infinite_map()?

Yes and no ;)

Meaning: such checks outside the fallback lock are racy. They are
still useful in the fastpath/datapath, but for fallback-related
decisions we need to acquire the lock first.

/P
Re: [PATCH mptcp-net v2 2/5] mptcp: plug races between subflow fail and subflow creation
Posted by Matthieu Baerts 2 months, 1 week ago
Hi Paolo,

On 09/07/2025 11:14, Paolo Abeni wrote:
> On 7/8/25 5:42 PM, Matthieu Baerts wrote:
>> On 05/07/2025 09:24, Paolo Abeni wrote:
>>> We have races similar to the one addressed by the previous patch,
>>> between subflow failure and additional subflow creation; they are
>>> just harder to trigger.
>>>
>>> The solution is similar. Use a separate flag to track the condition
>>> 'the socket state prevents any additional subflow creation',
>>> protected by the fallback lock.
>>>
>>> Socket fallback makes the condition true, and so does receiving or
>>> sending an MP_FAIL option.
>>
>> Just to be sure, can we not rely on the MPTCP_FALLBACK_DONE flag for the
>> two cases?
>>
>> In other words, do we not already have something set when
>> sending/receiving an MP_FAIL instead of adding a new field? It feels
>> like we already store this info, no?
> 
> MP_FAIL is not an alias for the new field: the latter must be set
> when e.g. an MP_FAIL has been received or sent, or in case of
> fallback.
> 
> We could check all the above conditions in a helper, but I found it
> simpler to add a new field.
> 
>> Can we not continue to use __mptcp_check_fallback(msk) and
>> mptcp_check_infinite_map()?
> 
> Yes and no ;)
> 
> Meaning: such checks outside the fallback lock are racy. They are
> still useful in the fastpath/datapath, but for fallback-related
> decisions we need to acquire the lock first.

OK, I see, thank you.

Hopefully this new field will not be too complex to backport.

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.