[PATCH mptcp-net v3 6/6] mptcp: fix race on unaccepted mptcp sockets

Paolo Abeni posted 6 patches 3 years, 3 months ago
Maintainers: Geliang Tang <geliangtang@xiaomi.com>, Paolo Abeni <pabeni@redhat.com>, Matthieu Baerts <matthieu.baerts@tessares.net>, "David S. Miller" <davem@davemloft.net>, Mat Martineau <mathew.j.martineau@linux.intel.com>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Geliang Tang <geliang.tang@suse.com>
There is a newer version of this series
[PATCH mptcp-net v3 6/6] mptcp: fix race on unaccepted mptcp sockets
Posted by Paolo Abeni 3 years, 3 months ago
When the listener socket owning the relevant request is closed,
it frees the unaccepted subflows and that causes later deletion
of the paired MPTCP sockets.

The mptcp socket's worker can run in the time interval between such delete
operations. When that happens, any access to msk->first will cause a UaF
access, as the subflow cleanup did not clear that field in the mptcp
socket.

Address the issue by explicitly traversing the listener socket accept
queue at close time and performing the needed cleanup on the pending
msk.

Note that the locking is a bit tricky, as we need to acquire the msk
socket lock, while still owning the subflow socket one.

Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c |  5 +++++
 net/mptcp/protocol.h |  2 ++
 net/mptcp/subflow.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 00ba9c44933a..6d2aa41390e7 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2318,6 +2318,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		kfree_rcu(subflow, rcu);
 	} else {
 		/* otherwise tcp will dispose of the ssk and subflow ctx */
+		if (ssk->sk_state == TCP_LISTEN) {
+			tcp_set_state(ssk, TCP_CLOSE);
+			mptcp_subflow_queue_clean(ssk);
+			inet_csk_listen_stop(ssk);
+		}
 		__tcp_close(ssk, 0);
 
 		/* close acquired an extra ref */
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index ad9b02b1b3e6..95c9ace1437b 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -306,6 +306,7 @@ struct mptcp_sock {
 
 	u32 setsockopt_seq;
 	char		ca_name[TCP_CA_NAME_MAX];
+	struct mptcp_sock	*dl_next;
 };
 
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -610,6 +611,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		     struct mptcp_subflow_context *subflow);
 void mptcp_subflow_send_ack(struct sock *ssk);
 void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *ssk);
 void mptcp_sock_graft(struct sock *sk, struct socket *parent);
 struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
 
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 1e182301e58b..db83db1b3c4c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1723,6 +1723,56 @@ static void subflow_state_change(struct sock *sk)
 	}
 }
 
+void mptcp_subflow_queue_clean(struct sock *listener_ssk)
+{
+	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
+	struct mptcp_sock *msk, *next, *head = NULL;
+	struct request_sock *req;
+
+	/* build a list of all unaccepted mptcp sockets */
+	spin_lock_bh(&queue->rskq_lock);
+	for (req = queue->rskq_accept_head; req; req = req->dl_next) {
+		struct mptcp_subflow_context *subflow;
+		struct sock *ssk = req->sk;
+		struct mptcp_sock *msk;
+
+		if (!sk_is_mptcp(ssk))
+			continue;
+
+		subflow = mptcp_subflow_ctx(ssk);
+		if (!subflow || !subflow->conn)
+			continue;
+
+		/* skip if already in list */
+		msk = mptcp_sk(subflow->conn);
+		if (msk->dl_next || msk == head)
+			continue;
+
+		msk->dl_next = head;
+		head = msk;
+	}
+	spin_unlock_bh(&queue->rskq_lock);
+	if (!head)
+		return;
+
+	/* can't acquire the msk socket lock under the subflow one,
+	 * or will cause ABBA deadlock
+	 */
+	release_sock(listener_ssk);
+
+	for (msk = head; msk; msk = next) {
+		struct sock *sk = (struct sock *)msk;
+		bool slow;
+
+		slow = lock_sock_fast_nested(sk);
+		next = msk->dl_next;
+		msk->first = NULL;
+		msk->dl_next = NULL;
+		unlock_sock_fast(sk, slow);
+	}
+	lock_sock(listener_ssk);
+}
+
 static int subflow_ulp_init(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-- 
2.35.3


Re: [PATCH mptcp-net v3 6/6] mptcp: fix race on unaccepted mptcp sockets
Posted by Mat Martineau 3 years, 3 months ago
On Fri, 17 Jun 2022, Paolo Abeni wrote:

> When the listener socket owning the relevant request is closed,
> it frees the unaccepted subflows and that causes later deletion
> of the paired MPTCP sockets.
>
> The mptcp socket's worker can run in the time interval between such delete
> operations. When that happens, any access to msk->first will cause an UaF
> access, as the subflow cleanup did not cleared such field in the mptcp
> socket.

Did you run in to this UaF in a self test? Was it possible before this 
patch series?

>
> Address the issue explictly traversing the listener socket accept
> queue at close time and performing the needed cleanup on the pending
> msk.
>
> Note that the locking is a bit tricky, as we need to acquire the msk
> socket lock, while still owning the subflow socket one.
>
> Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> net/mptcp/protocol.c |  5 +++++
> net/mptcp/protocol.h |  2 ++
> net/mptcp/subflow.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 57 insertions(+)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 00ba9c44933a..6d2aa41390e7 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -2318,6 +2318,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> 		kfree_rcu(subflow, rcu);
> 	} else {
> 		/* otherwise tcp will dispose of the ssk and subflow ctx */
> +		if (ssk->sk_state == TCP_LISTEN) {
> +			tcp_set_state(ssk, TCP_CLOSE);
> +			mptcp_subflow_queue_clean(ssk);
> +			inet_csk_listen_stop(ssk);
> +		}
> 		__tcp_close(ssk, 0);
>
> 		/* close acquired an extra ref */
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index ad9b02b1b3e6..95c9ace1437b 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -306,6 +306,7 @@ struct mptcp_sock {
>
> 	u32 setsockopt_seq;
> 	char		ca_name[TCP_CA_NAME_MAX];
> +	struct mptcp_sock	*dl_next;
> };
>
> #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
> @@ -610,6 +611,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> 		     struct mptcp_subflow_context *subflow);
> void mptcp_subflow_send_ack(struct sock *ssk);
> void mptcp_subflow_reset(struct sock *ssk);
> +void mptcp_subflow_queue_clean(struct sock *ssk);
> void mptcp_sock_graft(struct sock *sk, struct socket *parent);
> struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
>
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 1e182301e58b..db83db1b3c4c 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -1723,6 +1723,56 @@ static void subflow_state_change(struct sock *sk)
> 	}
> }
>
> +void mptcp_subflow_queue_clean(struct sock *listener_ssk)
> +{
> +	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
> +	struct mptcp_sock *msk, *next, *head = NULL;
> +	struct request_sock *req;
> +
> +	/* build a list of all unaccepted mptcp sockets */
> +	spin_lock_bh(&queue->rskq_lock);
> +	for (req = queue->rskq_accept_head; req; req = req->dl_next) {
> +		struct mptcp_subflow_context *subflow;
> +		struct sock *ssk = req->sk;
> +		struct mptcp_sock *msk;
> +
> +		if (!sk_is_mptcp(ssk))
> +			continue;
> +
> +		subflow = mptcp_subflow_ctx(ssk);
> +		if (!subflow || !subflow->conn)
> +			continue;
> +
> +		/* skip if already in list */
> +		msk = mptcp_sk(subflow->conn);
> +		if (msk->dl_next || msk == head)
> +			continue;
> +
> +		msk->dl_next = head;

Why is it ok to read/modify msk->dl_next without the msk locked here, but 
the msk lock is needed below?


Are the only msks in this queue created by incoming MP_CAPABLE SYNs? Or 
could MP_JOIN subflow request socks be involved too, which would have 
msk->first pointers to completely different listener ssks?


> +		head = msk;
> +	}
> +	spin_unlock_bh(&queue->rskq_lock);
> +	if (!head)
> +		return;
> +
> +	/* can't acquire the msk socket lock under the subflow one,
> +	 * or will cause ABBA deadlock
> +	 */
> +	release_sock(listener_ssk);
> +
> +	for (msk = head; msk; msk = next) {
> +		struct sock *sk = (struct sock *)msk;
> +		bool slow;
> +
> +		slow = lock_sock_fast_nested(sk);
> +		next = msk->dl_next;
> +		msk->first = NULL;

Related to my question above, does it make sense to check that msk->first 
== listener_ssk before setting to NULL?


> +		msk->dl_next = NULL;
> +		unlock_sock_fast(sk, slow);
> +	}
> +	lock_sock(listener_ssk);

The lock being re-acquired here was locked by the caller using:

lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);


Should that notation be used here too?

> +}
> +
> static int subflow_ulp_init(struct sock *sk)
> {
> 	struct inet_connection_sock *icsk = inet_csk(sk);
> -- 
> 2.35.3
>
>
>

--
Mat Martineau
Intel

Re: [PATCH mptcp-net v3 6/6] mptcp: fix race on unaccepted mptcp sockets
Posted by Paolo Abeni 3 years, 2 months ago
On Fri, 2022-06-17 at 17:51 -0700, Mat Martineau wrote:
> On Fri, 17 Jun 2022, Paolo Abeni wrote:
> 
> > When the listener socket owning the relevant request is closed,
> > it frees the unaccepted subflows and that causes later deletion
> > of the paired MPTCP sockets.
> > 
> > The mptcp socket's worker can run in the time interval between such delete
> > operations. When that happens, any access to msk->first will cause an UaF
> > access, as the subflow cleanup did not cleared such field in the mptcp
> > socket.
> 
> Did you run in to this UaF in a self test? 

IIRC, via packetdrill

> Was it possible before this patch series?

Yes, AFAICS since the commit mentioned in the Fixes tag. What really
changes is the frequency: with this series it becomes almost
deterministic, while before this series it was very hard to trigger.

> > 
> > Address the issue explictly traversing the listener socket accept
> > queue at close time and performing the needed cleanup on the pending
> > msk.
> > 
> > Note that the locking is a bit tricky, as we need to acquire the msk
> > socket lock, while still owning the subflow socket one.
> > 
> > Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk")
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > net/mptcp/protocol.c |  5 +++++
> > net/mptcp/protocol.h |  2 ++
> > net/mptcp/subflow.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 57 insertions(+)
> > 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 00ba9c44933a..6d2aa41390e7 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -2318,6 +2318,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> > 		kfree_rcu(subflow, rcu);
> > 	} else {
> > 		/* otherwise tcp will dispose of the ssk and subflow ctx */
> > +		if (ssk->sk_state == TCP_LISTEN) {
> > +			tcp_set_state(ssk, TCP_CLOSE);
> > +			mptcp_subflow_queue_clean(ssk);
> > +			inet_csk_listen_stop(ssk);
> > +		}
> > 		__tcp_close(ssk, 0);
> > 
> > 		/* close acquired an extra ref */
> > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > index ad9b02b1b3e6..95c9ace1437b 100644
> > --- a/net/mptcp/protocol.h
> > +++ b/net/mptcp/protocol.h
> > @@ -306,6 +306,7 @@ struct mptcp_sock {
> > 
> > 	u32 setsockopt_seq;
> > 	char		ca_name[TCP_CA_NAME_MAX];
> > +	struct mptcp_sock	*dl_next;
> > };
> > 
> > #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
> > @@ -610,6 +611,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> > 		     struct mptcp_subflow_context *subflow);
> > void mptcp_subflow_send_ack(struct sock *ssk);
> > void mptcp_subflow_reset(struct sock *ssk);
> > +void mptcp_subflow_queue_clean(struct sock *ssk);
> > void mptcp_sock_graft(struct sock *sk, struct socket *parent);
> > struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
> > 
> > diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> > index 1e182301e58b..db83db1b3c4c 100644
> > --- a/net/mptcp/subflow.c
> > +++ b/net/mptcp/subflow.c
> > @@ -1723,6 +1723,56 @@ static void subflow_state_change(struct sock *sk)
> > 	}
> > }
> > 
> > +void mptcp_subflow_queue_clean(struct sock *listener_ssk)
> > +{
> > +	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
> > +	struct mptcp_sock *msk, *next, *head = NULL;
> > +	struct request_sock *req;
> > +
> > +	/* build a list of all unaccepted mptcp sockets */
> > +	spin_lock_bh(&queue->rskq_lock);
> > +	for (req = queue->rskq_accept_head; req; req = req->dl_next) {
> > +		struct mptcp_subflow_context *subflow;
> > +		struct sock *ssk = req->sk;
> > +		struct mptcp_sock *msk;
> > +
> > +		if (!sk_is_mptcp(ssk))
> > +			continue;
> > +
> > +		subflow = mptcp_subflow_ctx(ssk);
> > +		if (!subflow || !subflow->conn)
> > +			continue;
> > +
> > +		/* skip if already in list */
> > +		msk = mptcp_sk(subflow->conn);
> > +		if (msk->dl_next || msk == head)
> > +			continue;
> > +
> > +		msk->dl_next = head;
> 
> Why is it ok to read/modify msk->dl_next without the msk locked here, but 
> the msk lock is needed below?

the dl_next field is protected by the listener socket lock. Such lock
is held both here and below; see the caller of this function.
> 
> Are the only msks in this queue created by incoming MP_CAPABLE SYNs? 

The queue contains only the subflows, and specifically the MPC
subflows. MPJ subflows never land into the request queue - the helper
inserting the req_sk there is inet_csk_reqsk_queue_add(), invoked by
tcp_check_req() via inet_csk_complete_hashdance().

> Or 
> could MP_JOIN subflow request socks be involved too, which would have 
> msk->first pointers to completely different listener ssks?

msk->first does not point to the listener socket, it points to the mpc
subflow (ssk, in the above code).
> 
> 
> > +		head = msk;
> > +	}
> > +	spin_unlock_bh(&queue->rskq_lock);
> > +	if (!head)
> > +		return;
> > +
> > +	/* can't acquire the msk socket lock under the subflow one,
> > +	 * or will cause ABBA deadlock
> > +	 */
> > +	release_sock(listener_ssk);
> > +
> > +	for (msk = head; msk; msk = next) {
> > +		struct sock *sk = (struct sock *)msk;
> > +		bool slow;
> > +
> > +		slow = lock_sock_fast_nested(sk);
> > +		next = msk->dl_next;
> > +		msk->first = NULL;
> 
> Related to my question above, does it make sense to check that msk->first 
> == listener_ssk before setting to NULL?

it should be msk->first == ssk, in the previous loop. It should be
redundant, too. See subflow_syn_recv_sock().
> 
> 
> > +		msk->dl_next = NULL;
> > +		unlock_sock_fast(sk, slow);
> > +	}
> > +	lock_sock(listener_ssk);
> 
> The lock being re-acquired here was locked by the caller using:
> 
> lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
> 
> Should that notation be used here too?

Indeed, that is needed. I'll send a v4.

Thanks!

Paolo


Re: [PATCH mptcp-net v3 6/6] mptcp: fix race on unaccepted mptcp sockets
Posted by Paolo Abeni 3 years, 2 months ago
On Mon, 2022-06-20 at 12:15 +0200, Paolo Abeni wrote:
> On Fri, 2022-06-17 at 17:51 -0700, Mat Martineau wrote:
> > On Fri, 17 Jun 2022, Paolo Abeni wrote:
> > > diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> > > index 1e182301e58b..db83db1b3c4c 100644
> > > --- a/net/mptcp/subflow.c
> > > +++ b/net/mptcp/subflow.c
> > > @@ -1723,6 +1723,56 @@ static void subflow_state_change(struct sock *sk)
> > > 	}
> > > }
> > > 
> > > +void mptcp_subflow_queue_clean(struct sock *listener_ssk)
> > > +{
> > > +	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
> > > +	struct mptcp_sock *msk, *next, *head = NULL;
> > > +	struct request_sock *req;
> > > +
> > > +	/* build a list of all unaccepted mptcp sockets */
> > > +	spin_lock_bh(&queue->rskq_lock);
> > > +	for (req = queue->rskq_accept_head; req; req = req->dl_next) {
> > > +		struct mptcp_subflow_context *subflow;
> > > +		struct sock *ssk = req->sk;
> > > +		struct mptcp_sock *msk;
> > > +
> > > +		if (!sk_is_mptcp(ssk))
> > > +			continue;
> > > +
> > > +		subflow = mptcp_subflow_ctx(ssk);
> > > +		if (!subflow || !subflow->conn)
> > > +			continue;
> > > +
> > > +		/* skip if already in list */
> > > +		msk = mptcp_sk(subflow->conn);
> > > +		if (msk->dl_next || msk == head)
> > > +			continue;
> > > +
> > > +		msk->dl_next = head;
> > 
> > Why is it ok to read/modify msk->dl_next without the msk locked here, but 
> > the msk lock is needed below?
> 
> the dl_next field is protected from the listener socket lock. Such lock
> is held both here and below, see the caller of this function.
> 

I see the above was not very clear. I mean: this whole function runs
under the listener _msk_ socket lock, and the 'dl_next' list is
protected by that lock.

/P