[PATCH mptcp-net v2] mptcp: fix lockdep false positive

Paolo Abeni posted 1 patch 1 year, 4 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/multipath-tcp/mptcp_net-next tags/patchew/1f27ea6ff3b76035685ec372cc9e2753b9f9d90e.1671032651.git.pabeni@redhat.com
Maintainers: Mat Martineau <mathew.j.martineau@linux.intel.com>, Matthieu Baerts <matthieu.baerts@tessares.net>, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Menglong Dong <imagedong@tencent.com>, Mengen Sun <mengensun@tencent.com>
net/mptcp/protocol.c |  2 +-
net/mptcp/protocol.h |  2 +-
net/mptcp/subflow.c  | 19 +++++++++++++++++--
3 files changed, 19 insertions(+), 4 deletions(-)
[PATCH mptcp-net v2] mptcp: fix lockdep false positive
Posted by Paolo Abeni 1 year, 4 months ago
MatB reported a lockdep splat in the mptcp listener code cleanup:

 WARNING: possible circular locking dependency detected
 packetdrill/14278 is trying to acquire lock:
 ffff888017d868f0 ((work_completion)(&msk->work)){+.+.}-{0:0}, at: __flush_work (kernel/workqueue.c:3069)

 but task is already holding lock:
 ffff888017d84130 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close (net/mptcp/protocol.c:2973)

 which lock already depends on the new lock.


 the existing dependency chain (in reverse order) is:

 -> #1 (sk_lock-AF_INET){+.+.}-{0:0}:
        __lock_acquire (kernel/locking/lockdep.c:5055)
        lock_acquire (kernel/locking/lockdep.c:466)
        lock_sock_nested (net/core/sock.c:3463)
        mptcp_worker (net/mptcp/protocol.c:2614)
        process_one_work (kernel/workqueue.c:2294)
        worker_thread (include/linux/list.h:292)
        kthread (kernel/kthread.c:376)
        ret_from_fork (arch/x86/entry/entry_64.S:312)

 -> #0 ((work_completion)(&msk->work)){+.+.}-{0:0}:
        check_prev_add (kernel/locking/lockdep.c:3098)
        validate_chain (kernel/locking/lockdep.c:3217)
        __lock_acquire (kernel/locking/lockdep.c:5055)
        lock_acquire (kernel/locking/lockdep.c:466)
        __flush_work (kernel/workqueue.c:3070)
        __cancel_work_timer (kernel/workqueue.c:3160)
        mptcp_cancel_work (net/mptcp/protocol.c:2758)
        mptcp_subflow_queue_clean (net/mptcp/subflow.c:1817)
        __mptcp_close_ssk (net/mptcp/protocol.c:2363)
        mptcp_destroy_common (net/mptcp/protocol.c:3170)
        mptcp_destroy (include/net/sock.h:1495)
        __mptcp_destroy_sock (net/mptcp/protocol.c:2886)
        __mptcp_close (net/mptcp/protocol.c:2959)
        mptcp_close (net/mptcp/protocol.c:2974)
        inet_release (net/ipv4/af_inet.c:432)
        __sock_release (net/socket.c:651)
        sock_close (net/socket.c:1367)
        __fput (fs/file_table.c:320)
        task_work_run (kernel/task_work.c:181 (discriminator 1))
        exit_to_user_mode_prepare (include/linux/resume_user_mode.h:49)
        syscall_exit_to_user_mode (kernel/entry/common.c:130)
        do_syscall_64 (arch/x86/entry/common.c:87)
        entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)

 other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(sk_lock-AF_INET);
                                lock((work_completion)(&msk->work));
                                lock(sk_lock-AF_INET);
   lock((work_completion)(&msk->work));

  *** DEADLOCK ***

The report is actually a false positive, since the only existing lock
nesting is the msk socket lock acquired by the mptcp work.
cancel_work_sync() is invoked without the relevant socket lock being
held, but under a different (the msk listener) socket lock.

We could silence the splat adding a per workqueue dynamic lockdep key,
but that looks overkill. Instead just tell lockdep the msk socket lock
is not held around cancel_work_sync().

Fixes: 30e51b923e43 ("mptcp: fix unreleased socket in accept queue")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
v1 -> v2:
 - fix a few typos (MatM)
---
 net/mptcp/protocol.c |  2 +-
 net/mptcp/protocol.h |  2 +-
 net/mptcp/subflow.c  | 19 +++++++++++++++++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6d03bdcda33e..289dd4add88c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2363,7 +2363,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		/* otherwise tcp will dispose of the ssk and subflow ctx */
 		if (ssk->sk_state == TCP_LISTEN) {
 			tcp_set_state(ssk, TCP_CLOSE);
-			mptcp_subflow_queue_clean(ssk);
+			mptcp_subflow_queue_clean(sk, ssk);
 			inet_csk_listen_stop(ssk);
 			mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
 		}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a9ff7028fad8..3cd3270407b0 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -630,7 +630,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		     struct mptcp_subflow_context *subflow);
 void __mptcp_subflow_send_ack(struct sock *ssk);
 void mptcp_subflow_reset(struct sock *ssk);
-void mptcp_subflow_queue_clean(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
 void mptcp_sock_graft(struct sock *sk, struct socket *parent);
 struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
 bool __mptcp_close(struct sock *sk, long timeout);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 29904303f5c2..294f8d1bb3fc 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1764,7 +1764,7 @@ static void subflow_state_change(struct sock *sk)
 	}
 }
 
-void mptcp_subflow_queue_clean(struct sock *listener_ssk)
+void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
 {
 	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
 	struct mptcp_sock *msk, *next, *head = NULL;
@@ -1813,8 +1813,23 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk)
 
 		do_cancel_work = __mptcp_close(sk, 0);
 		release_sock(sk);
-		if (do_cancel_work)
+		if (do_cancel_work) {
+			/* lockdep will report a false positive ABBA deadlock
+			 * between cancel_work_sync and the listener socket.
+			 * The involved locks belong to different sockets WRT
+			 * the existing AB chain.
+			 * Using a per socket key is problematic as key
+			 * deregistration requires process context and must be
+			 * performed at socket disposal time, in atomic
+			 * context.
+			 * Just tell lockdep to consider the listener socket
+			 * released here.
+			 */
+			mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
 			mptcp_cancel_work(sk);
+			mutex_acquire(&listener_sk->sk_lock.dep_map,
+				      SINGLE_DEPTH_NESTING, 0, _RET_IP_);
+		}
 		sock_put(sk);
 	}
 
-- 
2.38.1
Re: [PATCH mptcp-net v2] mptcp: fix lockdep false positive
Posted by Matthieu Baerts 1 year, 4 months ago
Hi Paolo, Mat,

On 14/12/2022 16:44, Paolo Abeni wrote:
> MatB reported a lockdep splat in the mptcp listener code cleanup:

(...)

> The report is actually a false positive, since the only existing lock
> nesting is the msk socket lock acquired by the mptcp work.
> cancel_work_sync() is invoked without the relevant socket lock being
> held, but under a different (the msk listener) socket lock.
> 
> We could silence the splat adding a per workqueue dynamic lockdep key,
> but that looks overkill. Instead just tell lockdep the msk socket lock
> is not held around cancel_work_sync().

Thank you for the patches, the great explanations and the reviews!

Now in our tree (fix for -net) with Mat's RvB, a Closes and RpB tags.

New patches for t/upstream-net and t/upstream:
- 71fd5e75df68: mptcp: fix lockdep false positive
- Results: d330d6da95d3..74c46cd6bf02 (export-net)
- Results: d3d3b2053b5b..130b6676200e (export)

Tests are now in progress:

https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export-net/20221215T095520
https://cirrus-ci.com/github/multipath-tcp/mptcp_net-next/export/20221215T095520

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net
Re: [PATCH mptcp-net v2] mptcp: fix lockdep false positive
Posted by Mat Martineau 1 year, 4 months ago
On Wed, 14 Dec 2022, Paolo Abeni wrote:

> MatB reported a lockdep splat in the mptcp listener code cleanup:
>
> WARNING: possible circular locking dependency detected
> packetdrill/14278 is trying to acquire lock:
> ffff888017d868f0 ((work_completion)(&msk->work)){+.+.}-{0:0}, at: __flush_work (kernel/workqueue.c:3069)
>
> but task is already holding lock:
> ffff888017d84130 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close (net/mptcp/protocol.c:2973)
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (sk_lock-AF_INET){+.+.}-{0:0}:
>        __lock_acquire (kernel/locking/lockdep.c:5055)
>        lock_acquire (kernel/locking/lockdep.c:466)
>        lock_sock_nested (net/core/sock.c:3463)
>        mptcp_worker (net/mptcp/protocol.c:2614)
>        process_one_work (kernel/workqueue.c:2294)
>        worker_thread (include/linux/list.h:292)
>        kthread (kernel/kthread.c:376)
>        ret_from_fork (arch/x86/entry/entry_64.S:312)
>
> -> #0 ((work_completion)(&msk->work)){+.+.}-{0:0}:
>        check_prev_add (kernel/locking/lockdep.c:3098)
>        validate_chain (kernel/locking/lockdep.c:3217)
>        __lock_acquire (kernel/locking/lockdep.c:5055)
>        lock_acquire (kernel/locking/lockdep.c:466)
>        __flush_work (kernel/workqueue.c:3070)
>        __cancel_work_timer (kernel/workqueue.c:3160)
>        mptcp_cancel_work (net/mptcp/protocol.c:2758)
>        mptcp_subflow_queue_clean (net/mptcp/subflow.c:1817)
>        __mptcp_close_ssk (net/mptcp/protocol.c:2363)
>        mptcp_destroy_common (net/mptcp/protocol.c:3170)
>        mptcp_destroy (include/net/sock.h:1495)
>        __mptcp_destroy_sock (net/mptcp/protocol.c:2886)
>        __mptcp_close (net/mptcp/protocol.c:2959)
>        mptcp_close (net/mptcp/protocol.c:2974)
>        inet_release (net/ipv4/af_inet.c:432)
>        __sock_release (net/socket.c:651)
>        sock_close (net/socket.c:1367)
>        __fput (fs/file_table.c:320)
>        task_work_run (kernel/task_work.c:181 (discriminator 1))
>        exit_to_user_mode_prepare (include/linux/resume_user_mode.h:49)
>        syscall_exit_to_user_mode (kernel/entry/common.c:130)
>        do_syscall_64 (arch/x86/entry/common.c:87)
>        entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
>
> other info that might help us debug this:
>
>  Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(sk_lock-AF_INET);
>                                lock((work_completion)(&msk->work));
>                                lock(sk_lock-AF_INET);
>   lock((work_completion)(&msk->work));
>
>  *** DEADLOCK ***
>
> The report is actually a false positive, since the only existing lock
> nesting is the msk socket lock acquired by the mptcp work.
> cancel_work_sync() is invoked without the relevant socket lock being
> held, but under a different (the msk listener) socket lock.
>
> We could silence the splat adding a per workqueue dynamic lockdep key,
> but that looks overkill. Instead just tell lockdep the msk socket lock
> is not held around cancel_work_sync().
>
> Fixes: 30e51b923e43 ("mptcp: fix unreleased socket in accept queue")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> --
> v1 -> v2:
> - fix a few typos (MatM)
> ---
> net/mptcp/protocol.c |  2 +-
> net/mptcp/protocol.h |  2 +-
> net/mptcp/subflow.c  | 19 +++++++++++++++++--
> 3 files changed, 19 insertions(+), 4 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 6d03bdcda33e..289dd4add88c 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -2363,7 +2363,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> 		/* otherwise tcp will dispose of the ssk and subflow ctx */
> 		if (ssk->sk_state == TCP_LISTEN) {
> 			tcp_set_state(ssk, TCP_CLOSE);
> -			mptcp_subflow_queue_clean(ssk);
> +			mptcp_subflow_queue_clean(sk, ssk);
> 			inet_csk_listen_stop(ssk);
> 			mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
> 		}
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index a9ff7028fad8..3cd3270407b0 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -630,7 +630,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
> 		     struct mptcp_subflow_context *subflow);
> void __mptcp_subflow_send_ack(struct sock *ssk);
> void mptcp_subflow_reset(struct sock *ssk);
> -void mptcp_subflow_queue_clean(struct sock *ssk);
> +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
> void mptcp_sock_graft(struct sock *sk, struct socket *parent);
> struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
> bool __mptcp_close(struct sock *sk, long timeout);
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 29904303f5c2..294f8d1bb3fc 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -1764,7 +1764,7 @@ static void subflow_state_change(struct sock *sk)
> 	}
> }
>
> -void mptcp_subflow_queue_clean(struct sock *listener_ssk)
> +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
> {
> 	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
> 	struct mptcp_sock *msk, *next, *head = NULL;
> @@ -1813,8 +1813,23 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk)
>
> 		do_cancel_work = __mptcp_close(sk, 0);
> 		release_sock(sk);
> -		if (do_cancel_work)
> +		if (do_cancel_work) {
> +			/* lockdep will report a false positive ABBA deadlock
> +			 * between cancel_work_sync and the listener socket.
> +			 * The involved locks belong to different sockets WRT
> +			 * the existing AB chain.
> +			 * Using a per socket key is problematic as key
> +			 * deregistration requires process context and must be
> +			 * performed at socket disposal time, in atomic
> +			 * context.
> +			 * Just tell lockdep to consider the listener socket
> +			 * released here.
> +			 */
> +			mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
> 			mptcp_cancel_work(sk);
> +			mutex_acquire(&listener_sk->sk_lock.dep_map,
> +				      SINGLE_DEPTH_NESTING, 0, _RET_IP_);
> +		}

Thanks for the v2 Paolo.

I took another look at my concern in v1. What I missed was that we know 
listener_sk != sk (because sk is from listener_sk's unaccepted list), so 
there's never a chance for a real deadlock.

The workaround looks safe and reasonable to me:

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>



--
Mat Martineau
Intel