Kill
- unix_state_lock_nested
- _nested usage for net->unx.table.locks[].

replace both with lock_set_cmp_fn_ptr_order(&u->lock).

The lock ordering in sk_diag_dump_icons() looks suspicious; this may
turn up a real issue.

Cc: netdev@vger.kernel.org
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/net/af_unix.h |  3 ---
 net/unix/af_unix.c    | 20 ++++++++------------
 net/unix/diag.c       |  2 +-
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 49c4640027d8..4eff0a089640 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -48,9 +48,6 @@ struct scm_stat {
 
 #define unix_state_lock(s)	spin_lock(&unix_sk(s)->lock)
 #define unix_state_unlock(s)	spin_unlock(&unix_sk(s)->lock)
-#define unix_state_lock_nested(s) \
-				spin_lock_nested(&unix_sk(s)->lock, \
-				SINGLE_DEPTH_NESTING)
 
 /* The AF_UNIX socket */
 struct unix_sock {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d013de3c5490..1a0d273799c1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -170,7 +170,7 @@ static void unix_table_double_lock(struct net *net,
 		swap(hash1, hash2);
 
 	spin_lock(&net->unx.table.locks[hash1]);
-	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
+	spin_lock(&net->unx.table.locks[hash2]);
 }
 
 static void unix_table_double_unlock(struct net *net,
@@ -997,6 +997,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
+	lock_set_cmp_fn_ptr_order(&u->lock);
 	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->iolock); /* single task reading lock */
@@ -1340,17 +1341,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 {
-	if (unlikely(sk1 == sk2) || !sk2) {
-		unix_state_lock(sk1);
-		return;
-	}
-	if (sk1 < sk2) {
+	if (sk1 > sk2)
+		swap(sk1, sk2);
+	if (sk1 && sk1 != sk2)
 		unix_state_lock(sk1);
-		unix_state_lock_nested(sk2);
-	} else {
-		unix_state_lock(sk2);
-		unix_state_lock_nested(sk1);
-	}
+	unix_state_lock(sk2);
 }
 
 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
@@ -1591,7 +1586,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out_unlock;
 	}
 
-	unix_state_lock_nested(sk);
+	unix_state_lock(sk);
 
 	if (sk->sk_state != st) {
 		unix_state_unlock(sk);
@@ -3575,6 +3570,7 @@ static int __net_init unix_net_init(struct net *net)
 
 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
 		spin_lock_init(&net->unx.table.locks[i]);
+		lock_set_cmp_fn_ptr_order(&net->unx.table.locks[i]);
 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
 	}
 
diff --git a/net/unix/diag.c b/net/unix/diag.c
index bec09a3a1d44..8ab5e2217e4c 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -84,7 +84,7 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
 			 * queue lock. With the other's queue locked it's
 			 * OK to lock the state.
 			 */
-			unix_state_lock_nested(req);
+			unix_state_lock(req);
 			peer = unix_sk(req)->peer;
 			buf[i++] = (peer ? sock_i_ino(peer) : 0);
 			unix_state_unlock(req);
--
2.43.0
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 26 Jan 2024 21:08:31 -0500
> Kill
> - unix_state_lock_nested
> - _nested usage for net->unx.table.locks[].
>
> replace both with lock_set_cmp_fn_ptr_order(&u->lock).
>
> The lock ordering in sk_diag_dump_icons() looks suspicious; this may
> turn up a real issue.

Yes, you cannot use lock_cmp_fn() for unix_state_lock_nested().

The lock order in sk_diag_dump_icons() is

  listening socket -> child socket in the listener's queue

and the inverse order never happens. Pointer comparison does not make
sense in this case, and lockdep will complain with a false positive.
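
To illustrate (sketch only, assuming lock_set_cmp_fn_ptr_order() boils
down to plain address comparison, the same order unix_state_double_lock()
uses after your patch):

---8<---
/* Sketch, not in-tree code: sk_diag_dump_icons() always locks the
 * listener before the embryo, but the embryo can be allocated at a
 * lower address, so an address-order cmp_fn reports a deadlock that
 * cannot happen.
 */
static void example_false_positive(struct sock *listener, struct sock *embryo)
{
	unix_state_lock(listener);	/* always taken first */
	unix_state_lock(embryo);	/* splats whenever embryo < listener */
	unix_state_unlock(embryo);
	unix_state_unlock(listener);
}
---8<---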
On Sun, Jan 28, 2024 at 12:28:38AM -0800, Kuniyuki Iwashima wrote:
> Yes, you cannot use lock_cmp_fn() for unix_state_lock_nested().
>
> The lock order in sk_diag_dump_icons() is
>
>   listening socket -> child socket in the listener's queue
>
> and the inverse order never happens. Pointer comparison does not make
> sense in this case, and lockdep will complain with a false positive.

Is that a real lock ordering? Is this parent -> child relationship well
defined?

If it is, we should be able to write a lock_cmp_fn for it, as long as
it's not some handwavy "this will never happen but _nested won't check
for it" like I saw elsewhere in the net code... :)
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 28 Jan 2024 14:38:02 -0500
> On Sun, Jan 28, 2024 at 12:28:38AM -0800, Kuniyuki Iwashima wrote:
> > Yes, you cannot use lock_cmp_fn() for unix_state_lock_nested().
> >
> > The lock order in sk_diag_dump_icons() is
> >
> >   listening socket -> child socket in the listener's queue
> >
> > and the inverse order never happens. Pointer comparison does not make
> > sense in this case, and lockdep will complain with a false positive.
>
> Is that a real lock ordering? Is this parent -> child relationship well
> defined?
>
> If it is, we should be able to write a lock_cmp_fn for it, as long as
> it's not some handwavy "this will never happen but _nested won't check
> for it" like I saw elsewhere in the net code... :)

The problem would be there's no handy way to detect the relationship
except for iterating the queue again.

---8<---
static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a = container_of(_a, struct unix_sock, lock.dep_map);
	const struct unix_sock *b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN && b->sk.sk_state == TCP_ESTABLISHED) {
		/* check if b is a's child */
	}

	/* otherwise, ptr comparison here. */
}
---8<---
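
Filling in that check without extra state would mean walking the
listener's receive queue, where each embryo sits as skb->sk. A rough,
untested sketch -- it needs the queue lock plus an O(n) scan, which is
not something a cmp_fn should be doing:

---8<---
/* untested sketch: is "child" an embryo queued on "listener"? */
static bool unix_sk_is_child_of(struct sock *listener, struct sock *child)
{
	struct sk_buff *skb;
	bool found = false;

	spin_lock(&listener->sk_receive_queue.lock);
	skb_queue_walk(&listener->sk_receive_queue, skb) {
		if (skb->sk == child) {
			found = true;
			break;
		}
	}
	spin_unlock(&listener->sk_receive_queue.lock);

	return found;
}
---8<---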

This can be resolved by a patch like this, which is in my local tree
for another series.

So, after posting the series, I can revisit this and write lock_cmp_fn
for u->lock.

---8<---
commit 12d39766b06068fda5987f4e7b4827e8c3273137
Author: Kuniyuki Iwashima <kuniyu@amazon.com>
Date:   Thu Jan 11 22:36:58 2024 +0000

    af_unix: Save listener for embryo socket.

    Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 9ea04653c7c9..d0c0d81bcb1d 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -82,6 +82,7 @@ struct scm_stat {
 struct unix_sock {
 	/* WARNING: sk has to be the first member */
 	struct sock		sk;
+#define usk_listener		sk.__sk_common.skc_unix_listener
 	struct unix_address	*addr;
 	struct path		path;
 	struct mutex		iolock, bindlock;
diff --git a/include/net/sock.h b/include/net/sock.h
index a9d99a9c583f..3df3d068345a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -142,6 +142,8 @@ typedef __u64 __bitwise __addrpair;
  *	[union with @skc_incoming_cpu]
  * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
  *	[union with @skc_incoming_cpu]
+ * @skc_unix_listener: connection request listener socket for AF_UNIX
+ *	[union with @skc_rxhash]
  * @skc_refcnt: reference count
  *
  * This is the minimal network layer representation of sockets, the header
@@ -227,6 +229,7 @@ struct sock_common {
 		u32		skc_rxhash;
 		u32		skc_window_clamp;
 		u32		skc_tw_snd_nxt; /* struct tcp_timewait_sock */
+		struct sock	*skc_unix_listener; /* struct unix_sock */
 	};
 	/* public: */
 };
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5f9871555ec6..4a41bb727c32 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -991,6 +991,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
 	u = unix_sk(sk);
+	u->usk_listener = NULL;
 	u->inflight = 0;
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
@@ -1612,6 +1613,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	newsk->sk_type		= sk->sk_type;
 	init_peercred(newsk);
 	newu = unix_sk(newsk);
+	newu->usk_listener = other;
 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
 	otheru = unix_sk(other);
 
@@ -1707,8 +1709,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 			  bool kern)
 {
 	struct sock *sk = sock->sk;
-	struct sock *tsk;
 	struct sk_buff *skb;
+	struct sock *tsk;
 	int err;
 
 	err = -EOPNOTSUPP;
@@ -1733,6 +1735,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	}
 
 	tsk = skb->sk;
+	unix_sk(tsk)->usk_listener = NULL;
 	skb_free_datagram(sk, skb);
 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
 
---8<---
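
With usk_listener saved, the unix_state_lock_cmp_fn() skeleton above
could be completed roughly like this (untested, just to show the shape):

---8<---
static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	/* A listener's lock is always taken before its embryo's. */
	if (b->usk_listener == &a->sk)
		return -1;
	if (a->usk_listener == &b->sk)
		return 1;

	/* Otherwise, the unix_state_double_lock() order: lower address first. */
	return &a->sk < &b->sk ? -1 : 1;
}
---8<---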
On Sun, Jan 28, 2024 at 12:56:32PM -0800, Kuniyuki Iwashima wrote:
> The problem would be there's no handy way to detect the relationship
> except for iterating the queue again.
>
> This can be resolved by a patch like this, which is in my local tree
> for another series.
>
> So, after posting the series, I can revisit this and write lock_cmp_fn
> for u->lock.

Sounds good! Please CC me when you do.