include/linux/skmsg.h | 2 +- include/net/sock.h | 58 +++++++++++++++++++++++++++++++------------ net/core/skmsg.c | 3 ++- 3 files changed, 45 insertions(+), 18 deletions(-)
Syzkaller reports a refcount bug as follows:
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19
Modules linked in:
CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0
<TASK>
__refcount_add_not_zero include/linux/refcount.h:163 [inline]
__refcount_inc_not_zero include/linux/refcount.h:227 [inline]
refcount_inc_not_zero include/linux/refcount.h:245 [inline]
sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439
tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091
tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983
tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057
tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659
tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682
sk_backlog_rcv include/net/sock.h:1061 [inline]
__release_sock+0x134/0x3b0 net/core/sock.c:2849
release_sock+0x54/0x1b0 net/core/sock.c:3404
inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909
__sys_shutdown_sock net/socket.c:2331 [inline]
__sys_shutdown_sock net/socket.c:2325 [inline]
__sys_shutdown+0xf1/0x1b0 net/socket.c:2343
__do_sys_shutdown net/socket.c:2351 [inline]
__se_sys_shutdown net/socket.c:2349 [inline]
__x64_sys_shutdown+0x50/0x70 net/socket.c:2349
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
</TASK>
During the SMC fallback process in the connect syscall, the kernel
replaces TCP with SMC. In order to forward wakeups to the
smc socket waitqueue after fallback, the kernel sets
clcsk->sk_user_data to the original smc socket in
smc_fback_replace_callbacks().
Later, in the shutdown syscall, the kernel calls
sk_psock_get(), which treats the clcsk->sk_user_data
as a psock pointer, triggering the refcount warning.
So the root cause is that both smc and psock use the
sk_user_data field, so they can easily mismatch on this
field.
This patch solves it by using another bit (defined as
SK_USER_DATA_PSOCK) in PTRMASK to mark whether
sk_user_data points to a psock object or not.
This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e
("net, sk_msg: Clear sk_user_data pointer on clone if tagged").
Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
---
v3 -> v4:
- change new subject
- fix diff content, which had been edited accidentally
v2 -> v3:
- use SK_USER_DATA_PSOCK instead of SK_USER_DATA_NOTPSOCK
to patch the bug
- refactor the code on assigning to sk_user_data field
in psock part
- refactor the code on getting and setting the flag
with sk_user_data field
v1 -> v2:
- add bit in PTRMASK to patch the bug
include/linux/skmsg.h | 2 +-
include/net/sock.h | 58 +++++++++++++++++++++++++++++++------------
net/core/skmsg.c | 3 ++-
3 files changed, 45 insertions(+), 18 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c5a2d6f50f25..81bfa1a33623 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -277,7 +277,7 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
static inline struct sk_psock *sk_psock(const struct sock *sk)
{
- return rcu_dereference_sk_user_data(sk);
+ return rcu_dereference_sk_user_data_psock(sk);
}
static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index 9fa54762e077..d010910d5879 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -545,14 +545,24 @@ enum sk_pacing {
SK_PACING_FQ = 2,
};
-/* Pointer stored in sk_user_data might not be suitable for copying
- * when cloning the socket. For instance, it can point to a reference
- * counted object. sk_user_data bottom bit is set if pointer must not
- * be copied.
+/* flag bits in sk_user_data
+ *
+ * SK_USER_DATA_NOCOPY - Pointer stored in sk_user_data might
+ * not be suitable for copying when cloning the socket.
+ * For instance, it can point to a reference counted object.
+ * sk_user_data bottom bit is set if pointer must not be copied.
+ *
+ * SK_USER_DATA_BPF - Managed by BPF
+ *
+ * SK_USER_DATA_PSOCK - Mark whether pointer stored in sk_user_data points
+ * to psock type. This bit should be set when sk_user_data is
+ * assigned to a psock object.
*/
#define SK_USER_DATA_NOCOPY 1UL
-#define SK_USER_DATA_BPF 2UL /* Managed by BPF */
-#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
+#define SK_USER_DATA_BPF 2UL
+#define SK_USER_DATA_PSOCK 4UL
+#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+ SK_USER_DATA_PSOCK)
/**
* sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
@@ -570,19 +580,35 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
void *__tmp = rcu_dereference(__sk_user_data((sk))); \
(void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK); \
})
-#define rcu_assign_sk_user_data(sk, ptr) \
+#define rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \
({ \
- uintptr_t __tmp = (uintptr_t)(ptr); \
- WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \
- rcu_assign_pointer(__sk_user_data((sk)), __tmp); \
-})
-#define rcu_assign_sk_user_data_nocopy(sk, ptr) \
-({ \
- uintptr_t __tmp = (uintptr_t)(ptr); \
- WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \
+ uintptr_t __tmp1 = (uintptr_t)(ptr), \
+ __tmp2 = (uintptr_t)(flags); \
+ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \
+ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \
rcu_assign_pointer(__sk_user_data((sk)), \
- __tmp | SK_USER_DATA_NOCOPY); \
+ __tmp1 | __tmp2); \
})
+#define rcu_assign_sk_user_data(sk, ptr) \
+ rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
+
+/**
+ * rcu_dereference_sk_user_data_psock - return psock if sk_user_data
+ * points to the psock type(SK_USER_DATA_PSOCK flag is set), otherwise
+ * return NULL
+ *
+ * @sk: socket
+ */
+static inline
+struct sk_psock *rcu_dereference_sk_user_data_psock(const struct sock *sk)
+{
+ uintptr_t __tmp = (uintptr_t)rcu_dereference(__sk_user_data((sk)));
+
+ if (__tmp & SK_USER_DATA_PSOCK)
+ return (struct sk_psock *)(__tmp & SK_USER_DATA_PTRMASK);
+
+ return NULL;
+}
static inline
struct net *sock_net(const struct sock *sk)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index b0fcd0200e84..d174897dbb4b 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -735,7 +735,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
refcount_set(&psock->refcnt, 1);
- rcu_assign_sk_user_data_nocopy(sk, psock);
+ rcu_assign_sk_user_data_with_flags(sk, psock, SK_USER_DATA_NOCOPY |
+ SK_USER_DATA_PSOCK);
sock_hold(sk);
out:
--
2.25.1
On Wed, 3 Aug 2022 20:41:22 +0800 Hawkins Jiawei wrote:
> -/* Pointer stored in sk_user_data might not be suitable for copying
> - * when cloning the socket. For instance, it can point to a reference
> - * counted object. sk_user_data bottom bit is set if pointer must not
> - * be copied.
> +/* flag bits in sk_user_data
> + *
> + * SK_USER_DATA_NOCOPY - Pointer stored in sk_user_data might
> + * not be suitable for copying when cloning the socket.
> + * For instance, it can point to a reference counted object.
> + * sk_user_data bottom bit is set if pointer must not be copied.
> + *
> + * SK_USER_DATA_BPF - Managed by BPF
I'd use this opportunity to add more info here, BPF is too general.
Maybe "Pointer is used by a BPF reuseport array"? Martin, WDYT?
> + * SK_USER_DATA_PSOCK - Mark whether pointer stored in sk_user_data points
> + * to psock type. This bit should be set when sk_user_data is
> + * assigned to a psock object.
> +/**
> + * rcu_dereference_sk_user_data_psock - return psock if sk_user_data
> + * points to the psock type(SK_USER_DATA_PSOCK flag is set), otherwise
> + * return NULL
> + *
> + * @sk: socket
> + */
> +static inline
> +struct sk_psock *rcu_dereference_sk_user_data_psock(const struct sock *sk)
nit: the return type more commonly goes on the same line as "static
inline"
> +{
> + uintptr_t __tmp = (uintptr_t)rcu_dereference(__sk_user_data((sk)));
> +
> + if (__tmp & SK_USER_DATA_PSOCK)
> + return (struct sk_psock *)(__tmp & SK_USER_DATA_PTRMASK);
> +
> + return NULL;
> +}
As a follow up we can probably generalize this into
__rcu_dereference_sk_user_data_cond(sk, bit)
and make the psock just call that:
static inline struct sk_psock *
rcu_dereference_sk_user_data_psock(const struct sock *sk)
{
return __rcu_dereference_sk_user_data_cond(sk, SK_USER_DATA_PSOCK);
}
then reuseport can also benefit, maybe:
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index e2618fb5870e..ad5c447a690c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -21,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
- uintptr_t sk_user_data;
+ struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- sk_user_data = (uintptr_t)sk->sk_user_data;
- if (sk_user_data & SK_USER_DATA_BPF) {
- struct sock __rcu **socks;
-
- socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ socks = __rcu_dereference_sk_user_data_cond(sk, SK_USER_DATA_BPF);
+ if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
But that must be a separate patch, not part of this fix.
On Wed, Aug 03, 2022 at 08:14:13AM -0700, Jakub Kicinski wrote: > On Wed, 3 Aug 2022 20:41:22 +0800 Hawkins Jiawei wrote: > > -/* Pointer stored in sk_user_data might not be suitable for copying > > - * when cloning the socket. For instance, it can point to a reference > > - * counted object. sk_user_data bottom bit is set if pointer must not > > - * be copied. > > +/* flag bits in sk_user_data > > + * > > + * SK_USER_DATA_NOCOPY - Pointer stored in sk_user_data might > > + * not be suitable for copying when cloning the socket. > > + * For instance, it can point to a reference counted object. > > + * sk_user_data bottom bit is set if pointer must not be copied. > > + * > > + * SK_USER_DATA_BPF - Managed by BPF > > I'd use this opportunity to add more info here, BPF is too general. > Maybe "Pointer is used by a BPF reuseport array"? Martin, WDYT? SGTM. Thanks.
On Wed, 3 Aug 2022 at 23:37, Martin KaFai Lau <kafai@fb.com> wrote:
> > > + * SK_USER_DATA_BPF - Managed by BPF
> >
> > I'd use this opportunity to add more info here, BPF is too general.
> > Maybe "Pointer is used by a BPF reuseport array"? Martin, WDYT?
> SGTM. Thanks.
OK. It seems that this flag is introduced from
c9a368f1c0fb ("bpf: net: Avoid incorrect bpf_sk_reuseport_detach call").
I will search for more detailed description in this commit.
> >
> > > +/**
> > > + * rcu_dereference_sk_user_data_psock - return psock if sk_user_data
> > > + * points to the psock type(SK_USER_DATA_PSOCK flag is set), otherwise
> > > + * return NULL
> > > + *
> > > + * @sk: socket
> > > + */
> > > +static inline
> > > +struct sk_psock *rcu_dereference_sk_user_data_psock(const struct sock *sk)
> >
> > nit: the return type more commonly goes on the same line as "static
> > inline"
Ok. I will correct it.
> >
> > > +{
> > > + uintptr_t __tmp = (uintptr_t)rcu_dereference(__sk_user_data((sk)));
> > > +
> > > + if (__tmp & SK_USER_DATA_PSOCK)
> > > + return (struct sk_psock *)(__tmp & SK_USER_DATA_PTRMASK);
> > > +
> > > + return NULL;
> > > +}
> >
> > As a follow up we can probably generalize this into
> > __rcu_dereference_sk_user_data_cond(sk, bit)
> >
> > and make the psock just call that:
> >
> > static inline struct sk_psock *
> > rcu_dereference_sk_user_data_psock(const struct sock *sk)
> > {
> > return __rcu_dereference_sk_user_data_cond(sk, SK_USER_DATA_PSOCK);
> > }
Yes. I will refactor it in this way.
> >
> > diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
> > index e2618fb5870e..ad5c447a690c 100644
> > --- a/kernel/bpf/reuseport_array.c
> > +++ b/kernel/bpf/reuseport_array.c
> > @@ -21,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
> > /* The caller must hold the reuseport_lock */
> > void bpf_sk_reuseport_detach(struct sock *sk)
> > {
> > - uintptr_t sk_user_data;
> > + struct sock __rcu **socks;
> >
> > write_lock_bh(&sk->sk_callback_lock);
> > - sk_user_data = (uintptr_t)sk->sk_user_data;
> > - if (sk_user_data & SK_USER_DATA_BPF) {
> > - struct sock __rcu **socks;
> > -
> > - socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
> > + socks = __rcu_dereference_sk_user_data_cond(sk, SK_USER_DATA_BPF);
> > + if (socks) {
> > WRITE_ONCE(sk->sk_user_data, NULL);
> > /*
> > * Do not move this NULL assignment outside of
> >
> >
> > But that must be a separate patch, not part of this fix.
I wonder if it is proper to gather these together in a patchset, for
they are all about flags in sk_user_data, maybe:
[PATCH v5 0/2] net: enhancement to flags in sk_user_data field
- introduce the patchset
[PATCH v5 1/2] net: clean up code for flags in sk_user_data field
- refactor the things in include/linux/skmsg.h and
include/net/sock.h
- refactor the flags's usage by other code, such as
net/core/skmsg.c and kernel/bpf/reuseport_array.c
[PATCH v5 2/2] net: fix refcount bug in sk_psock_get (2)
- add SK_USER_DATA_PSOCK flag in sk_user_data field
On Thu, 4 Aug 2022 11:05:15 +0800 Hawkins Jiawei wrote: > I wonder if it is proper to gather these together in a patchset, for > they are all about flags in sk_user_data, maybe: > > [PATCH v5 0/2] net: enhancement to flags in sk_user_data field > - introduce the patchset > > [PATCH v5 1/2] net: clean up code for flags in sk_user_data field > - refactor the things in include/linux/skmsg.h and > include/net/sock.h > - refactor the flags's usage by other code, such as > net/core/skmsg.c and kernel/bpf/reuseport_array.c > > [PATCH v5 2/2] net: fix refcount bug in sk_psock_get (2) > - add SK_USER_DATA_PSOCK flag in sk_user_data field Usually the fix comes first, because it needs to be backported to stable, and we don't want to have to pull extra commits into stable and risk regressions in code which was not directly involved in the bug.
On Thu, 4 Aug 2022 at 23:29, Jakub Kicinski <kuba@kernel.org> wrote: > > On Thu, 4 Aug 2022 11:05:15 +0800 Hawkins Jiawei wrote: > > I wonder if it is proper to gather these together in a patchset, for > > they are all about flags in sk_user_data, maybe: > > > > [PATCH v5 0/2] net: enhancement to flags in sk_user_data field > > - introduce the patchset > > > > [PATCH v5 1/2] net: clean up code for flags in sk_user_data field > > - refactor the things in include/linux/skmsg.h and > > include/net/sock.h > > - refactor the flags's usage by other code, such as > > net/core/skmsg.c and kernel/bpf/reuseport_array.c > > > > [PATCH v5 2/2] net: fix refcount bug in sk_psock_get (2) > > - add SK_USER_DATA_PSOCK flag in sk_user_data field > > Usually the fix comes first, because it needs to be backported to > stable, and we don't want to have to pull extra commits into stable > and risk regressions in code which was not directly involved in the bug. Ok, I got it. Thanks for the explanation.
© 2016 - 2026 Red Hat, Inc.