From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
The "hung_task" shows a long-time uninterruptible slept task, but most
often, it's blocked on a mutex acquired by another task. Without
dumping such a task, investigating the root cause of the hung task
problem is very difficult.
Fortunately CONFIG_DEBUG_MUTEXES=y allows us to identify the mutex
blocking the task. And the mutex has "owner" information, which can
be used to find the owner task and dump it with hung tasks.
With this change, the hung task shows blocker task's info like below;
INFO: task cat:113 blocked for more than 122 seconds.
Not tainted 6.14.0-rc3-00002-g6afe972e1b9b #152
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:cat state:D stack:13432 pid:113 tgid:113 ppid:103 task_flags:0x400100 flags:0x00000002
Call Trace:
<TASK>
__schedule+0x731/0x960
? schedule_preempt_disabled+0x54/0xa0
schedule+0xb7/0x140
? __mutex_lock+0x51d/0xa50
? __mutex_lock+0x51d/0xa50
schedule_preempt_disabled+0x54/0xa0
__mutex_lock+0x51d/0xa50
? current_time+0x3a/0x120
read_dummy+0x23/0x70
full_proxy_read+0x6a/0xc0
vfs_read+0xc2/0x340
? __pfx_direct_file_splice_eof+0x10/0x10
? do_sendfile+0x1bd/0x2e0
ksys_read+0x76/0xe0
do_syscall_64+0xe3/0x1c0
? exc_page_fault+0xa9/0x1d0
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x4840cd
RSP: 002b:00007ffe632b76c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd
RDX: 0000000000001000 RSI: 00007ffe632b7710 RDI: 0000000000000003
RBP: 00007ffe632b7710 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000
R13: 000000003a8b63a0 R14: 0000000000000001 R15: ffffffffffffffff
</TASK>
INFO: task cat:113 is blocked on a mutex owned by task cat:112.
task:cat state:S stack:13432 pid:112 tgid:112 ppid:103 task_flags:0x400100 flags:0x00000002
Call Trace:
<TASK>
__schedule+0x731/0x960
? schedule_timeout+0xa8/0x120
schedule+0xb7/0x140
schedule_timeout+0xa8/0x120
? __pfx_process_timeout+0x10/0x10
msleep_interruptible+0x3e/0x60
read_dummy+0x2d/0x70
full_proxy_read+0x6a/0xc0
vfs_read+0xc2/0x340
? __pfx_direct_file_splice_eof+0x10/0x10
? do_sendfile+0x1bd/0x2e0
ksys_read+0x76/0xe0
do_syscall_64+0xe3/0x1c0
? exc_page_fault+0xa9/0x1d0
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x4840cd
RSP: 002b:00007ffd69513748 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd
RDX: 0000000000001000 RSI: 00007ffd69513790 RDI: 0000000000000003
RBP: 00007ffd69513790 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000
R13: 0000000029d8d3a0 R14: 0000000000000001 R15: ffffffffffffffff
</TASK>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
kernel/hung_task.c | 38 ++++++++++++++++++++++++++++++++++++++
kernel/locking/mutex-debug.c | 1 +
kernel/locking/mutex.c | 9 +++++++++
kernel/locking/mutex.h | 6 ++++++
4 files changed, 54 insertions(+)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 04efa7a6e69b..d1ce69504090 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -25,6 +25,8 @@
#include <trace/events/sched.h>
+#include "locking/mutex.h"
+
/*
* The number of tasks checked:
*/
@@ -93,6 +95,41 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+
+#ifdef CONFIG_DEBUG_MUTEXES
+static void debug_show_blocker(struct task_struct *task)
+{
+ struct task_struct *g, *t;
+ unsigned long owner;
+ struct mutex *lock;
+
+ if (!task->blocked_on)
+ return;
+
+ lock = task->blocked_on->mutex;
+ if (unlikely(!lock)) {
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the mutex is not found.\n",
+ task->comm, task->pid);
+ return;
+ }
+ owner = debug_mutex_get_owner(lock);
+ if (likely(owner)) {
+ /* Ensure the owner information is correct. */
+ for_each_process_thread(g, t)
+ if ((unsigned long)t == owner) {
+ pr_err("INFO: task %s:%d is blocked on a mutex owned by task %s:%d.\n",
+ task->comm, task->pid, t->comm, t->pid);
+ sched_show_task(t);
+ return;
+ }
+ }
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
+ task->comm, task->pid);
+}
+#else
+#define debug_show_blocker(t) do {} while (0)
+#endif
+
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -152,6 +189,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
+ debug_show_blocker(t);
hung_task_show_lock = true;
if (sysctl_hung_task_all_cpu_backtrace)
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 6e6f6071cfa2..94282c760cee 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -30,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
+ waiter->mutex = lock;
INIT_LIST_HEAD(&waiter->list);
waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index b36f23de48f1..db093cc0a220 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -72,6 +72,15 @@ static inline unsigned long __owner_flags(unsigned long owner)
return owner & MUTEX_FLAGS;
}
+#ifdef CONFIG_DEBUG_MUTEXES
+/* Do not use the return value as a pointer directly. */
+unsigned long debug_mutex_get_owner(struct mutex *lock)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ return (unsigned long)__owner_task(owner);
+}
+#endif
/*
* Returns: __mutex_owner(lock) on failure or NULL on success.
*/
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index cbff35b9b7ae..0d884e6b3a66 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,6 +16,7 @@ struct mutex_waiter {
struct task_struct *task;
struct ww_acquire_ctx *ww_ctx;
#ifdef CONFIG_DEBUG_MUTEXES
+ struct mutex *mutex;
void *magic;
#endif
};
@@ -61,6 +62,7 @@ extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *w
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
+extern unsigned long debug_mutex_get_owner(struct mutex *lock);
#else /* CONFIG_DEBUG_MUTEXES */
# define debug_mutex_lock_common(lock, waiter) do { } while (0)
# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
@@ -69,4 +71,8 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_unlock(lock) do { } while (0)
# define debug_mutex_init(lock, name, key) do { } while (0)
+static inline unsigned long debug_mutex_get_owner(struct mutex *lock)
+{
+ return 0;
+}
#endif /* !CONFIG_DEBUG_MUTEXES */
On Wed, 19 Feb 2025 22:00:49 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
> The "hung_task" shows a long-time uninterruptible slept task, but most
> often, it's blocked on a mutex acquired by another task. Without
> dumping such a task, investigating the root cause of the hung task
> problem is very difficult.
>
> Fortunately CONFIG_DEBUG_MUTEXES=y allows us to identify the mutex
> blocking the task. And the mutex has "owner" information, which can
> be used to find the owner task and dump it with hung tasks.
>
> With this change, the hung task shows blocker task's info like below;
>
We've hit bugs like this in the field a few times, and it was very
difficult to debug. Something like this would have made our lives much
easier!
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
> kernel/hung_task.c | 38 ++++++++++++++++++++++++++++++++++++++
> kernel/locking/mutex-debug.c | 1 +
> kernel/locking/mutex.c | 9 +++++++++
> kernel/locking/mutex.h | 6 ++++++
> 4 files changed, 54 insertions(+)
>
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index 04efa7a6e69b..d1ce69504090 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -25,6 +25,8 @@
>
> #include <trace/events/sched.h>
>
> +#include "locking/mutex.h"
> +
> /*
> * The number of tasks checked:
> */
> @@ -93,6 +95,41 @@ static struct notifier_block panic_block = {
> .notifier_call = hung_task_panic,
> };
>
> +
> +#ifdef CONFIG_DEBUG_MUTEXES
> +static void debug_show_blocker(struct task_struct *task)
> +{
> + struct task_struct *g, *t;
> + unsigned long owner;
> + struct mutex *lock;
> +
> + if (!task->blocked_on)
> + return;
> +
> + lock = task->blocked_on->mutex;
This is a catch 22. To look at the task's blocked_on, we need the
lock->wait_lock held, otherwise this could be an issue. But to get that
lock, we need to look at the task's blocked_on field! As this can race.
Another thing is that the waiter is on the task's stack. Perhaps we need to
move this into sched/core.c and be able to lock the task's rq. Because even
something like:
waiter = READ_ONCE(task->blocked_on);
May be garbage if the task were to suddenly wake up and run.
Now if we were able to lock the task's rq, which would prevent it from
being woken up, then the blocked_on field would not be at risk of being
corrupted.
-- Steve
> + if (unlikely(!lock)) {
> + pr_err("INFO: task %s:%d is blocked on a mutex, but the mutex is not found.\n",
> + task->comm, task->pid);
> + return;
> + }
> + owner = debug_mutex_get_owner(lock);
> + if (likely(owner)) {
> + /* Ensure the owner information is correct. */
> + for_each_process_thread(g, t)
> + if ((unsigned long)t == owner) {
> + pr_err("INFO: task %s:%d is blocked on a mutex owned by task %s:%d.\n",
> + task->comm, task->pid, t->comm, t->pid);
> + sched_show_task(t);
> + return;
> + }
> + }
> + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
> + task->comm, task->pid);
> +}
> +#else
> +#define debug_show_blocker(t) do {} while (0)
> +#endif
On 2/19/25 11:23 AM, Steven Rostedt wrote:
> On Wed, 19 Feb 2025 22:00:49 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
>
>> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>>
>> The "hung_task" shows a long-time uninterruptible slept task, but most
>> often, it's blocked on a mutex acquired by another task. Without
>> dumping such a task, investigating the root cause of the hung task
>> problem is very difficult.
>>
>> Fortunately CONFIG_DEBUG_MUTEXES=y allows us to identify the mutex
>> blocking the task. And the mutex has "owner" information, which can
>> be used to find the owner task and dump it with hung tasks.
>>
>> With this change, the hung task shows blocker task's info like below;
>>
> We've hit bugs like this in the field a few times, and it was very
> difficult to debug. Something like this would have made our lives much
> easier!
I agree that it will be a useful feature.
>> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>> ---
>> kernel/hung_task.c | 38 ++++++++++++++++++++++++++++++++++++++
>> kernel/locking/mutex-debug.c | 1 +
>> kernel/locking/mutex.c | 9 +++++++++
>> kernel/locking/mutex.h | 6 ++++++
>> 4 files changed, 54 insertions(+)
>>
>> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
>> index 04efa7a6e69b..d1ce69504090 100644
>> --- a/kernel/hung_task.c
>> +++ b/kernel/hung_task.c
>> @@ -25,6 +25,8 @@
>>
>> #include <trace/events/sched.h>
>>
>> +#include "locking/mutex.h"
>> +
>> /*
>> * The number of tasks checked:
>> */
>> @@ -93,6 +95,41 @@ static struct notifier_block panic_block = {
>> .notifier_call = hung_task_panic,
>> };
>>
>> +
>> +#ifdef CONFIG_DEBUG_MUTEXES
>> +static void debug_show_blocker(struct task_struct *task)
>> +{
>> + struct task_struct *g, *t;
>> + unsigned long owner;
>> + struct mutex *lock;
>> +
>> + if (!task->blocked_on)
>> + return;
>> +
>> + lock = task->blocked_on->mutex;
> This is a catch 22. To look at the task's blocked_on, we need the
> lock->wait_lock held, otherwise this could be an issue. But to get that
> lock, we need to look at the task's blocked_on field! As this can race.
>
> Another thing is that the waiter is on the task's stack. Perhaps we need to
> move this into sched/core.c and be able to lock the task's rq. Because even
> something like:
>
> waiter = READ_ONCE(task->blocked_on);
>
> May be garbage if the task were to suddenly wake up and run.
>
> Now if we were able to lock the task's rq, which would prevent it from
> being woken up, then the blocked_on field would not be at risk of being
> corrupted.
It is tricky to access the mutex_waiter structure which is allocated
from stack. So another way to work around this issue is to add a new
blocked_on_mutex field in task_struct to directly point to relevant
mutex. Yes, that increase the size of task_struct by 8 bytes, but it is
a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access
this field, we don't need to take lock, though taking the wait_lock may
still be needed to examine other information inside the mutex.
Cheers,
Longman
On Wed, 19 Feb 2025 15:18:57 -0500 Waiman Long <llong@redhat.com> wrote: > It is tricky to access the mutex_waiter structure which is allocated > from stack. So another way to work around this issue is to add a new > blocked_on_mutex field in task_struct to directly point to relevant > mutex. Yes, that increase the size of task_struct by 8 bytes, but it is > a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access And it's been on my TODO list for some time to try to make that structure smaller again :-/ > this field, we don't need to take lock, though taking the wait_lock may > still be needed to examine other information inside the mutex. But perhaps if we add a new config option for this feature, we could just add the lock that a task is blocked on before it goes to sleep and reference that instead. That would be easier than trying to play games getting the lock owner from the blocked_on field. -- Steve
On Wed, 19 Feb 2025 15:24:35 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 19 Feb 2025 15:18:57 -0500
> Waiman Long <llong@redhat.com> wrote:
>
> > It is tricky to access the mutex_waiter structure which is allocated
> > from stack. So another way to work around this issue is to add a new
> > blocked_on_mutex field in task_struct to directly point to relevant
> > mutex. Yes, that increase the size of task_struct by 8 bytes, but it is
> > a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access
>
> And it's been on my TODO list for some time to try to make that structure
> smaller again :-/
>
> > this field, we don't need to take lock, though taking the wait_lock may
> > still be needed to examine other information inside the mutex.
>
> But perhaps if we add a new config option for this feature, we could just
> add the lock that a task is blocked on before it goes to sleep and
> reference that instead. That would be easier than trying to play games
> getting the lock owner from the blocked_on field.
So something like this?
unsigned int block_flags;
union {
struct mutex *mutex;
struct rwsem +rwsem;
struct rtmutex *rtmutex;
} blocked_on;
enum {
BLOCKED_ON_MUTEX;
BLOCKED_ON_RWSEM;
BLOCKED_ON_RTMUTEX;
BLOCKED_ON_IO;
} block_reason;
For the safety, we may anyway lock the task anyway, but that is the
same as stacktrace.
Thank you,
>
> -- Steve
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On (25/02/20 08:09), Masami Hiramatsu wrote:
> So something like this?
>
> unsigned int block_flags;
> union {
> struct mutex *mutex;
> struct rwsem +rwsem;
> struct rtmutex *rtmutex;
> } blocked_on;
>
> enum {
> BLOCKED_ON_MUTEX;
> BLOCKED_ON_RWSEM;
> BLOCKED_ON_RTMUTEX;
> BLOCKED_ON_IO;
> } block_reason;
I totally like this and always wanted to have something simlar,
something for all "sleepable" synchronization primitives, lightweight
enough (memory and CPU usage wise) to run on consumer devices. I was
thinking of a rhashtable where each entry represents "sleepable"
primitive with a "owner" pointer and a list of "blocked on" tasks.
But I'm sure you'll have a better idea.
If I may add a couple of "wishes", can we also add:
- completions (so that things like wait_for_completion and
synchronize srcu get covered)
- wait on bit (so that things like lock_buffer and so on get covered)
On Thu, 20 Feb 2025 13:19:46 +0900
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> On (25/02/19 22:49), Waiman Long wrote:
> > On 2/19/25 9:45 PM, Sergey Senozhatsky wrote:
> > > On (25/02/20 08:09), Masami Hiramatsu wrote:
> > > > So something like this?
> > > >
> > > > unsigned int block_flags;
> > > > union {
> > > > struct mutex *mutex;
> > > > struct rwsem +rwsem;
> > > > struct rtmutex *rtmutex;
> > > > } blocked_on;
> > > >
> > > > enum {
> > > > BLOCKED_ON_MUTEX;
> > > > BLOCKED_ON_RWSEM;
> > > > BLOCKED_ON_RTMUTEX;
> > > > BLOCKED_ON_IO;
> > > > } block_reason;
> > > I totally like this and always wanted to have something simlar,
> > > something for all "sleepable" synchronization primitives, lightweight
> > > enough (memory and CPU usage wise) to run on consumer devices. I was
> > > thinking of a rhashtable where each entry represents "sleepable"
> > > primitive with a "owner" pointer and a list of "blocked on" tasks.
> > > But I'm sure you'll have a better idea.
> > >
> > > If I may add a couple of "wishes", can we also add:
> > > - completions (so that things like wait_for_completion and
> > > synchronize srcu get covered)
> > > - wait on bit (so that things like lock_buffer and so on get covered)
> >
> > Bit lock doesn't have a owner field to track the owning task.
>
> Right, so that's why I was thinking about keeping it outside in
> a hashtable. A list of owners plus a list of blocked_on per "lock",
> be it a rwsem, or a mutex, or a bit.
Hmm, how can we sync the accesses of the hashtable? We may need
spinlocks for the hashtable or use RCU list. If we use the RCU,
we may need to allocate RCU object for each time when a lock
contention happens (and use kfree_rcu() for each object).
Maybe it is better using a spinlock for each hashtable entry but
it still introduce overhead (need to check).
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2/19/25 9:45 PM, Sergey Senozhatsky wrote:
> On (25/02/20 08:09), Masami Hiramatsu wrote:
>> So something like this?
>>
>> unsigned int block_flags;
>> union {
>> struct mutex *mutex;
>> struct rwsem +rwsem;
>> struct rtmutex *rtmutex;
>> } blocked_on;
>>
>> enum {
>> BLOCKED_ON_MUTEX;
>> BLOCKED_ON_RWSEM;
>> BLOCKED_ON_RTMUTEX;
>> BLOCKED_ON_IO;
>> } block_reason;
> I totally like this and always wanted to have something simlar,
> something for all "sleepable" synchronization primitives, lightweight
> enough (memory and CPU usage wise) to run on consumer devices. I was
> thinking of a rhashtable where each entry represents "sleepable"
> primitive with a "owner" pointer and a list of "blocked on" tasks.
> But I'm sure you'll have a better idea.
>
> If I may add a couple of "wishes", can we also add:
> - completions (so that things like wait_for_completion and
> synchronize srcu get covered)
> - wait on bit (so that things like lock_buffer and so on get covered)
Bit lock doesn't have a owner field to track the owning task.
Cheers,
Longman
>
On 2/19/25 6:09 PM, Masami Hiramatsu (Google) wrote:
> On Wed, 19 Feb 2025 15:24:35 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Wed, 19 Feb 2025 15:18:57 -0500
>> Waiman Long <llong@redhat.com> wrote:
>>
>>> It is tricky to access the mutex_waiter structure which is allocated
>>> from stack. So another way to work around this issue is to add a new
>>> blocked_on_mutex field in task_struct to directly point to relevant
>>> mutex. Yes, that increase the size of task_struct by 8 bytes, but it is
>>> a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access
>> And it's been on my TODO list for some time to try to make that structure
>> smaller again :-/
>>
>>> this field, we don't need to take lock, though taking the wait_lock may
>>> still be needed to examine other information inside the mutex.
>> But perhaps if we add a new config option for this feature, we could just
>> add the lock that a task is blocked on before it goes to sleep and
>> reference that instead. That would be easier than trying to play games
>> getting the lock owner from the blocked_on field.
> So something like this?
>
> unsigned int block_flags;
> union {
> struct mutex *mutex;
> struct rwsem +rwsem;
> struct rtmutex *rtmutex;
> } blocked_on;
>
> enum {
> BLOCKED_ON_MUTEX;
> BLOCKED_ON_RWSEM;
> BLOCKED_ON_RTMUTEX;
> BLOCKED_ON_IO;
> } block_reason;
You should add one enum, e.g. BLOCKED_NONE, to represent the normal
state of not being blocked.
Cheers,
Longman
> For the safety, we may anyway lock the task anyway, but that is the
> same as stacktrace.
>
> Thank you,
>
>> -- Steve
>
On Thu, 20 Feb 2025 08:09:08 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> So something like this?
>
> unsigned int block_flags;
> union {
> struct mutex *mutex;
> struct rwsem +rwsem;
> struct rtmutex *rtmutex;
> } blocked_on;
>
> enum {
> BLOCKED_ON_MUTEX;
> BLOCKED_ON_RWSEM;
> BLOCKED_ON_RTMUTEX;
> BLOCKED_ON_IO;
> } block_reason;
>
> For the safety, we may anyway lock the task anyway, but that is the
> same as stacktrace.
Why not make it into a single entity?
struct blocked_on {
unsigned int flags;
union {
struct mutex *mutex;
struct rwsem *rwsem;
struct rtmutex *rtmutex;
} blocked_on;
};
-- Steve
On Wed, 19 Feb 2025 18:58:10 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 20 Feb 2025 08:09:08 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
> > So something like this?
> >
> > unsigned int block_flags;
> > union {
> > struct mutex *mutex;
> > struct rwsem +rwsem;
> > struct rtmutex *rtmutex;
> > } blocked_on;
> >
> > enum {
> > BLOCKED_ON_MUTEX;
> > BLOCKED_ON_RWSEM;
> > BLOCKED_ON_RTMUTEX;
> > BLOCKED_ON_IO;
> > } block_reason;
> >
> > For the safety, we may anyway lock the task anyway, but that is the
> > same as stacktrace.
>
> Why not make it into a single entity?
>
> struct blocked_on {
> unsigned int flags;
> union {
> struct mutex *mutex;
> struct rwsem *rwsem;
> struct rtmutex *rtmutex;
> } blocked_on;
> };
Yes, and we also merge current mutex_waiter too.
Thank you,
>
> -- Steve
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2/19/25 9:08 PM, Masami Hiramatsu (Google) wrote:
> On Wed, 19 Feb 2025 18:58:10 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Thu, 20 Feb 2025 08:09:08 +0900
>> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>>
>>> So something like this?
>>>
>>> unsigned int block_flags;
>>> union {
>>> struct mutex *mutex;
>>> struct rwsem +rwsem;
>>> struct rtmutex *rtmutex;
>>> } blocked_on;
>>>
>>> enum {
>>> BLOCKED_ON_MUTEX;
>>> BLOCKED_ON_RWSEM;
>>> BLOCKED_ON_RTMUTEX;
>>> BLOCKED_ON_IO;
>>> } block_reason;
>>>
>>> For the safety, we may anyway lock the task anyway, but that is the
>>> same as stacktrace.
>> Why not make it into a single entity?
>>
>> struct blocked_on {
>> unsigned int flags;
>> union {
>> struct mutex *mutex;
>> struct rwsem *rwsem;
>> struct rtmutex *rtmutex;
>> } blocked_on;
>> };
> Yes, and we also merge current mutex_waiter too.
I don't think we should merge mutex_waiter as it contains additional
fields that are not useful to others. We don't want enlarge the size of
task_struct unnecessarily.
Cheers,
Longman
On 2/19/25 3:24 PM, Steven Rostedt wrote: > On Wed, 19 Feb 2025 15:18:57 -0500 > Waiman Long <llong@redhat.com> wrote: > >> It is tricky to access the mutex_waiter structure which is allocated >> from stack. So another way to work around this issue is to add a new >> blocked_on_mutex field in task_struct to directly point to relevant >> mutex. Yes, that increase the size of task_struct by 8 bytes, but it is >> a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access > And it's been on my TODO list for some time to try to make that structure > smaller again :-/ > >> this field, we don't need to take lock, though taking the wait_lock may >> still be needed to examine other information inside the mutex. > But perhaps if we add a new config option for this feature, we could just > add the lock that a task is blocked on before it goes to sleep and > reference that instead. That would be easier than trying to play games > getting the lock owner from the blocked_on field. Yes, it could be a new config option. This will be a useful feature that I believe most distros will turn it on. Or we may just include that in the core code without any option. BTW, this field can also be shared by other sleeping locks like rwsem and rt_mutex as a task can only be blocked on one of them. We do need another type field to identify the type of the blocked lock. Cheers, Longman
On Wed, 19 Feb 2025 17:44:11 -0500 Waiman Long <llong@redhat.com> wrote: > > On 2/19/25 3:24 PM, Steven Rostedt wrote: > > On Wed, 19 Feb 2025 15:18:57 -0500 > > Waiman Long <llong@redhat.com> wrote: > > > >> It is tricky to access the mutex_waiter structure which is allocated > >> from stack. So another way to work around this issue is to add a new > >> blocked_on_mutex field in task_struct to directly point to relevant > >> mutex. Yes, that increase the size of task_struct by 8 bytes, but it is > >> a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access > > And it's been on my TODO list for some time to try to make that structure > > smaller again :-/ I agree to add the field, actually it was my first prototype :) > > > >> this field, we don't need to take lock, though taking the wait_lock may > >> still be needed to examine other information inside the mutex. Do we need to take it just for accessing owner, which is in an atomic? > > But perhaps if we add a new config option for this feature, we could just > > add the lock that a task is blocked on before it goes to sleep and > > reference that instead. That would be easier than trying to play games > > getting the lock owner from the blocked_on field. > > Yes, it could be a new config option. This will be a useful feature that > I believe most distros will turn it on. Or we may just include that in > the core code without any option. Do we need another option? or just extend DETECT_HUNG_TASK? Thanks, > > BTW, this field can also be shared by other sleeping locks like rwsem > and rt_mutex as a task can only be blocked on one of them. We do need > another type field to identify the type of the blocked lock. > > Cheers, > Longman > > -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2/19/25 5:56 PM, Masami Hiramatsu (Google) wrote: > On Wed, 19 Feb 2025 17:44:11 -0500 > Waiman Long <llong@redhat.com> wrote: > >> On 2/19/25 3:24 PM, Steven Rostedt wrote: >>> On Wed, 19 Feb 2025 15:18:57 -0500 >>> Waiman Long <llong@redhat.com> wrote: >>> >>>> It is tricky to access the mutex_waiter structure which is allocated >>>> from stack. So another way to work around this issue is to add a new >>>> blocked_on_mutex field in task_struct to directly point to relevant >>>> mutex. Yes, that increase the size of task_struct by 8 bytes, but it is >>>> a pretty large structure anyway. Using READ_ONCE/WRITE_ONCE() to access >>> And it's been on my TODO list for some time to try to make that structure >>> smaller again :-/ > I agree to add the field, actually it was my first prototype :) > >>>> this field, we don't need to take lock, though taking the wait_lock may >>>> still be needed to examine other information inside the mutex. > Do we need to take it just for accessing owner, which is in an atomic? Right. I forgot it is an atomic_long_t. In that case, no lock should be needed. Cheers, Longman
On Wed, 19 Feb 2025 20:36:13 -0500 Waiman Long <llong@redhat.com> wrote: > >>>> this field, we don't need to take lock, though taking the wait_lock may > >>>> still be needed to examine other information inside the mutex. > > Do we need to take it just for accessing owner, which is in an atomic? > > Right. I forgot it is an atomic_long_t. In that case, no lock should be > needed. Now if we have a two fields to read: block_flags (for the type of lock) and blocked_on (for the lock) We need a way to synchronize the two. What happens if we read the type, and the task wakes up and and then blocks on a different type of lock? Then the lock read from blocked_on could be a different type of lock than what is expected. -- Steve
On Wed, 19 Feb 2025 20:41:53 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 19 Feb 2025 20:36:13 -0500
> Waiman Long <llong@redhat.com> wrote:
>
>
> > >>>> this field, we don't need to take lock, though taking the wait_lock may
> > >>>> still be needed to examine other information inside the mutex.
> > > Do we need to take it just for accessing owner, which is in an atomic?
> >
> > Right. I forgot it is an atomic_long_t. In that case, no lock should be
> > needed.
>
> Now if we have a two fields to read:
>
> block_flags (for the type of lock) and blocked_on (for the lock)
>
> We need a way to synchronize the two. What happens if we read the type, and
> the task wakes up and and then blocks on a different type of lock?
Hmm, right.
Since the blocked_on must be NULL before setting flag, if we can ensure
the writing order so that blocked_flags is always updated before
blocked_on, may it be safe?
Or, (this may introduce more memory overhead) don't use union but
use different blocked_on_mutex, blocked_on_rwsem, etc.
Another idea is to make the owner offset same, like introducing
struct common_lock {
atomic_long_t owner;
};
But the problem is that rt_mutex does not use atomic for storing
the owner. (we can make it atomic using wrapper)
Thank you,
>
> Then the lock read from blocked_on could be a different type of lock than
> what is expected.
>
> -- Steve
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Thu, 20 Feb 2025 11:40:36 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> Hmm, right.
> Since the blocked_on must be NULL before setting flag, if we can ensure
> the writing order so that blocked_flags is always updated before
> blocked_on, may it be safe?
>
> Or, (this may introduce more memory overhead) don't use union but
> use different blocked_on_mutex, blocked_on_rwsem, etc.
>
> Another idea is to make the owner offset same, like introducing
>
> struct common_lock {
> atomic_long_t owner;
> };
>
> But the problem is that rt_mutex does not use atomic for storing
> the owner. (we can make it atomic using wrapper)
Either that, or add to the task_struct:
struct mutex *blocked_on_mutex;
struct rwsem *blocked_on_rwsem;
struct rtlock *blocked_on_rtlock;
And just have each type assign to its own type. Then you only need to look
at each one. But yeah, this adds even more bloat to task_struct.
:-/
-- Steve
On 2/19/25 10:11 PM, Steven Rostedt wrote:
> On Thu, 20 Feb 2025 11:40:36 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
>> Hmm, right.
>> Since the blocked_on must be NULL before setting flag, if we can ensure
>> the writing order so that blocked_flags is always updated before
>> blocked_on, may it be safe?
>>
>> Or, (this may introduce more memory overhead) don't use union but
>> use different blocked_on_mutex, blocked_on_rwsem, etc.
>>
>> Another idea is to make the owner offset same, like introducing
>>
>> struct common_lock {
>> atomic_long_t owner;
>> };
>>
>> But the problem is that rt_mutex does not use atomic for storing
>> the owner. (we can make it atomic using wrapper)
> Either that, or add to the task_struct:
>
> struct mutex *blocked_on_mutex;
> struct rwsem *blocked_on_rwsem;
> struct rtlock *blocked_on_rtlock;
>
> And just have each type assign to its own type. Then you only need to look
> at each one. But yeah, this adds even more bloat to task_struct.
>
> :-/
Another alternative is to encode the locking type into the lowest 2 bits
of the address and combined them into a single atomic_long_t data item.
Of course, we can only support 4 different types with this scheme.
Cheers,
Longman
>
> -- Steve
>
On Thu, 20 Feb 2025 08:13:31 -0500 Waiman Long <llong@redhat.com> wrote: > Another alternative is to encode the locking type into the lowest 2 bits > of the address and combined them into a single atomic_long_t data item. > Of course, we can only support 4 different types with this scheme. You could also use the MSB as they are either always all ones or all zeros depending on the architecture ;-) -- Steve
On 2/19/25 8:41 PM, Steven Rostedt wrote: > On Wed, 19 Feb 2025 20:36:13 -0500 > Waiman Long <llong@redhat.com> wrote: > > >>>>>> this field, we don't need to take lock, though taking the wait_lock may >>>>>> still be needed to examine other information inside the mutex. >>> Do we need to take it just for accessing owner, which is in an atomic? >> Right. I forgot it is an atomic_long_t. In that case, no lock should be >> needed. > Now if we have a two fields to read: > > block_flags (for the type of lock) and blocked_on (for the lock) > > We need a way to synchronize the two. What happens if we read the type, and > the task wakes up and and then blocks on a different type of lock? > > Then the lock read from blocked_on could be a different type of lock than > what is expected. That is different from reading the owner. In this case, we need to use smp_rmb()/wmb() to sequence the read and write operations unless it is guaranteed that they are in the same cacheline. One possible way is as follows: Writer - setting them: WRITE_ONCE(lock) smp_wmb() WRITE_ONCE(type) Clearing them: WRITE_ONCE(type, 0) smp_wmb() WRITE_ONCE(lock, NULL) Reader: READ_ONCE(type) again: smp_rmb() READ_ONCE(lock) smp_rmb() if (READ_ONCE(type) != type) goto again Cheers, Longman
On Wed, 19 Feb 2025 21:15:08 -0500
Waiman Long <llong@redhat.com> wrote:
>
> On 2/19/25 8:41 PM, Steven Rostedt wrote:
> > On Wed, 19 Feb 2025 20:36:13 -0500
> > Waiman Long <llong@redhat.com> wrote:
> >
> >
> >>>>>> this field, we don't need to take lock, though taking the wait_lock may
> >>>>>> still be needed to examine other information inside the mutex.
> >>> Do we need to take it just for accessing owner, which is in an atomic?
> >> Right. I forgot it is an atomic_long_t. In that case, no lock should be
> >> needed.
> > Now if we have a two fields to read:
> >
> > block_flags (for the type of lock) and blocked_on (for the lock)
> >
> > We need a way to synchronize the two. What happens if we read the type, and
> > the task wakes up and and then blocks on a different type of lock?
> >
> > Then the lock read from blocked_on could be a different type of lock than
> > what is expected.
>
> That is different from reading the owner. In this case, we need to use
> smp_rmb()/wmb() to sequence the read and write operations unless it is
> guaranteed that they are in the same cacheline. One possible way is as
> follows:
>
> Writer - setting them:
>
> WRITE_ONCE(lock)
> smp_wmb()
> WRITE_ONCE(type)
>
> Clearing them:
>
> WRITE_ONCE(type, 0)
> smp_wmb()
> WRITE_ONCE(lock, NULL)
>
> Reader:
>
> READ_ONCE(type)
> again:
> smp_rmb()
> READ_ONCE(lock)
> smp_rmb()
> if (READ_ONCE(type) != type)
> goto again
What about mutex-rwsem-mutex case?
mutex_lock(&lock1);
down_read(&lock2);
mutex_lock(&lock3);
The worst scenario is;
WRITE_ONCE(lock, &lock1)
smp_wmb()
WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX
WRITE_ONCE(type, 0)
smp_wmb()
WRITE_ONCE(lock, NULL)
WRITE_ONCE(lock, &lock2) READ_ONCE(lock) -> &lock2
smp_wmb()
WRITE_ONCE(type, RWSEM)
WRITE_ONCE(type, 0)
smp_wmb()
WRITE_ONCE(lock, NULL)
WRITE_ONCE(lock, &lock3)
smp_wmb()
WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX == MUTEX
WRITE_ONCE(type, 0)
smp_wmb()
WRITE_ONCE(lock, NULL)
"OK, lock2 is a MUTEX!"
So unless stopping the blocker task, we can not ensure this works.
But unless decode the lock, we don't know the blocker task.
Maybe we can run the hung_task in stop_machine()?
(or introduce common_lock)
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2/19/25 9:59 PM, Masami Hiramatsu (Google) wrote: > On Wed, 19 Feb 2025 21:15:08 -0500 > Waiman Long <llong@redhat.com> wrote: > >> On 2/19/25 8:41 PM, Steven Rostedt wrote: >>> On Wed, 19 Feb 2025 20:36:13 -0500 >>> Waiman Long <llong@redhat.com> wrote: >>> >>> >>>>>>>> this field, we don't need to take lock, though taking the wait_lock may >>>>>>>> still be needed to examine other information inside the mutex. >>>>> Do we need to take it just for accessing owner, which is in an atomic? >>>> Right. I forgot it is an atomic_long_t. In that case, no lock should be >>>> needed. >>> Now if we have a two fields to read: >>> >>> block_flags (for the type of lock) and blocked_on (for the lock) >>> >>> We need a way to synchronize the two. What happens if we read the type, and >>> the task wakes up and and then blocks on a different type of lock? >>> >>> Then the lock read from blocked_on could be a different type of lock than >>> what is expected. >> That is different from reading the owner. In this case, we need to use >> smp_rmb()/wmb() to sequence the read and write operations unless it is >> guaranteed that they are in the same cacheline. One possible way is as >> follows: >> >> Writer - setting them: >> >> WRITE_ONCE(lock) >> smp_wmb() >> WRITE_ONCE(type) >> >> Clearing them: >> >> WRITE_ONCE(type, 0) >> smp_wmb() >> WRITE_ONCE(lock, NULL) >> >> Reader: >> >> READ_ONCE(type) >> again: >> smp_rmb() >> READ_ONCE(lock) >> smp_rmb() >> if (READ_ONCE(type) != type) >> goto again > What about mutex-rwsem-mutex case? > > mutex_lock(&lock1); > down_read(&lock2); > mutex_lock(&lock3); > > The worst scenario is; > > WRITE_ONCE(lock, &lock1) > smp_wmb() > WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX > WRITE_ONCE(type, 0) > smp_wmb() > WRITE_ONCE(lock, NULL) > WRITE_ONCE(lock, &lock2) READ_ONCE(lock) -> &lock2 > smp_wmb() > WRITE_ONCE(type, RWSEM) > WRITE_ONCE(type, 0) > smp_wmb() > WRITE_ONCE(lock, NULL) > WRITE_ONCE(lock, &lock3) > smp_wmb() > WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX == MUTEX > WRITE_ONCE(type, 0) > smp_wmb() > WRITE_ONCE(lock, NULL) > > "OK, lock2 is a MUTEX!" > > So unless stopping the blocker task, we can not ensure this works. > But unless decode the lock, we don't know the blocker task. That could only happen if the reader can get interrupted/preempted for a long time. In that case, we may need to reread the lock again to be sure that they are stable. Cheers, Longman > > Maybe we can run the hung_task in stop_machine()? > (or introduce common_lock) > > Thank you, >
On Wed, 19 Feb 2025 22:37:04 -0500
Waiman Long <llong@redhat.com> wrote:
> On 2/19/25 9:59 PM, Masami Hiramatsu (Google) wrote:
> > On Wed, 19 Feb 2025 21:15:08 -0500
> > Waiman Long <llong@redhat.com> wrote:
> >
> >> On 2/19/25 8:41 PM, Steven Rostedt wrote:
> >>> On Wed, 19 Feb 2025 20:36:13 -0500
> >>> Waiman Long <llong@redhat.com> wrote:
> >>>
> >>>
> >>>>>>>> this field, we don't need to take lock, though taking the wait_lock may
> >>>>>>>> still be needed to examine other information inside the mutex.
> >>>>> Do we need to take it just for accessing owner, which is in an atomic?
> >>>> Right. I forgot it is an atomic_long_t. In that case, no lock should be
> >>>> needed.
> >>> Now if we have a two fields to read:
> >>>
> >>> block_flags (for the type of lock) and blocked_on (for the lock)
> >>>
> >>> We need a way to synchronize the two. What happens if we read the type, and
> >>> the task wakes up and and then blocks on a different type of lock?
> >>>
> >>> Then the lock read from blocked_on could be a different type of lock than
> >>> what is expected.
> >> That is different from reading the owner. In this case, we need to use
> >> smp_rmb()/wmb() to sequence the read and write operations unless it is
> >> guaranteed that they are in the same cacheline. One possible way is as
> >> follows:
> >>
> >> Writer - setting them:
> >>
> >> WRITE_ONCE(lock)
> >> smp_wmb()
> >> WRITE_ONCE(type)
> >>
> >> Clearing them:
> >>
> >> WRITE_ONCE(type, 0)
> >> smp_wmb()
> >> WRITE_ONCE(lock, NULL)
> >>
> >> Reader:
> >>
> >> READ_ONCE(type)
> >> again:
> >> smp_rmb()
> >> READ_ONCE(lock)
> >> smp_rmb()
> >> if (READ_ONCE(type) != type)
> >> goto again
> > What about mutex-rwsem-mutex case?
> >
> > mutex_lock(&lock1);
> > down_read(&lock2);
> > mutex_lock(&lock3);
> >
> > The worst scenario is;
> >
> > WRITE_ONCE(lock, &lock1)
> > smp_wmb()
> > WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX
> > WRITE_ONCE(type, 0)
> > smp_wmb()
> > WRITE_ONCE(lock, NULL)
> > WRITE_ONCE(lock, &lock2) READ_ONCE(lock) -> &lock2
> > smp_wmb()
> > WRITE_ONCE(type, RWSEM)
> > WRITE_ONCE(type, 0)
> > smp_wmb()
> > WRITE_ONCE(lock, NULL)
> > WRITE_ONCE(lock, &lock3)
> > smp_wmb()
> > WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX == MUTEX
> > WRITE_ONCE(type, 0)
> > smp_wmb()
> > WRITE_ONCE(lock, NULL)
> >
> > "OK, lock2 is a MUTEX!"
> >
> > So unless stopping the blocker task, we can not ensure this works.
> > But unless decode the lock, we don't know the blocker task.
>
> That could only happen if the reader can get interrupted/preempted for a
> long time. In that case, we may need to reread the lock again to be sure
> that they are stable.
Hm, actually read side should run under rcu read locked, so only interrupt
matters. So I think this rarely happens.
BTW, we don't need the lock address itself, but we need to know who is the
owner. Maybe we can point the address of atomic_long_t?
struct task_struct {
atomic_long_t *blocked_on_owner;
};
The problem is that mutex and rwsem are OK, but rt_mutex uses task_struct *.
Maybe we can use atomic_long_t in rt_mutex too if the new Kconfig is enabled.
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2/20/25 4:29 AM, Masami Hiramatsu (Google) wrote:
> On Wed, 19 Feb 2025 22:37:04 -0500
> Waiman Long <llong@redhat.com> wrote:
>
>> On 2/19/25 9:59 PM, Masami Hiramatsu (Google) wrote:
>>> On Wed, 19 Feb 2025 21:15:08 -0500
>>> Waiman Long <llong@redhat.com> wrote:
>>>
>>>> On 2/19/25 8:41 PM, Steven Rostedt wrote:
>>>>> On Wed, 19 Feb 2025 20:36:13 -0500
>>>>> Waiman Long <llong@redhat.com> wrote:
>>>>>
>>>>>
>>>>>>>>>> this field, we don't need to take lock, though taking the wait_lock may
>>>>>>>>>> still be needed to examine other information inside the mutex.
>>>>>>> Do we need to take it just for accessing owner, which is in an atomic?
>>>>>> Right. I forgot it is an atomic_long_t. In that case, no lock should be
>>>>>> needed.
>>>>> Now if we have a two fields to read:
>>>>>
>>>>> block_flags (for the type of lock) and blocked_on (for the lock)
>>>>>
>>>>> We need a way to synchronize the two. What happens if we read the type, and
>>>>> the task wakes up and and then blocks on a different type of lock?
>>>>>
>>>>> Then the lock read from blocked_on could be a different type of lock than
>>>>> what is expected.
>>>> That is different from reading the owner. In this case, we need to use
>>>> smp_rmb()/wmb() to sequence the read and write operations unless it is
>>>> guaranteed that they are in the same cacheline. One possible way is as
>>>> follows:
>>>>
>>>> Writer - setting them:
>>>>
>>>> WRITE_ONCE(lock)
>>>> smp_wmb()
>>>> WRITE_ONCE(type)
>>>>
>>>> Clearing them:
>>>>
>>>> WRITE_ONCE(type, 0)
>>>> smp_wmb()
>>>> WRITE_ONCE(lock, NULL)
>>>>
>>>> Reader:
>>>>
>>>> READ_ONCE(type)
>>>> again:
>>>> smp_rmb()
>>>> READ_ONCE(lock)
>>>> smp_rmb()
>>>> if (READ_ONCE(type) != type)
>>>> goto again
>>> What about mutex-rwsem-mutex case?
>>>
>>> mutex_lock(&lock1);
>>> down_read(&lock2);
>>> mutex_lock(&lock3);
>>>
>>> The worst scenario is;
>>>
>>> WRITE_ONCE(lock, &lock1)
>>> smp_wmb()
>>> WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX
>>> WRITE_ONCE(type, 0)
>>> smp_wmb()
>>> WRITE_ONCE(lock, NULL)
>>> WRITE_ONCE(lock, &lock2) READ_ONCE(lock) -> &lock2
>>> smp_wmb()
>>> WRITE_ONCE(type, RWSEM)
>>> WRITE_ONCE(type, 0)
>>> smp_wmb()
>>> WRITE_ONCE(lock, NULL)
>>> WRITE_ONCE(lock, &lock3)
>>> smp_wmb()
>>> WRITE_ONCE(type, MUTEX) READ_ONCE(type) -> MUTEX == MUTEX
>>> WRITE_ONCE(type, 0)
>>> smp_wmb()
>>> WRITE_ONCE(lock, NULL)
>>>
>>> "OK, lock2 is a MUTEX!"
>>>
>>> So unless stopping the blocker task, we can not ensure this works.
>>> But unless decode the lock, we don't know the blocker task.
>> That could only happen if the reader can get interrupted/preempted for a
>> long time. In that case, we may need to reread the lock again to be sure
>> that they are stable.
> Hm, actually read side should run under rcu read locked, so only interrupt
> matters. So I think this rarely happens.
>
> BTW, we don't need the lock address itself, but we need to know who is the
> owner. Maybe we can point the address of atomic_long_t?
>
> struct task_struct {
> atomic_long_t *blocked_on_owner;
> };
Yes, we can use a pointer to the owner field. However, the way that
owner field is encoded varies depends on the lock type. So we still need
to know what lock it is. As mentioned in the other thread, we can encode
the lock type into the lowest 2 bits of the pointer.
>
> The problem is that mutex and rwsem are OK, but rt_mutex uses task_struct *.
> Maybe we can use atomic_long_t in rt_mutex too if the new Kconfig is enabled.
It shouldn't be a problem to use atomic_long_t for the owner.
Cheers,
Longman
On Wed, 19 Feb 2025 21:15:08 -0500 Waiman Long <llong@redhat.com> wrote: > Writer - setting them: > > WRITE_ONCE(lock) > smp_wmb() > WRITE_ONCE(type) > > Clearing them: > > WRITE_ONCE(type, 0) > smp_wmb() > WRITE_ONCE(lock, NULL) > > Reader: > > READ_ONCE(type) > again: > smp_rmb() > READ_ONCE(lock) > smp_rmb() > if (READ_ONCE(type) != type) > goto again Do you really need the READ/WRITE_ONCE() with the memory barriers? From what I understand, the compiler can't even assume what it read is the same after passing a memory barrier like that. So there should be no reason it can reread the memory location after a barrier. -- Steve
On 2/19/25 9:27 PM, Steven Rostedt wrote: > On Wed, 19 Feb 2025 21:15:08 -0500 > Waiman Long <llong@redhat.com> wrote: > >> Writer - setting them: >> >> WRITE_ONCE(lock) >> smp_wmb() >> WRITE_ONCE(type) >> >> Clearing them: >> >> WRITE_ONCE(type, 0) >> smp_wmb() >> WRITE_ONCE(lock, NULL) >> >> Reader: >> >> READ_ONCE(type) >> again: >> smp_rmb() >> READ_ONCE(lock) >> smp_rmb() >> if (READ_ONCE(type) != type) >> goto again > Do you really need the READ/WRITE_ONCE() with the memory barriers? From > what I understand, the compiler can't even assume what it read is the same > after passing a memory barrier like that. So there should be no reason it > can reread the memory location after a barrier. You may be right. However, without using a READ_ONCE/WRITE_ONLY, a compiler can potentially break up the read/write into multiple smaller trunks resulting in partial data. So I will use them to be on the safe side. In this particular scenario above, we may not need to use them on type as we are going to reread it. I will keep them for lock though. Cheers, Longman > -- Steve >
On Thu, 20 Feb 2025 07:56:39 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > >> this field, we don't need to take lock, though taking the wait_lock may > > >> still be needed to examine other information inside the mutex. > > Do we need to take it just for accessing owner, which is in an atomic? Updating the task_struct would be in the same location as the blocked_on is anyway. I would make it into a wrapper function that is a nop when disabled. > > > > But perhaps if we add a new config option for this feature, we could just > > > add the lock that a task is blocked on before it goes to sleep and > > > reference that instead. That would be easier than trying to play games > > > getting the lock owner from the blocked_on field. > > > > Yes, it could be a new config option. This will be a useful feature that > > I believe most distros will turn it on. Or we may just include that in > > the core code without any option. > > Do we need another option? or just extend DETECT_HUNG_TASK? DETECT_HUNG_TASK is just that, for detecting hung tasks. This adds more information to that, which increases the size of the task_struct not to mention adds code in the mutex/rwsem handlers. I would definitely make it a separate config that may depend on DETECT_HUNG_TASK. -- Steve
On Wed, 19 Feb 2025 18:55:31 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > On Thu, 20 Feb 2025 07:56:39 +0900 > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > >> this field, we don't need to take lock, though taking the wait_lock may > > > >> still be needed to examine other information inside the mutex. > > > > Do we need to take it just for accessing owner, which is in an atomic? > > Updating the task_struct would be in the same location as the blocked_on is > anyway. I would make it into a wrapper function that is a nop when disabled. Should we make it depends on DEBUG_MUTEXES too? I think no. We can introduce a different kconfig and wrapper function which calls debug_mutex_*(). > > > > > > > But perhaps if we add a new config option for this feature, we could just > > > > add the lock that a task is blocked on before it goes to sleep and > > > > reference that instead. That would be easier than trying to play games > > > > getting the lock owner from the blocked_on field. > > > > > > Yes, it could be a new config option. This will be a useful feature that > > > I believe most distros will turn it on. Or we may just include that in > > > the core code without any option. > > > > Do we need another option? or just extend DETECT_HUNG_TASK? > > DETECT_HUNG_TASK is just that, for detecting hung tasks. This adds more > information to that, which increases the size of the task_struct not to > mention adds code in the mutex/rwsem handlers. > > I would definitely make it a separate config that may depend on > DETECT_HUNG_TASK. OK, what about CONFIG_TASK_BLOCKER? Thank you, > > -- Steve > -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Thu, 20 Feb 2025 11:07:07 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > On Wed, 19 Feb 2025 18:55:31 -0500 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > On Thu, 20 Feb 2025 07:56:39 +0900 > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > > > >> this field, we don't need to take lock, though taking the wait_lock may > > > > >> still be needed to examine other information inside the mutex. > > > > > > Do we need to take it just for accessing owner, which is in an atomic? > > > > Updating the task_struct would be in the same location as the blocked_on is > > anyway. I would make it into a wrapper function that is a nop when disabled. > > Should we make it depends on DEBUG_MUTEXES too? I think no. We can introduce > a different kconfig and wrapper function which calls debug_mutex_*(). No it should only depend on HUNG_TASKS. > > > > > > > > > > > But perhaps if we add a new config option for this feature, we could just > > > > > add the lock that a task is blocked on before it goes to sleep and > > > > > reference that instead. That would be easier than trying to play games > > > > > getting the lock owner from the blocked_on field. > > > > > > > > Yes, it could be a new config option. This will be a useful feature that > > > > I believe most distros will turn it on. Or we may just include that in > > > > the core code without any option. > > > > > > Do we need another option? or just extend DETECT_HUNG_TASK? > > > > DETECT_HUNG_TASK is just that, for detecting hung tasks. This adds more > > information to that, which increases the size of the task_struct not to > > mention adds code in the mutex/rwsem handlers. > > > > I would definitely make it a separate config that may depend on > > DETECT_HUNG_TASK. > > OK, what about CONFIG_TASK_BLOCKER? I'm fine with whatever color you would like the bikeshed to be ;-) -- Steve
On 2/19/25 9:07 PM, Masami Hiramatsu (Google) wrote: > On Wed, 19 Feb 2025 18:55:31 -0500 > Steven Rostedt <rostedt@goodmis.org> wrote: > >> On Thu, 20 Feb 2025 07:56:39 +0900 >> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: >> >>>>>> this field, we don't need to take lock, though taking the wait_lock may >>>>>> still be needed to examine other information inside the mutex. >>> Do we need to take it just for accessing owner, which is in an atomic? >> Updating the task_struct would be in the same location as the blocked_on is >> anyway. I would make it into a wrapper function that is a nop when disabled. > Should we make it depends on DEBUG_MUTEXES too? I think no. We can introduce > a different kconfig and wrapper function which calls debug_mutex_*(). No, I don't think so. In fact, the mutex debug code can make use of the new fields for additional checking. I believe DEBUG_MUTEXES should select the new option. Cheers, Longman
On Thu, Feb 20, 2025 at 7:55 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Thu, 20 Feb 2025 07:56:39 +0900 > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > >> this field, we don't need to take lock, though taking the wait_lock may > > > >> still be needed to examine other information inside the mutex. > > > > Do we need to take it just for accessing owner, which is in an atomic? > > Updating the task_struct would be in the same location as the blocked_on is > anyway. I would make it into a wrapper function that is a nop when disabled. > > > > > > > But perhaps if we add a new config option for this feature, we could just > > > > add the lock that a task is blocked on before it goes to sleep and > > > > reference that instead. That would be easier than trying to play games > > > > getting the lock owner from the blocked_on field. > > > > > > Yes, it could be a new config option. This will be a useful feature that > > > I believe most distros will turn it on. Or we may just include that in > > > the core code without any option. > > > > Do we need another option? or just extend DETECT_HUNG_TASK? > > DETECT_HUNG_TASK is just that, for detecting hung tasks. This adds more > information to that, which increases the size of the task_struct not to > mention adds code in the mutex/rwsem handlers. > > I would definitely make it a separate config that may depend on > DETECT_HUNG_TASK. Agreed. Making it a separate config option that depends on DETECT_HUNG_TASK sounds like a reasonable approach. It could help us to identify the root cause, but it also adds a bit of extra overhead. Thanks, Lance > > -- Steve
© 2016 - 2025 Red Hat, Inc.