Inspired by mutex blocker tracking[1], this patch makes a trade-off to
balance the overhead and utility of the hung task detector.
Unlike mutexes, semaphores lack explicit ownership tracking, making it
challenging to identify the root cause of hangs. To address this, we
introduce a last_holder field to the semaphore structure, which is
updated when a task successfully calls down() and cleared during up().
The assumption is that if a task is blocked on a semaphore, the holders
must not have released it. While this does not guarantee that the last
holder is one of the current blockers, it likely provides a practical hint
for diagnosing semaphore-related stalls.
With this change, the hung task detector can now show the blocker task's
info as shown below:
[Thu Mar 20 04:52:21 2025] INFO: task cat:955 blocked for more than 120 seconds.
[Thu Mar 20 04:52:21 2025] Tainted: G E 6.14.0-rc6+ #1
[Thu Mar 20 04:52:21 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Thu Mar 20 04:52:21 2025] task:cat state:D stack:0 pid:955 tgid:955 ppid:917 task_flags:0x400000 flags:0x00000000
[Thu Mar 20 04:52:21 2025] Call Trace:
[Thu Mar 20 04:52:21 2025] <TASK>
[Thu Mar 20 04:52:21 2025] __schedule+0x491/0xbd0
[Thu Mar 20 04:52:21 2025] schedule+0x27/0xf0
[Thu Mar 20 04:52:21 2025] schedule_timeout+0xe3/0xf0
[Thu Mar 20 04:52:21 2025] ? __folio_mod_stat+0x2a/0x80
[Thu Mar 20 04:52:21 2025] ? set_ptes.constprop.0+0x27/0x90
[Thu Mar 20 04:52:21 2025] __down_common+0x155/0x280
[Thu Mar 20 04:52:21 2025] down+0x53/0x70
[Thu Mar 20 04:52:21 2025] read_dummy_semaphore+0x23/0x60
[Thu Mar 20 04:52:21 2025] full_proxy_read+0x5f/0xa0
[Thu Mar 20 04:52:21 2025] vfs_read+0xbc/0x350
[Thu Mar 20 04:52:21 2025] ? __count_memcg_events+0xa5/0x140
[Thu Mar 20 04:52:21 2025] ? count_memcg_events.constprop.0+0x1a/0x30
[Thu Mar 20 04:52:21 2025] ? handle_mm_fault+0x180/0x260
[Thu Mar 20 04:52:21 2025] ksys_read+0x66/0xe0
[Thu Mar 20 04:52:21 2025] do_syscall_64+0x51/0x120
[Thu Mar 20 04:52:21 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Thu Mar 20 04:52:21 2025] RIP: 0033:0x7ff96d4ab46e
[Thu Mar 20 04:52:21 2025] RSP: 002b:00007ffe2f47f3a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Thu Mar 20 04:52:21 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007ff96d4ab46e
[Thu Mar 20 04:52:21 2025] RDX: 0000000000020000 RSI: 00007ff96d39f000 RDI: 0000000000000003
[Thu Mar 20 04:52:21 2025] RBP: 00007ff96d39f000 R08: 00007ff96d39e010 R09: 0000000000000000
[Thu Mar 20 04:52:21 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000
[Thu Mar 20 04:52:21 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[Thu Mar 20 04:52:21 2025] </TASK>
[Thu Mar 20 04:52:21 2025] INFO: task cat:955 blocked on a semaphore likely last held by task cat:909
[Thu Mar 20 04:52:21 2025] task:cat state:S stack:0 pid:909 tgid:909 ppid:771 task_flags:0x400000 flags:0x00000000
[Thu Mar 20 04:52:21 2025] Call Trace:
[Thu Mar 20 04:52:21 2025] <TASK>
[Thu Mar 20 04:52:21 2025] __schedule+0x491/0xbd0
[Thu Mar 20 04:52:21 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40
[Thu Mar 20 04:52:21 2025] schedule+0x27/0xf0
[Thu Mar 20 04:52:21 2025] schedule_timeout+0x77/0xf0
[Thu Mar 20 04:52:21 2025] ? __pfx_process_timeout+0x10/0x10
[Thu Mar 20 04:52:21 2025] msleep_interruptible+0x49/0x60
[Thu Mar 20 04:52:21 2025] read_dummy_semaphore+0x2d/0x60
[Thu Mar 20 04:52:21 2025] full_proxy_read+0x5f/0xa0
[Thu Mar 20 04:52:21 2025] vfs_read+0xbc/0x350
[Thu Mar 20 04:52:21 2025] ? __count_memcg_events+0xa5/0x140
[Thu Mar 20 04:52:21 2025] ? count_memcg_events.constprop.0+0x1a/0x30
[Thu Mar 20 04:52:21 2025] ? handle_mm_fault+0x180/0x260
[Thu Mar 20 04:52:21 2025] ksys_read+0x66/0xe0
[Thu Mar 20 04:52:21 2025] do_syscall_64+0x51/0x120
[Thu Mar 20 04:52:21 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Thu Mar 20 04:52:21 2025] RIP: 0033:0x7fe6bf7a046e
[Thu Mar 20 04:52:21 2025] RSP: 002b:00007ffd6e1a4028 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Thu Mar 20 04:52:21 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fe6bf7a046e
[Thu Mar 20 04:52:21 2025] RDX: 0000000000020000 RSI: 00007fe6bf694000 RDI: 0000000000000003
[Thu Mar 20 04:52:21 2025] RBP: 00007fe6bf694000 R08: 00007fe6bf693010 R09: 0000000000000000
[Thu Mar 20 04:52:21 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000
[Thu Mar 20 04:52:21 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
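For reference, the stack traces above come from a dummy debugfs reader
(read_dummy_semaphore in the traces). A minimal sketch of such a test
module follows; the module and file names are assumptions for
illustration, and the sketch is not part of this patch. The first
reader takes the semaphore and sleeps interruptibly (state S), so any
later reader blocks in down() (state D) until the detector fires:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/semaphore.h>
#include <linux/delay.h>

/* Binary semaphore shared by all readers of the test file. */
static DEFINE_SEMAPHORE(dummy_sem, 1);
static struct dentry *dummy_dir;

static ssize_t read_dummy_semaphore(struct file *file, char __user *buf,
				    size_t count, loff_t *ppos)
{
	/*
	 * The first reader holds the semaphore across a long sleep;
	 * every later reader blocks here in down(), which is what the
	 * hung task detector reports above.
	 */
	down(&dummy_sem);
	msleep_interruptible(300 * 1000); /* 300s > 120s default timeout */
	up(&dummy_sem);

	return 0;
}

static const struct file_operations dummy_sem_fops = {
	.owner = THIS_MODULE,
	.read = read_dummy_semaphore,
};

static int __init dummy_hang_init(void)
{
	dummy_dir = debugfs_create_dir("dummy_hang", NULL);
	debugfs_create_file("semaphore", 0400, dummy_dir, NULL,
			    &dummy_sem_fops);
	return 0;
}

static void __exit dummy_hang_exit(void)
{
	debugfs_remove_recursive(dummy_dir);
}

module_init(dummy_hang_init);
module_exit(dummy_hang_exit);
MODULE_LICENSE("GPL");

Reading the file from two shells (e.g. with cat) and waiting for
hung_task_timeout_secs (120s by default) reproduces the report, given
CONFIG_DETECT_HUNG_TASK_BLOCKER is enabled.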
[1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Mingzhe Yang <mingzhe.yang@ly.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
include/linux/semaphore.h | 15 ++++++++++-
kernel/hung_task.c | 52 ++++++++++++++++++++++++++++++--------
kernel/locking/semaphore.c | 52 +++++++++++++++++++++++++++++++++-----
3 files changed, 101 insertions(+), 18 deletions(-)
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index 04655faadc2d..89706157e622 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -16,13 +16,25 @@ struct semaphore {
raw_spinlock_t lock;
unsigned int count;
struct list_head wait_list;
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ unsigned long last_holder;
+#endif
};
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#define __LAST_HOLDER_SEMAPHORE_INITIALIZER \
+ , .last_holder = 0UL
+#else
+#define __LAST_HOLDER_SEMAPHORE_INITIALIZER
+#endif
+
#define __SEMAPHORE_INITIALIZER(name, n) \
{ \
.lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \
.count = n, \
- .wait_list = LIST_HEAD_INIT((name).wait_list), \
+ .wait_list = LIST_HEAD_INIT((name).wait_list) \
+ __LAST_HOLDER_SEMAPHORE_INITIALIZER \
}
/*
@@ -47,5 +59,6 @@ extern int __must_check down_killable(struct semaphore *sem);
extern int __must_check down_trylock(struct semaphore *sem);
extern int __must_check down_timeout(struct semaphore *sem, long jiffies);
extern void up(struct semaphore *sem);
+extern unsigned long sem_last_holder(struct semaphore *sem);
#endif /* __LINUX_SEMAPHORE_H */
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 79558d76ef06..d2432df2b905 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -99,32 +99,62 @@ static struct notifier_block panic_block = {
static void debug_show_blocker(struct task_struct *task)
{
struct task_struct *g, *t;
- unsigned long owner, blocker;
+ unsigned long owner, blocker, blocker_type;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
blocker = READ_ONCE(task->blocker);
- if (!blocker ||
- hung_task_get_blocker_type(blocker) != BLOCKER_TYPE_MUTEX)
+ if (!blocker)
return;
- owner = mutex_get_owner(
- (struct mutex *)hung_task_blocker_to_lock(blocker));
+ blocker_type = hung_task_get_blocker_type(blocker);
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ owner = mutex_get_owner(
+ (struct mutex *)hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_SEM:
+ owner = sem_last_holder(
+ (struct semaphore *)hung_task_blocker_to_lock(blocker));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
if (unlikely(!owner)) {
- pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
- task->comm, task->pid);
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
+ task->comm, task->pid);
+ break;
+ }
return;
}
/* Ensure the owner information is correct. */
for_each_process_thread(g, t) {
- if ((unsigned long)t == owner) {
+ if ((unsigned long)t != owner)
+ continue;
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
- task->comm, task->pid, t->comm, t->pid);
- sched_show_task(t);
- return;
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
}
+ sched_show_task(t);
+ return;
}
}
#else
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 34bfae72f295..3d06d4adc05b 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -33,12 +33,14 @@
#include <linux/spinlock.h>
#include <linux/ftrace.h>
#include <trace/events/lock.h>
+#include <linux/hung_task.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
+static inline void __sem_acquire(struct semaphore *sem);
/**
* down - acquire the semaphore
@@ -58,7 +60,7 @@ void __sched down(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -82,7 +84,7 @@ int __sched down_interruptible(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -109,7 +111,7 @@ int __sched down_killable(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -139,7 +141,7 @@ int __sched down_trylock(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
- sem->count = count;
+ __sem_acquire(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
@@ -164,7 +166,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -185,6 +187,12 @@ void __sched up(struct semaphore *sem)
unsigned long flags;
raw_spin_lock_irqsave(&sem->lock, flags);
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ if (READ_ONCE(sem->last_holder) == (unsigned long)current)
+ WRITE_ONCE(sem->last_holder, 0UL);
+#endif
+
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
@@ -224,8 +232,12 @@ static inline int __sched ___down_common(struct semaphore *sem, long state,
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
- if (waiter.up)
+ if (waiter.up) {
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ WRITE_ONCE(sem->last_holder, (unsigned long)current);
+#endif
return 0;
+ }
}
timed_out:
@@ -242,10 +254,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
{
int ret;
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
+#endif
+
trace_contention_begin(sem, 0);
ret = ___down_common(sem, state, timeout);
trace_contention_end(sem, ret);
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ hung_task_clear_blocker();
+#endif
+
return ret;
}
@@ -277,3 +297,23 @@ static noinline void __sched __up(struct semaphore *sem)
waiter->up = true;
wake_up_process(waiter->task);
}
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ WRITE_ONCE(sem->last_holder, (unsigned long)current);
+#endif
+}
--
2.45.2
On Thu, 20 Mar 2025 14:49:22 +0800 Lance Yang <ioworker0@gmail.com> wrote:

> Inspired by mutex blocker tracking[1], this patch makes a trade-off to
> balance the overhead and utility of the hung task detector.
>
> Unlike mutexes, semaphores lack explicit ownership tracking, making it
> challenging to identify the root cause of hangs. To address this, we
> introduce a last_holder field to the semaphore structure, which is
> updated when a task successfully calls down() and cleared during up().
>
> The assumption is that if a task is blocked on a semaphore, the holders
> must not have released it. While this does not guarantee that the last
> holder is one of the current blockers, it likely provides a practical hint
> for diagnosing semaphore-related stalls.
>
> With this change, the hung task detector can now show the blocker task's
> info as shown below:

+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER

It looks pretty simple to reduce the amount of ifdeffery which this
patch adds.
On Thu, 20 Mar 2025 14:49:22 +0800 Lance Yang <ioworker0@gmail.com> wrote:

> --- a/kernel/locking/semaphore.c
> +++ b/kernel/locking/semaphore.c
> @@ -33,12 +33,14 @@
>  #include <linux/spinlock.h>
>  #include <linux/ftrace.h>
>  #include <trace/events/lock.h>
> +#include <linux/hung_task.h>
>
>  static noinline void __down(struct semaphore *sem);
>  static noinline int __down_interruptible(struct semaphore *sem);
>  static noinline int __down_killable(struct semaphore *sem);
>  static noinline int __down_timeout(struct semaphore *sem, long timeout);
>  static noinline void __up(struct semaphore *sem);
> +static inline void __sem_acquire(struct semaphore *sem);

It feels Just Weird to forward declare a static inline. Is there a
special reason for doing this?
Hi Andrew,
Thanks a lot for taking the time to review!
On Tue, Apr 8, 2025 at 4:08 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 20 Mar 2025 14:49:22 +0800 Lance Yang <ioworker0@gmail.com> wrote:
>
> > Inspired by mutex blocker tracking[1], this patch makes a trade-off to
> > balance the overhead and utility of the hung task detector.
> >
> > Unlike mutexes, semaphores lack explicit ownership tracking, making it
> > challenging to identify the root cause of hangs. To address this, we
> > introduce a last_holder field to the semaphore structure, which is
> > updated when a task successfully calls down() and cleared during up().
> >
> > The assumption is that if a task is blocked on a semaphore, the holders
> > must not have released it. While this does not guarantee that the last
> > holder is one of the current blockers, it likely provides a practical hint
> > for diagnosing semaphore-related stalls.
> >
> > With this change, the hung task detector can now show the blocker task's
> > info as shown below:
>
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
> +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
>
> It looks pretty simple to reduce the amount of ifdeffery which this
> patch adds.
Good catch! We can get rid of five of the #ifdef blocks with the following change ;)
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 3d06d4adc05b..db8a8f696f50 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -40,7 +40,41 @@ static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
-static inline void __sem_acquire(struct semaphore *sem);
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+ WRITE_ONCE((sem)->last_holder, (unsigned long)current);
+}
+
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+ if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
+ WRITE_ONCE((sem)->last_holder, 0UL);
+}
+
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+}
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+}
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+ hung_task_sem_set_holder(sem);
+}
/**
* down - acquire the semaphore
@@ -188,10 +222,7 @@ void __sched up(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- if (READ_ONCE(sem->last_holder) == (unsigned long)current)
- WRITE_ONCE(sem->last_holder, 0UL);
-#endif
+ hung_task_sem_clear_if_holder(sem);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
@@ -233,9 +264,7 @@ static inline int __sched ___down_common(struct semaphore *sem, long state,
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
if (waiter.up) {
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- WRITE_ONCE(sem->last_holder, (unsigned long)current);
-#endif
+ hung_task_sem_set_holder(sem);
return 0;
}
}
@@ -254,17 +283,13 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
{
int ret;
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
-#endif
trace_contention_begin(sem, 0);
ret = ___down_common(sem, state, timeout);
trace_contention_end(sem, ret);
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
hung_task_clear_blocker();
-#endif
return ret;
}
@@ -297,23 +322,3 @@ static noinline void __sched __up(struct semaphore *sem)
waiter->up = true;
wake_up_process(waiter->task);
}
-
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
-unsigned long sem_last_holder(struct semaphore *sem)
-{
- return READ_ONCE(sem->last_holder);
-}
-#else
-unsigned long sem_last_holder(struct semaphore *sem)
-{
- return 0UL;
-}
-#endif
-
-static inline void __sem_acquire(struct semaphore *sem)
-{
- sem->count--;
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- WRITE_ONCE(sem->last_holder, (unsigned long)current);
-#endif
-}
---
[...]
> > static noinline void __down(struct semaphore *sem);
> > static noinline int __down_interruptible(struct semaphore *sem);
> > static noinline int __down_killable(struct semaphore *sem);
> > static noinline int __down_timeout(struct semaphore *sem, long timeout);
> > static noinline void __up(struct semaphore *sem);
> > +static inline void __sem_acquire(struct semaphore *sem);
>
> It feels Just Weird to forward declare a static inline. Is there a
> special reason for doing this?
Thanks for pointing this out.
Indeed, the forward declaration was weird :(
Fixed by removing it as shown in the diff above.
Thanks,
Lance