Bert reported seeing occasional boot hangs when running with
PREEMPT_RT and bisected it down to commit 894d1b3db41c
("locking/mutex: Remove wakeups from under mutex::wait_lock").
It looks like I missed a few spots where we drop the wait_lock and
potentially call into schedule without waking up the tasks on the
wake_q structure. Since the tasks being woken are ww_mutex tasks
they need to be able to run to release the mutex and unblock the
task that currently is planning to wake them. Thus we can deadlock.
So make sure we wake the wake_q tasks when we unlock the wait_lock.
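Roughly, as a simplified sketch (not the literal code), the problematic
ordering is:

    raw_spin_unlock_irq(&lock->wait_lock);
    rt_mutex_schedule();    /* sleep; the ww_mutex waiter sitting on the
                             * wake_q was never woken, so nothing can
                             * release the mutex we block on -> deadlock */

and the fix (as in the diff below) is to flush the wake_q whenever we
drop the wait_lock:

    preempt_disable();
    raw_spin_unlock_irq(&lock->wait_lock);
    wake_up_q(wake_q);      /* wake the deferred ww_mutex waiters first */
    wake_q_init(wake_q);    /* re-init so the wake_q can be reused */
    preempt_enable();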
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: kernel-team@android.com
Reported-by: Bert Karwatzki <spasswolf@web.de>
Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de
Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock")
Signed-off-by: John Stultz <jstultz@google.com>
---
kernel/locking/rtmutex.c | 18 ++++++++++++++++--
kernel/locking/rtmutex_api.c | 2 +-
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e858de203eb6f..697a56d3d949b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1292,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
get_task_struct(owner);
+ preempt_disable();
raw_spin_unlock_irq(&lock->wait_lock);
+ /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ preempt_enable();
+
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1596,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
+ * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock
*
* Must be called with lock->wait_lock held and interrupts disabled
*/
@@ -1603,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
__releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
@@ -1634,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
+ preempt_disable();
raw_spin_unlock_irq(&lock->wait_lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+ preempt_enable();
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
rt_mutex_schedule();
@@ -1708,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
if (likely(!ret))
- ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
if (likely(!ret)) {
/* acquired the lock */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 33ea31d6a7b3b..191e4720e5466 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
set_current_state(TASK_INTERRUPTIBLE);
- ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
--
2.47.1.613.gc27f4b7a9f-goog
On Thu, Dec 12, 2024 at 02:21:33PM -0800, John Stultz wrote:
> Bert reported seeing occasional boot hangs when running with
> PREEMPT_RT and bisected it down to commit 894d1b3db41c
> ("locking/mutex: Remove wakeups from under mutex::wait_lock").
>
> It looks like I missed a few spots where we drop the wait_lock and
> potentially call into schedule without waking up the tasks on the
> wake_q structure. Since the tasks being woken are ww_mutex tasks
> they need to be able to run to release the mutex and unblock the
> task that currently is planning to wake them. Thus we can deadlock.
>
> So make sure we wake the wake_q tasks when we unlock the wait_lock.
>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Bert Karwatzki <spasswolf@web.de>
> Cc: kernel-team@android.com
> Reported-by: Bert Karwatzki <spasswolf@web.de>
> Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de
> Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock")
> Signed-off-by: John Stultz <jstultz@google.com>
> ---
I don't suppose this actually makes things much better -- but I had to
try.
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1192,6 +1192,17 @@ try_to_take_rt_mutex(struct rt_mutex_bas
return 1;
}
+#define WRAP_WAKE(_stmt, _q) \
+do { \
+ struct wake_q_head *_Q = (_q); \
+ guard(preempt)(); \
+ _stmt; \
+ if (_Q && !wake_q_empty(_Q)) { \
+ wake_up_q(_Q); \
+ wake_q_init(_Q); \
+ } \
+} while (0)
+
/*
* Task blocks on lock.
*
@@ -1248,10 +1259,7 @@ static int __sched task_blocks_on_rt_mut
/* Check whether the waiter should back out immediately */
rtm = container_of(lock, struct rt_mutex, rtmutex);
- preempt_disable();
- res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
- wake_up_q(wake_q);
- preempt_enable();
+ WRAP_WAKE(res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q), wake_q);
if (res) {
raw_spin_lock(&task->pi_lock);
rt_mutex_dequeue(lock, waiter);
@@ -1295,13 +1303,7 @@ static int __sched task_blocks_on_rt_mut
*/
get_task_struct(owner);
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
-
+ WRAP_WAKE(raw_spin_unlock_irq(&lock->wait_lock), wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1645,13 +1647,8 @@ static int __sched rt_mutex_slowlock_blo
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- if (wake_q) {
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- }
- preempt_enable();
+
+ WRAP_WAKE(raw_spin_unlock_irq(&lock->wait_lock), wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
rt_mutex_schedule();
@@ -1802,10 +1799,7 @@ static int __sched rt_mutex_slowlock(str
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ WRAP_WAKE(raw_spin_unlock_irqrestore(&lock->wait_lock, flags), &wake_q);
rt_mutex_post_schedule();
return ret;
@@ -1863,11 +1857,8 @@ static void __sched rtlock_slowlock_lock
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
+
+ WRAP_WAKE(raw_spin_unlock_irq(&lock->wait_lock), wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
schedule_rtlock();
@@ -1896,10 +1887,8 @@ static __always_inline void __sched rtlo
raw_spin_lock_irqsave(&lock->wait_lock, flags);
rtlock_slowlock_locked(lock, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+
+ WRAP_WAKE(raw_spin_unlock_irqrestore(&lock->wait_lock, flags), &wake_q);
}
#endif /* RT_MUTEX_BUILD_SPINLOCKS */
On Fri, Dec 13, 2024 at 4:46 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 12, 2024 at 02:21:33PM -0800, John Stultz wrote:
> > Bert reported seeing occasional boot hangs when running with
> > PREEMPT_RT and bisected it down to commit 894d1b3db41c
> > ("locking/mutex: Remove wakeups from under mutex::wait_lock").
> >
> > It looks like I missed a few spots where we drop the wait_lock and
> > potentially call into schedule without waking up the tasks on the
> > wake_q structure. Since the tasks being woken are ww_mutex tasks
> > they need to be able to run to release the mutex and unblock the
> > task that currently is planning to wake them. Thus we can deadlock.
> >
> > So make sure we wake the wake_q tasks when we unlock the wait_lock.
> >
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Waiman Long <longman@redhat.com>
> > Cc: Boqun Feng <boqun.feng@gmail.com>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Bert Karwatzki <spasswolf@web.de>
> > Cc: kernel-team@android.com
> > Reported-by: Bert Karwatzki <spasswolf@web.de>
> > Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de
> > Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock")
> > Signed-off-by: John Stultz <jstultz@google.com>
> > ---
>
> I don't suppose this actually makes things much better -- but I had to
> try.
>
>
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -1192,6 +1192,17 @@ try_to_take_rt_mutex(struct rt_mutex_bas
> return 1;
> }
>
> +#define WRAP_WAKE(_stmt, _q) \
> +do { \
> + struct wake_q_head *_Q = (_q); \
> + guard(preempt)(); \
> + _stmt; \
> + if (_Q && !wake_q_empty(_Q)) { \
> + wake_up_q(_Q); \
> + wake_q_init(_Q); \
> + } \
> +} while (0)
> +
> /*
> * Task blocks on lock.
> *
> @@ -1295,13 +1303,7 @@ static int __sched task_blocks_on_rt_mut
> */
> get_task_struct(owner);
>
> - preempt_disable();
> - raw_spin_unlock_irq(&lock->wait_lock);
> - /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
> - wake_up_q(wake_q);
> - wake_q_init(wake_q);
> - preempt_enable();
> -
> + WRAP_WAKE(raw_spin_unlock_irq(&lock->wait_lock), wake_q);
I worry this obscures the _stmt action and makes it a bit harder to read.
I think all of the calls are tied to the unlock (the one you quoted
earlier was removed with 82f9cc094975240), so would something like a
special unlock be reasonable:
raw_spin_unlock_irq_and_wake(&lock->wait_lock, wake_q)
?
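Roughly something like this, just sketching the idea (the name is only
my suggestion above, not an existing helper):

    static inline void
    raw_spin_unlock_irq_and_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
    {
            /* keep preemption off so woken tasks can't preempt us mid-cleanup */
            preempt_disable();
            raw_spin_unlock_irq(lock);
            wake_up_q(wake_q);
            wake_q_init(wake_q);
            preempt_enable();
    }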
thanks
-john
On Fri, Dec 13, 2024 at 06:39:45PM -0800, John Stultz wrote:
> On Fri, Dec 13, 2024 at 4:46 AM Peter Zijlstra <peterz@infradead.org> wrote:
> I think all of the calls are tied to the unlock (the one you quoted
> earlier was removed with 82f9cc094975240), so would something like a
> special unlock be reasonable:
> raw_spin_unlock_irq_and_wake(&lock->wait_lock, wake_q)
> ?

Ha, that's what I started with :-) Then found there's two irq_restore()
variants for raisins and it all turned to shit.
A common pattern seen when wake_qs are used to defer a wakeup
until after a lock is released is something like:
preempt_disable();
raw_spin_unlock(lock);
wake_up_q(wake_q);
preempt_enable();
So create some raw_spin_unlock*_wake() helper functions to clean
this up.
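With those, call sites like the one above collapse into a single call,
e.g. (matching the rtmutex hunks below):

    raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);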
Applies on top of the fix I submitted here:
https://lore.kernel.org/lkml/20241212222138.2400498-1-jstultz@google.com/
NOTE: I recognise the unlock()/unlock_irq()/unlock_irqrestore()
variants create some duplication of their own, which we could
avoid by using a macro to generate the similar functions, but I
often dislike how those generation macros make finding the actual
implementation harder, so I left the three functions as is. If
folks would prefer otherwise, let me know and I'll switch it.
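For reference, the generator approach I'm deciding against would look
roughly like this (illustrative sketch only, not part of the patch):

    #define DEFINE_RAW_SPIN_UNLOCK_WAKE(name, proto, unlock_stmt)     \
    static inline void raw_spin_unlock##name##_wake proto             \
    {                                                                  \
            guard(preempt)();                                          \
            unlock_stmt;                                               \
            if (wake_q) {                                              \
                    wake_up_q(wake_q);                                 \
                    wake_q_init(wake_q);                               \
            }                                                          \
    }

    /* e.g. generating the _irqrestore variant: */
    DEFINE_RAW_SPIN_UNLOCK_WAKE(_irqrestore,
            (raw_spinlock_t *lock, unsigned long flags, struct wake_q_head *wake_q),
            raw_spin_unlock_irqrestore(lock, flags))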
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "André Almeida" <andrealmeid@igalia.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: kernel-team@android.com
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
---
include/linux/sched/wake_q.h | 34 ++++++++++++++++++++++++++++++++++
kernel/futex/pi.c | 5 +----
kernel/locking/mutex.c | 16 ++++------------
kernel/locking/rtmutex.c | 32 +++++---------------------------
4 files changed, 44 insertions(+), 43 deletions(-)
diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 06cd8fb2f4098..0f28b4623ad45 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -63,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
extern void wake_up_q(struct wake_q_head *head);
+/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */
+static inline
+void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock(lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
+
+static inline
+void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock_irq(lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
+
+static inline
+void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags,
+ struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock_irqrestore(lock, flags);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
#endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index d62cca5ed8f4c..daea650b16f51 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1020,10 +1020,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
* it sees the futex_q::pi_state.
*/
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
- preempt_disable();
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
if (ret) {
if (ret == 1)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 3302e52f0c967..b36f23de48f1b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -657,10 +657,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err;
}
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* Make sure we do wakeups before calling schedule */
- wake_up_q(&wake_q);
- wake_q_init(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
schedule_preempt_disabled();
@@ -710,8 +707,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
preempt_enable();
return 0;
@@ -720,10 +716,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
trace_contention_end(lock, ret);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
- wake_up_q(&wake_q);
preempt_enable();
return ret;
}
@@ -935,10 +930,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 697a56d3d949b..4a8df1800cbbd 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1292,13 +1292,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
get_task_struct(owner);
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
-
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1642,13 +1636,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- if (wake_q) {
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- }
- preempt_enable();
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
rt_mutex_schedule();
@@ -1799,10 +1787,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
rt_mutex_post_schedule();
return ret;
@@ -1860,11 +1845,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
schedule_rtlock();
@@ -1893,10 +1874,7 @@ static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
raw_spin_lock_irqsave(&lock->wait_lock, flags);
rtlock_slowlock_locked(lock, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#endif /* RT_MUTEX_BUILD_SPINLOCKS */
--
2.47.1.613.gc27f4b7a9f-goog
The following commit has been merged into the locking/core branch of tip:
Commit-ID: abfdccd6af2b071951633e57d6322c46a1ea791f
Gitweb: https://git.kernel.org/tip/abfdccd6af2b071951633e57d6322c46a1ea791f
Author: John Stultz <jstultz@google.com>
AuthorDate: Mon, 16 Dec 2024 20:07:35 -08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 20 Dec 2024 15:31:21 +01:00
sched/wake_q: Add helper to call wake_up_q after unlock with preemption disabled
A common pattern seen when wake_qs are used to defer a wakeup
until after a lock is released is something like:
preempt_disable();
raw_spin_unlock(lock);
wake_up_q(wake_q);
preempt_enable();
So create some raw_spin_unlock*_wake() helper functions to clean
this up.
Applies on top of the fix I submitted here:
https://lore.kernel.org/lkml/20241212222138.2400498-1-jstultz@google.com/
NOTE: I recognise the unlock()/unlock_irq()/unlock_irqrestore()
variants create some duplication of their own, which we could
avoid by using a macro to generate the similar functions, but I
often dislike how those generation macros make finding the actual
implementation harder, so I left the three functions as is. If
folks would prefer otherwise, let me know and I'll switch it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241217040803.243420-1-jstultz@google.com
---
include/linux/sched/wake_q.h | 34 ++++++++++++++++++++++++++++++++++
kernel/futex/pi.c | 5 +----
kernel/locking/mutex.c | 16 ++++------------
kernel/locking/rtmutex.c | 32 +++++---------------------------
4 files changed, 44 insertions(+), 43 deletions(-)
diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 06cd8fb..0f28b46 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -63,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
extern void wake_up_q(struct wake_q_head *head);
+/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */
+static inline
+void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock(lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
+
+static inline
+void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock_irq(lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
+
+static inline
+void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags,
+ struct wake_q_head *wake_q)
+{
+ guard(preempt)();
+ raw_spin_unlock_irqrestore(lock, flags);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+}
#endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index d62cca5..daea650 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1020,10 +1020,7 @@ retry_private:
* it sees the futex_q::pi_state.
*/
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
- preempt_disable();
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
if (ret) {
if (ret == 1)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 3302e52..b36f23d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -657,10 +657,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err;
}
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* Make sure we do wakeups before calling schedule */
- wake_up_q(&wake_q);
- wake_q_init(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
schedule_preempt_disabled();
@@ -710,8 +707,7 @@ skip_wait:
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
preempt_enable();
return 0;
@@ -720,10 +716,9 @@ err:
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
trace_contention_end(lock, ret);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
- wake_up_q(&wake_q);
preempt_enable();
return ret;
}
@@ -935,10 +930,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 697a56d..4a8df18 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1292,13 +1292,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
get_task_struct(owner);
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
-
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1642,13 +1636,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- if (wake_q) {
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- }
- preempt_enable();
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
rt_mutex_schedule();
@@ -1799,10 +1787,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
rt_mutex_post_schedule();
return ret;
@@ -1860,11 +1845,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
schedule_rtlock();
@@ -1893,10 +1874,7 @@ static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
raw_spin_lock_irqsave(&lock->wait_lock, flags);
rtlock_slowlock_locked(lock, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#endif /* RT_MUTEX_BUILD_SPINLOCKS */
On Fri, Dec 13, 2024 at 01:46:15PM +0100, Peter Zijlstra wrote:
> +#define WRAP_WAKE(_stmt, _q) \

Realized that those are the wrong way around, this needs to be:

  #define WRAP_WAKE(_q, _stmt...) \

to deal with all the ',' in things like:

> + WRAP_WAKE(res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q), wake_q);

Anyway, still not sure it's worth the trouble.
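I.e., roughly the earlier sketch with the argument order swapped (same
body otherwise):

    #define WRAP_WAKE(_q, _stmt...)                     \
    do {                                                \
            struct wake_q_head *_Q = (_q);              \
            guard(preempt)();                           \
            _stmt;                                      \
            if (_Q && !wake_q_empty(_Q)) {              \
                    wake_up_q(_Q);                      \
                    wake_q_init(_Q);                    \
            }                                           \
    } while (0)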
The following commit has been merged into the locking/urgent branch of tip:
Commit-ID: 4a077914578183ec397ad09f7156a357e00e5d72
Gitweb: https://git.kernel.org/tip/4a077914578183ec397ad09f7156a357e00e5d72
Author: John Stultz <jstultz@google.com>
AuthorDate: Thu, 12 Dec 2024 14:21:33 -08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 17 Dec 2024 17:47:24 +01:00
locking/rtmutex: Make sure we wake anything on the wake_q when we release the lock->wait_lock
Bert reported seeing occasional boot hangs when running with
PREEMPT_RT and bisected it down to commit 894d1b3db41c
("locking/mutex: Remove wakeups from under mutex::wait_lock").
It looks like I missed a few spots where we drop the wait_lock and
potentially call into schedule without waking up the tasks on the
wake_q structure. Since the tasks being woken are ww_mutex tasks
they need to be able to run to release the mutex and unblock the
task that currently is planning to wake them. Thus we can deadlock.
So make sure we wake the wake_q tasks when we unlock the wait_lock.
Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de
Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock")
Reported-by: Bert Karwatzki <spasswolf@web.de>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241212222138.2400498-1-jstultz@google.com
---
kernel/locking/rtmutex.c | 18 ++++++++++++++++--
kernel/locking/rtmutex_api.c | 2 +-
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e858de2..697a56d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1292,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
get_task_struct(owner);
+ preempt_disable();
raw_spin_unlock_irq(&lock->wait_lock);
+ /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ preempt_enable();
+
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1596,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
+ * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock
*
* Must be called with lock->wait_lock held and interrupts disabled
*/
@@ -1603,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
__releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
@@ -1634,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
+ preempt_disable();
raw_spin_unlock_irq(&lock->wait_lock);
+ if (wake_q) {
+ wake_up_q(wake_q);
+ wake_q_init(wake_q);
+ }
+ preempt_enable();
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
rt_mutex_schedule();
@@ -1708,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
if (likely(!ret))
- ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
if (likely(!ret)) {
/* acquired the lock */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 33ea31d..191e472 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
set_current_state(TASK_INTERRUPTIBLE);
- ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.