With PREEMPT_RT enabled, some of the calls to put_task_struct() coming
from rt_mutex_adjust_prio_chain() could happen in preemptible context and
with a mutex enqueued. That could lead to this sequence:
  rt_mutex_adjust_prio_chain()
    put_task_struct()
      __put_task_struct()
        sched_ext_free()
          spin_lock_irqsave()
            rtlock_lock() ---> TRIGGERS
              lockdep_assert(!current->pi_blocked_on);
Fix that by unconditionally resorting to the deferred call to
__put_task_struct() if PREEMPT_RT is enabled.
Suggested-by: Crystal Wood <crwood@redhat.com>
Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
---
Resent as a gentle reminder, because this issue results in scary backtraces
that are not obvious to debug or to trace back to a root cause.
v2: (Rostedt) remove the #ifdef from put_task_struct() and create
tsk_is_pi_blocked_on() in sched.h to make the change cleaner.
v3: (Sebastian, PeterZ) always call the deferred __put_task_struct() on RT.
v4: Fix the implementation of what was requested on v3.
include/linux/sched/task.h | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0f2aeb37bbb04..51678a541477a 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -134,11 +134,8 @@ static inline void put_task_struct(struct task_struct *t)
 	if (!refcount_dec_and_test(&t->usage))
 		return;
 
-	/*
-	 * In !RT, it is always safe to call __put_task_struct().
-	 * Under RT, we can only call it in preemptible context.
-	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+	/* In !RT, it is always safe to call __put_task_struct(). */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
 		static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
 
 		lock_map_acquire_try(&put_task_map);
@@ -148,11 +145,13 @@ static inline void put_task_struct(struct task_struct *t)
 	}
 
 	/*
-	 * under PREEMPT_RT, we can't call put_task_struct
+	 * Under PREEMPT_RT, we can't call __put_task_struct
 	 * in atomic context because it will indirectly
-	 * acquire sleeping locks.
+	 * acquire sleeping locks. The same is true if the
+	 * current process has a mutex enqueued (blocked on
+	 * a PI chain).
 	 *
-	 * call_rcu() will schedule delayed_put_task_struct_rcu()
+	 * call_rcu() will schedule __put_task_struct_rcu_cb()
 	 * to be called in process context.
 	 *
 	 * __put_task_struct() is called when
@@ -165,7 +164,7 @@ static inline void put_task_struct(struct task_struct *t)
 	 *
 	 * delayed_free_task() also uses ->rcu, but it is only called
 	 * when it fails to fork a process. Therefore, there is no
-	 * way it can conflict with put_task_struct().
+	 * way it can conflict with __put_task_struct().
 	 */
 	call_rcu(&t->rcu, __put_task_struct_rcu_cb);
 }
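
Pieced together from the hunks above, put_task_struct() after this change
reads roughly as follows. The lines between lock_map_acquire_try() and the
closing brace of the !RT branch are outside the diff context, so they are an
assumption based on the surrounding code and should be checked against the
tree:

	static inline void put_task_struct(struct task_struct *t)
	{
		if (!refcount_dec_and_test(&t->usage))
			return;

		/* In !RT, it is always safe to call __put_task_struct(). */
		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
			static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);

			lock_map_acquire_try(&put_task_map);
			__put_task_struct(t);            /* assumed: elided by the hunks */
			lock_map_release(&put_task_map); /* assumed: elided by the hunks */
			return;                          /* assumed: elided by the hunks */
		}

		/*
		 * Under PREEMPT_RT, always defer: __put_task_struct() may
		 * indirectly acquire sleeping locks, which is unsafe in atomic
		 * context and also when current is blocked on a PI chain.
		 */
		call_rcu(&t->rcu, __put_task_struct_rcu_cb);
	}

The deferred path hands the final free to the RCU callback, which in
kernel/fork.c is essentially the following (verify against your tree):

	void __put_task_struct_rcu_cb(struct rcu_head *rhp)
	{
		struct task_struct *task = container_of(rhp, struct task_struct, rcu);

		__put_task_struct(task);
	}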
----- End forwarded message -----
On Fri, Jun 13, 2025 at 12:05:14PM -0300, Luis Claudio R. Goncalves wrote:
> With PREEMPT_RT enabled, some of the calls to put_task_struct() coming
> from rt_mutex_adjust_prio_chain() could happen in preemptible context and
> with a mutex enqueued. That could lead to this sequence:
>
>   rt_mutex_adjust_prio_chain()
>     put_task_struct()
>       __put_task_struct()
>         sched_ext_free()
>           spin_lock_irqsave()
>             rtlock_lock() ---> TRIGGERS
>               lockdep_assert(!current->pi_blocked_on);
>
> Fix that by unconditionally resorting to the deferred call to
> __put_task_struct() if PREEMPT_RT is enabled.

Should this have a Fixes: tag and go into /urgent?
On Tue, Jun 17, 2025 at 11:26:09AM +0200, Peter Zijlstra wrote:
> On Fri, Jun 13, 2025 at 12:05:14PM -0300, Luis Claudio R. Goncalves wrote:
> > Fix that by unconditionally resorting to the deferred call to
> > __put_task_struct() if PREEMPT_RT is enabled.
>
> Should this have a Fixes: tag and go into /urgent?

Makes sense! I will add the tag:

Fixes: 893cdaaa3977b ("sched: avoid false lockdep splat in put_task_struct()")

and resend. Thank you!
On 2025-06-17 11:26:09 [+0200], Peter Zijlstra wrote:
> On Fri, Jun 13, 2025 at 12:05:14PM -0300, Luis Claudio R. Goncalves wrote:
> > Fix that by unconditionally resorting to the deferred call to
> > __put_task_struct() if PREEMPT_RT is enabled.
>
> Should this have a Fixes: tag and go into /urgent?

I would say so. I'm not sure what caused it. I think Luis said at some
point that it is caused by a sched_ext case or I mixed it up with
something. Luis?

The other question I have, do we need to distinguish between PREEMPT_RT
and not or can we do this unconditionally?

Sebastian
On Tue, Jun 17, 2025 at 11:36:27AM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-06-17 11:26:09 [+0200], Peter Zijlstra wrote:
> > Should this have a Fixes: tag and go into /urgent?
>
> I would say so. I'm not sure what caused it. I think Luis said at some
> point that it is caused by a sched_ext case or I mixed it up with
> something. Luis?

You are correct, all the initial cases we observed were triggered at
sched_ext_free(). Later, Crystal Wood was able to pinpoint the real
problem: __put_task_struct() being called by an RT task with a mutex
enqueued. With that in mind we were able to identify other cases with a
similar cause.

> The other question I have, do we need to distinguish between PREEMPT_RT
> and not or can we do this unconditionally?

After you mentioned that idea in the v2 thread, I ran stress tests (LTP,
stress-ng, perf bench all in a tight loop, ...) and a few benchmarks, on
kernels with and without PREEMPT_RT enabled, with and without lockdep.
Everything worked fine, but due to the lack of a specific benchmark to
run to ensure no penalty was added by the patch, I was not confident
enough to suggest the change.

Luis
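
To make the failure mode concrete: the point of Crystal's observation is
that preemptible() can return true while current->pi_blocked_on is set.
A minimal hypothetical sketch (the function name and structure here are
illustrative only, not from the patch or the kernel source):

	/* Hypothetical illustration, not kernel code. */
	static void example_pi_chain_step(struct task_struct *owner)
	{
		/*
		 * At this point current is enqueued on a PI chain, so
		 * current->pi_blocked_on != NULL, yet preemption and
		 * interrupts are enabled: preemptible() returns true.
		 */
		put_task_struct(owner);
		/*
		 * Before the fix, the "|| preemptible()" test let this call
		 * run __put_task_struct() inline, which can reach
		 * spin_lock_irqsave(). On PREEMPT_RT that is an rtlock, whose
		 * slowpath fires lockdep_assert(!current->pi_blocked_on), the
		 * exact splat from the changelog. Always deferring via
		 * call_rcu() on RT avoids taking sleeping locks here.
		 */
	}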
On Tue, Jun 17, 2025 at 11:36:27AM +0200, Sebastian Andrzej Siewior wrote:
> The other question I have, do we need to distinguish between PREEMPT_RT
> and not or can we do this unconditionally?

That's something I had been wondering myself. However, since this code
runs in multiple places, I was concerned it might trigger some obscure
corner-case issue.

In any case, if we decide to remove the PREEMPT_RT conditional, I'd
prefer to handle that in a follow-up patch.
On Fri, Jun 13, 2025 at 12:05:14PM -0300, Luis Claudio R. Goncalves wrote:
> With PREEMPT_RT enabled, some of the calls to put_task_struct() coming
> from rt_mutex_adjust_prio_chain() could happen in preemptible context and
> with a mutex enqueued. That could lead to this sequence:
>
>   rt_mutex_adjust_prio_chain()
>     put_task_struct()
>       __put_task_struct()
>         sched_ext_free()
>           spin_lock_irqsave()
>             rtlock_lock() ---> TRIGGERS
>               lockdep_assert(!current->pi_blocked_on);
>
> Fix that by unconditionally resorting to the deferred call to
> __put_task_struct() if PREEMPT_RT is enabled.

Reviewed-by: Wander Lairson Costa <wander@redhat.com>