One reason to use a trylock is to avoid an ABBA deadlock when a locking
sequence has to be taken outside the expected locking order. That only
works if trylocks are used all the way through the abnormal sequence.
Unfortunately, rt_mutex_trylock() breaks this assumption: it uses
raw_spin_lock_irqsave() to acquire lock->wait_lock, so the trylock can
still block on an internal lock. On a PREEMPT_RT kernel this produces a
lockdep splat like the following:
[ 63.695668] -> #0 (&lock->wait_lock){-...}-{2:2}:
[ 63.695674] check_prev_add+0x1bd/0x1310
[ 63.695678] validate_chain+0x6cf/0x7c0
[ 63.695682] __lock_acquire+0x879/0xf40
[ 63.695686] lock_acquire.part.0+0xfa/0x2d0
[ 63.695690] _raw_spin_lock_irqsave+0x46/0x90
[ 63.695695] rt_mutex_slowtrylock+0x3f/0xb0
[ 63.695699] rt_spin_trylock+0x13/0xc0
[ 63.695702] rmqueue_pcplist+0x5b/0x180
[ 63.695705] rmqueue+0xb01/0x1040
[ 63.695708] get_page_from_freelist+0x1d0/0xa00
[ 63.695712] __alloc_pages_noprof+0x19a/0x450
[ 63.695716] alloc_pages_mpol_noprof+0xaf/0x1e0
[ 63.695721] stack_depot_save_flags+0x4db/0x520
[ 63.695727] kasan_save_stack+0x3f/0x50
[ 63.695731] __kasan_record_aux_stack+0x8e/0xa0
[ 63.695736] task_work_add+0x1ad/0x240
[ 63.695741] sched_tick+0x1c7/0x500
[ 63.695744] update_process_times+0xf1/0x130
[ 63.695750] tick_nohz_handler+0xf7/0x230
[ 63.695754] __hrtimer_run_queues+0x13b/0x7b0
[ 63.695758] hrtimer_interrupt+0x1c2/0x350
[ 63.695763] __sysvec_apic_timer_interrupt+0xdb/0x340
[ 63.695770] sysvec_apic_timer_interrupt+0x9c/0xd0
[ 63.695774] asm_sysvec_apic_timer_interrupt+0x1a/0x20
[ 63.695780] __asan_load8+0x8/0xa0
[ 63.695784] mas_wr_end_piv+0x28/0x2c0
[ 63.695789] mas_preallocate+0x3a8/0x680
[ 63.695793] vma_shrink+0x180/0x3f0
[ 63.695799] shift_arg_pages+0x219/0x2c0
[ 63.695804] setup_arg_pages+0x343/0x5e0
[ 63.695807] load_elf_binary+0x5ac/0x15d0
[ 63.695813] search_binary_handler+0x125/0x370
[ 63.695818] exec_binprm+0xc9/0x3d0
[ 63.695821] bprm_execve+0x103/0x230
[ 63.695824] kernel_execve+0x187/0x1c0
[ 63.695828] call_usermodehelper_exec_async+0x145/0x1e0
[ 63.695832] ret_from_fork+0x31/0x60
[ 63.695836] ret_from_fork_asm+0x1a/0x30
[ 63.695840]
[ 63.695840] other info that might help us debug this:
[ 63.695840]
[ 63.695842] Chain exists of:
[ 63.695842] &lock->wait_lock --> &p->pi_lock --> &rq->__lock
[ 63.695842]
[ 63.695850] Possible unsafe locking scenario:
[ 63.695850]
[ 63.695851] CPU0 CPU1
[ 63.695852] ---- ----
[ 63.695854] lock(&rq->__lock);
[ 63.695857] lock(&p->pi_lock);
[ 63.695861] lock(&rq->__lock);
[ 63.695864] lock(&lock->wait_lock);
[ 63.695868]
[ 63.695868] *** DEADLOCK ***
Fix it by using raw_spin_trylock_irqsave() instead.
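
For illustration only, a minimal sketch of the trylock-all-the-way
pattern referred to above; the locks a and b are hypothetical and are
not part of this patch:

/*
 * Normal order is a -> b.  A path that has to take b first avoids the
 * ABBA deadlock by never blocking on a: it only trylocks it and backs
 * off on failure.
 */
static DEFINE_RAW_SPINLOCK(a);
static DEFINE_RAW_SPINLOCK(b);

static bool take_out_of_order(void)
{
        raw_spin_lock(&b);              /* abnormal order: b first */
        if (!raw_spin_trylock(&a)) {    /* must not wait for a     */
                raw_spin_unlock(&b);
                return false;           /* caller retries or falls back */
        }
        /* ... work under both locks ... */
        raw_spin_unlock(&a);
        raw_spin_unlock(&b);
        return true;
}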
Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core")
Signed-off-by: Waiman Long <longman@redhat.com>
---
kernel/locking/rtmutex.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ebebd0eec7f6..a32bc2bb5d5e 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1381,10 +1381,13 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
return 0;
/*
- * The mutex has currently no owner. Lock the wait lock and try to
- * acquire the lock. We use irqsave here to support early boot calls.
+ * The mutex has currently no owner. Try to lock the wait lock first.
+ * If successful, try to acquire the lock. We use irqsave here to
+ * support early boot calls. Trylock is used all the way to avoid
+ * circular lock dependency.
*/
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ if (!raw_spin_trylock_irqsave(&lock->wait_lock, flags))
+ return 0;
ret = __rt_mutex_slowtrylock(lock);
--
2.43.5
On Thu, Sep 26, 2024 at 11:13:15AM -0400, Waiman Long wrote:
> One reason to use a trylock is to avoid an ABBA deadlock when a locking
> sequence has to be taken outside the expected locking order. That only
> works if trylocks are used all the way through the abnormal sequence.
> Unfortunately, rt_mutex_trylock() breaks this assumption: it uses
> raw_spin_lock_irqsave() to acquire lock->wait_lock, so the trylock can
> still block on an internal lock. On a PREEMPT_RT kernel this produces a
> lockdep splat like the following:
>
> [...]
>
> Fix it by using raw_spin_trylock_irqsave() instead.

That truncated lockdep report doesn't really explain anything. Please
just transcribe the full lockdep report into something small and
coherent.
On 10/2/24 05:37, Peter Zijlstra wrote:
> On Thu, Sep 26, 2024 at 11:13:15AM -0400, Waiman Long wrote:
>> [...]
>>
>> Fix it by using raw_spin_trylock_irqsave() instead.
>
> That truncated lockdep report doesn't really explain anything. Please
> just transcribe the full lockdep report into something small and
> coherent.

I was trying to show where the offending call is coming from. I will
send a v2 with a condensed lockdep splat.

Cheers,
Longman
On Wed, Oct 02, 2024 at 01:54:16PM -0400, Waiman Long wrote:
> On 10/2/24 05:37, Peter Zijlstra wrote:
>> That truncated lockdep report doesn't really explain anything. Please
>> just transcribe the full lockdep report into something small and
>> coherent.
>
> I was trying to show where the offending call is coming from. I will
> send a v2 with a condensed lockdep splat.

No no no... explain the actual problem.

Is the problem that:

  sched_tick()
    task_tick_mm_cid()
      task_work_add()
        kasan_save_stack()
          idiotic crap while holding rq->__lock ?

Because afaict that is completely insane. And has nothing to do with
rtmutex.

We are not going to change rtmutex because instrumentation shit is shit.
On 10/7/24 10:50 AM, Peter Zijlstra wrote:
> No no no... explain the actual problem.
>
> Is the problem that:
>
>   sched_tick()
>     task_tick_mm_cid()
>       task_work_add()
>         kasan_save_stack()
>           idiotic crap while holding rq->__lock ?
>
> Because afaict that is completely insane. And has nothing to do with
> rtmutex.
>
> We are not going to change rtmutex because instrumentation shit is shit.

Yes, it is KASAN that causes the page allocation while holding
rq->__lock, so maybe we can blame KASAN for this. It is actually not a
problem for a non-PREEMPT_RT kernel because only trylocks are used
there. With a PREEMPT_RT kernel, however, rt_spin_trylock() does not use
trylocks all the way down. This is certainly a problem that we need to
fix, as there may be other similar cases not involving rq->__lock
lurking somewhere.

Cheers,
Longman
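
(To illustrate the "not a trylock all the way down" point, a condensed
sketch of the pre-patch PREEMPT_RT path, paraphrased from the hunk in
the patch above rather than taken verbatim from the sources: a
spin_trylock() on a spinlock_t can end up here, and wait_lock is taken
with a blocking acquisition.)

static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
        unsigned long flags;
        int ret;

        /* An owned lock fails the trylock without touching wait_lock. */
        if (rt_mutex_owner(lock))
                return 0;

        /* Pre-patch: a real spinning acquisition, not a trylock. */
        raw_spin_lock_irqsave(&lock->wait_lock, flags);
        ret = __rt_mutex_slowtrylock(lock);
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

        return ret;
}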
On Mon, Oct 07, 2024 at 11:23:32AM -0400, Waiman Long wrote:
> > Is the problem that:
> >
> >   sched_tick()
          raw_spin_lock(&rq->__lock);
> >     task_tick_mm_cid()
> >       task_work_add()
> >         kasan_save_stack()
> >           idiotic crap while holding rq->__lock ?
> >
> > Because afaict that is completely insane. And has nothing to do with
> > rtmutex.
> >
> > We are not going to change rtmutex because instrumentation shit is shit.
>
> Yes, it is KASAN that causes the page allocation while holding
> rq->__lock, so maybe we can blame KASAN for this. It is actually not a
> problem for a non-PREEMPT_RT kernel because only trylocks are used
> there. With a PREEMPT_RT kernel, however, rt_spin_trylock() does not
> use trylocks all the way down.

It has nothing to do with trylock, and everything to do with scheduler
locks being special.

But even so, trying to squirrel a spinlock inside a raw_spinlock is
dodgy at the best of times; yes, it mostly works, but it should be
avoided whenever possible.

And instrumentation just doesn't count.

> This is certainly a problem that we need to fix, as there may be other
> similar cases not involving rq->__lock lurking somewhere.

There cannot be, the lock order is:

  rtmutex->wait_lock
    task->pi_lock
      rq->__lock

Trying to subvert that order gets you a splat; any other:

  raw_spin_lock(&foo);
  spin_trylock(&bar);

will 'work', despite probably not being a very good idea.

Any case involving the scheduler locks needs to be eradicated, not
worked around.
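
(A minimal sketch of the nesting pattern described above, with
hypothetical locks foo and bar; under PREEMPT_RT, bar is an
rtmutex-based spinlock_t, so its trylock is the path sketched earlier.)

static DEFINE_RAW_SPINLOCK(foo);        /* always a real spinning lock  */
static DEFINE_SPINLOCK(bar);            /* an rt_mutex under PREEMPT_RT */

static void dodgy_nesting(void)
{
        raw_spin_lock(&foo);
        /*
         * Mostly works, but under PREEMPT_RT this trylock can end up
         * taking bar's internal wait_lock in the slowpath, so it is
         * not a trylock all the way down.
         */
        if (spin_trylock(&bar)) {
                /* ... */
                spin_unlock(&bar);
        }
        raw_spin_unlock(&foo);
}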
On 10/7/24 11:33 AM, Peter Zijlstra wrote:
> There cannot be, the lock order is:
>
>   rtmutex->wait_lock
>     task->pi_lock
>       rq->__lock
>
> Any case involving the scheduler locks needs to be eradicated, not
> worked around.

OK, I will see what I can do to work around this issue.

Cheers,
Longman
On Mon, Oct 07, 2024 at 11:54:54AM -0400, Waiman Long wrote:
> On 10/7/24 11:33 AM, Peter Zijlstra wrote:
>> Any case involving the scheduler locks needs to be eradicated, not
>> worked around.
>
> OK, I will see what I can do to work around this issue.

Something like the completely untested below might just work.
---
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index cf5e7e891a77..6d22414c5a83 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
 }
 
 enum task_work_notify_mode {
-        TWA_NONE,
+        TWA_NONE = 0,
         TWA_RESUME,
         TWA_SIGNAL,
         TWA_SIGNAL_NO_IPI,
         TWA_NMI_CURRENT,
+
+        TWA_FLAGS = 0xff00,
+        TWAF_NO_KASAN = 0x0100,
 };
 
 static inline bool task_work_pending(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 43e453ab7e20..e9b053b403c0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10458,7 +10458,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
                 return;
         if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
                 return;
-        task_work_add(curr, work, TWA_RESUME);
+        task_work_add(curr, work, TWA_RESUME | TWAF_NO_KASAN);
 }
 
 void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab497fafa7be..a58d55bba7a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3604,7 +3604,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
                 curr->node_stamp += period;
 
                 if (!time_before(jiffies, curr->mm->numa_next_scan))
-                        task_work_add(curr, work, TWA_RESUME);
+                        task_work_add(curr, work, TWA_RESUME | TWAF_NO_KASAN);
         }
 }
 
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5d14d639ac71..ff07a77bd7be 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -55,13 +55,18 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
                   enum task_work_notify_mode notify)
 {
         struct callback_head *head;
+        int flags = notify & TWA_FLAGS;
 
+        notify &= ~TWA_FLAGS;
         if (notify == TWA_NMI_CURRENT) {
                 if (WARN_ON_ONCE(task != current))
                         return -EINVAL;
                 if (!IS_ENABLED(CONFIG_IRQ_WORK))
                         return -EINVAL;
-        } else {
+                flags |= TWAF_NO_KASAN;
+        }
+
+        if (!(flags & TWAF_NO_KASAN)) {
                 /* record the work call stack in order to print it in KASAN reports */
                 kasan_record_aux_stack(work);
         }
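
(As a standalone illustration of the flag encoding used in the sketch
above; this is plain userspace C, not kernel code, with the enum values
copied from the hunk: the high byte of the notify argument carries flag
bits, the low bits the notify mode.)

#include <stdio.h>

enum task_work_notify_mode {
        TWA_NONE = 0,
        TWA_RESUME,
        TWA_SIGNAL,
        TWA_SIGNAL_NO_IPI,
        TWA_NMI_CURRENT,

        TWA_FLAGS = 0xff00,
        TWAF_NO_KASAN = 0x0100,
};

int main(void)
{
        enum task_work_notify_mode notify = TWA_RESUME | TWAF_NO_KASAN;

        int flags = notify & TWA_FLAGS;         /* high byte: flag bits   */
        notify &= ~TWA_FLAGS;                   /* low bits: notify mode  */

        printf("mode=%d no_kasan=%d\n", notify, !!(flags & TWAF_NO_KASAN));
        return 0;
}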
On 10/8/24 3:38 AM, Peter Zijlstra wrote:
> On Mon, Oct 07, 2024 at 11:54:54AM -0400, Waiman Long wrote:
>> OK, I will see what I can do to work around this issue.
>
> Something like the completely untested below might just work.

The real problem is the occasional need to allocate new pages to expand
the stack buffer in the stack depot, which takes additional locks.
Fortunately, there is a kasan_record_aux_stack_noalloc() variant that
prevents that. Below is my proposed solution, which is less restrictive.
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index cf5e7e891a77..2964171856e0 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
 }
 
 enum task_work_notify_mode {
-        TWA_NONE,
+        TWA_NONE = 0,
         TWA_RESUME,
         TWA_SIGNAL,
         TWA_SIGNAL_NO_IPI,
         TWA_NMI_CURRENT,
+
+        TWA_FLAGS = 0xff00,
+        TWAF_NO_ALLOC = 0x0100,
 };
 
 static inline bool task_work_pending(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 43e453ab7e20..0259301e572e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10458,7 +10458,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
                 return;
         if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
                 return;
-        task_work_add(curr, work, TWA_RESUME);
+
+        /* No page allocation under rq lock */
+        task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
 }
 
 void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5d14d639ac71..c969f1f26be5 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
                   enum task_work_notify_mode notify)
 {
         struct callback_head *head;
+        int flags = notify & TWA_FLAGS;
 
+        notify &= ~TWA_FLAGS;
         if (notify == TWA_NMI_CURRENT) {
                 if (WARN_ON_ONCE(task != current))
                         return -EINVAL;
                 if (!IS_ENABLED(CONFIG_IRQ_WORK))
                         return -EINVAL;
         } else {
-                /* record the work call stack in order to print it in KASAN reports */
-                kasan_record_aux_stack(work);
+                /*
+                 * Record the work call stack in order to print it in KASAN
+                 * reports.
+                 *
+                 * Note that stack allocation can fail if TWAF_NO_ALLOC flag
+                 * is set and new page is needed to expand the stack buffer.
+                 */
+                if (flags & TWAF_NO_ALLOC)
+                        kasan_record_aux_stack_noalloc(work);
+                else
+                        kasan_record_aux_stack(work);
         }
 
         head = READ_ONCE(task->task_works);