[PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section

Waiman Long posted 1 patch 10 months, 3 weeks ago
kernel/locking/semaphore.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
[PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Waiman Long 10 months, 3 weeks ago
A circular lock dependency splat has been seen involving down_trylock().

[ 4011.795602] ======================================================
[ 4011.795603] WARNING: possible circular locking dependency detected
[ 4011.795607] 6.12.0-41.el10.s390x+debug
[ 4011.795612] ------------------------------------------------------
[ 4011.795613] dd/32479 is trying to acquire lock:
[ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
[ 4011.795636]
[ 4011.795636] but task is already holding lock:
[ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0

  the existing dependency chain (in reverse order) is:
  -> #4 (&zone->lock){-.-.}-{2:2}:
  -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
  -> #2 (&rq->__lock){-.-.}-{2:2}:
  -> #1 (&p->pi_lock){-.-.}-{2:2}:
  -> #0 ((console_sem).lock){-.-.}-{2:2}:

The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
while holding the console_sem raw_spinlock. This dependency can be broken
by using wake_q to do the wakeup instead of calling try_to_wake_up()
under the console_sem lock. This will also make the semaphore's
raw_spinlock become a terminal lock without taking any further locks
underneath it.

The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
the debug_objects_fill_pool() helper function in the debugobjects code.

[ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
[ 4011.795650]        __lock_acquire+0xe86/0x1cc0
[ 4011.795655]        lock_acquire.part.0+0x258/0x630
[ 4011.795657]        lock_acquire+0xb8/0xe0
[ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
[ 4011.795663]        rmqueue_bulk+0xac/0x8f0
[ 4011.795665]        __rmqueue_pcplist+0x580/0x830
[ 4011.795667]        rmqueue_pcplist+0xfc/0x470
[ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
[ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
[ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
[ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
[ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
[ 4011.795684]        allocate_slab+0x320/0x460
[ 4011.795686]        ___slab_alloc+0xa58/0x12b0
[ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
[ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
[ 4011.795692]        fill_pool+0xf6/0x450
[ 4011.795697]        debug_object_activate+0xfe/0x360
[ 4011.795700]        enqueue_hrtimer+0x34/0x190
[ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
[ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
[ 4011.795707]        hrtimer_interrupt+0x316/0x760
[ 4011.795709]        do_IRQ+0x9a/0xe0
[ 4011.795712]        do_irq_async+0xf6/0x160

Normally raw_spinlock to spinlock dependency is not legit
and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
but debug_objects_fill_pool() is an exception as it explicitly
allows this dependency for non-PREEMPT_RT kernel without causing
PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
legit and not a bug.

Anyway, semaphore is the only locking primitive left that is still
using try_to_wake_up() to do wakeup inside critical section, all the
other locking primitives had been migrated to use wake_q to do wakeup
outside of the critical section. It is also possible that there are
other circular locking dependencies involving printk/console_sem or
other existing/new semaphores lurking somewhere which may show up in
the future. Let's just do the migration now to wake_q to avoid headaches
like this.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/locking/semaphore.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 34bfae72f295..de9117c0e671 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
@@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
 static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
 
 /**
  * down - acquire the semaphore
@@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout);
 void __sched up(struct semaphore *sem)
 {
 	unsigned long flags;
+	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(list_empty(&sem->wait_list)))
 		sem->count++;
 	else
-		__up(sem);
+		__up(sem, &wake_q);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 }
 EXPORT_SYMBOL(up);
 
@@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
 
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+				  struct wake_q_head *wake_q)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
 	list_del(&waiter->list);
 	waiter->up = true;
-	wake_up_process(waiter->task);
+	wake_q_add(wake_q, waiter->task);
 }
-- 
2.47.1
Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Waiman Long 10 months, 1 week ago
On 1/26/25 8:31 PM, Waiman Long wrote:
> A circular lock dependency splat has been seen involving down_trylock().
>
> [ 4011.795602] ======================================================
> [ 4011.795603] WARNING: possible circular locking dependency detected
> [ 4011.795607] 6.12.0-41.el10.s390x+debug
> [ 4011.795612] ------------------------------------------------------
> [ 4011.795613] dd/32479 is trying to acquire lock:
> [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
> [ 4011.795636]
> [ 4011.795636] but task is already holding lock:
> [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
>
>    the existing dependency chain (in reverse order) is:
>    -> #4 (&zone->lock){-.-.}-{2:2}:
>    -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
>    -> #2 (&rq->__lock){-.-.}-{2:2}:
>    -> #1 (&p->pi_lock){-.-.}-{2:2}:
>    -> #0 ((console_sem).lock){-.-.}-{2:2}:
>
> The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
> while holding the console.sem raw_spinlock. This dependency can be broken
> by using wake_q to do the wakeup instead of calling try_to_wake_up()
> under the console_sem lock. This will also make the semaphore's
> raw_spinlock become a terminal lock without taking any further locks
> underneath it.
>
> The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
> spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
> the debug_objects_fill_pool() helper function in the debugobjects code.
>
> [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
> [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
> [ 4011.795655]        lock_acquire.part.0+0x258/0x630
> [ 4011.795657]        lock_acquire+0xb8/0xe0
> [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
> [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
> [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
> [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
> [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
> [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
> [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
> [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
> [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
> [ 4011.795684]        allocate_slab+0x320/0x460
> [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
> [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
> [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
> [ 4011.795692]        fill_pool+0xf6/0x450
> [ 4011.795697]        debug_object_activate+0xfe/0x360
> [ 4011.795700]        enqueue_hrtimer+0x34/0x190
> [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
> [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
> [ 4011.795707]        hrtimer_interrupt+0x316/0x760
> [ 4011.795709]        do_IRQ+0x9a/0xe0
> [ 4011.795712]        do_irq_async+0xf6/0x160
>
> Normally raw_spinlock to spinlock dependency is not legit
> and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
> but debug_objects_fill_pool() is an exception as it explicitly
> allows this dependency for non-PREEMPT_RT kernel without causing
> PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
> legit and not a bug.
>
> Anyway, semaphore is the only locking primitive left that is still
> using try_to_wake_up() to do wakeup inside critical section, all the
> other locking primitives had been migrated to use wake_q to do wakeup
> outside of the critical section. It is also possible that there are
> other circular locking dependencies involving printk/console_sem or
> other existing/new semaphores lurking somewhere which may show up in
> the future. Let just do the migration now to wake_q to avoid headache
> like this.

I can also add the following as another instance where deadlock can happen.

Reported-by: syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com

Cheers,
Longman

> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   kernel/locking/semaphore.c | 13 +++++++++----
>   1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
> index 34bfae72f295..de9117c0e671 100644
> --- a/kernel/locking/semaphore.c
> +++ b/kernel/locking/semaphore.c
> @@ -29,6 +29,7 @@
>   #include <linux/export.h>
>   #include <linux/sched.h>
>   #include <linux/sched/debug.h>
> +#include <linux/sched/wake_q.h>
>   #include <linux/semaphore.h>
>   #include <linux/spinlock.h>
>   #include <linux/ftrace.h>
> @@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem);
>   static noinline int __down_interruptible(struct semaphore *sem);
>   static noinline int __down_killable(struct semaphore *sem);
>   static noinline int __down_timeout(struct semaphore *sem, long timeout);
> -static noinline void __up(struct semaphore *sem);
> +static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
>   
>   /**
>    * down - acquire the semaphore
> @@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout);
>   void __sched up(struct semaphore *sem)
>   {
>   	unsigned long flags;
> +	DEFINE_WAKE_Q(wake_q);
>   
>   	raw_spin_lock_irqsave(&sem->lock, flags);
>   	if (likely(list_empty(&sem->wait_list)))
>   		sem->count++;
>   	else
> -		__up(sem);
> +		__up(sem, &wake_q);
>   	raw_spin_unlock_irqrestore(&sem->lock, flags);
> +	if (!wake_q_empty(&wake_q))
> +		wake_up_q(&wake_q);
>   }
>   EXPORT_SYMBOL(up);
>   
> @@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
>   	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
>   }
>   
> -static noinline void __sched __up(struct semaphore *sem)
> +static noinline void __sched __up(struct semaphore *sem,
> +				  struct wake_q_head *wake_q)
>   {
>   	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
>   						struct semaphore_waiter, list);
>   	list_del(&waiter->list);
>   	waiter->up = true;
> -	wake_up_process(waiter->task);
> +	wake_q_add(wake_q, waiter->task);
>   }
Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Boqun Feng 10 months, 1 week ago
On Tue, Feb 11, 2025 at 09:18:56PM -0500, Waiman Long wrote:
> 
> On 1/26/25 8:31 PM, Waiman Long wrote:
> > A circular lock dependency splat has been seen involving down_trylock().
> > 
> > [ 4011.795602] ======================================================
> > [ 4011.795603] WARNING: possible circular locking dependency detected
> > [ 4011.795607] 6.12.0-41.el10.s390x+debug
> > [ 4011.795612] ------------------------------------------------------
> > [ 4011.795613] dd/32479 is trying to acquire lock:
> > [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
> > [ 4011.795636]
> > [ 4011.795636] but task is already holding lock:
> > [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
> > 
> >    the existing dependency chain (in reverse order) is:
> >    -> #4 (&zone->lock){-.-.}-{2:2}:
> >    -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
> >    -> #2 (&rq->__lock){-.-.}-{2:2}:
> >    -> #1 (&p->pi_lock){-.-.}-{2:2}:
> >    -> #0 ((console_sem).lock){-.-.}-{2:2}:
> > 
> > The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
> > while holding the console.sem raw_spinlock. This dependency can be broken
> > by using wake_q to do the wakeup instead of calling try_to_wake_up()
> > under the console_sem lock. This will also make the semaphore's
> > raw_spinlock become a terminal lock without taking any further locks
> > underneath it.
> > 
> > The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
> > spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
> > the debug_objects_fill_pool() helper function in the debugobjects code.
> > 
> > [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
> > [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
> > [ 4011.795655]        lock_acquire.part.0+0x258/0x630
> > [ 4011.795657]        lock_acquire+0xb8/0xe0
> > [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
> > [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
> > [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
> > [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
> > [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
> > [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
> > [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
> > [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
> > [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
> > [ 4011.795684]        allocate_slab+0x320/0x460
> > [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
> > [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
> > [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
> > [ 4011.795692]        fill_pool+0xf6/0x450
> > [ 4011.795697]        debug_object_activate+0xfe/0x360
> > [ 4011.795700]        enqueue_hrtimer+0x34/0x190
> > [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
> > [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
> > [ 4011.795707]        hrtimer_interrupt+0x316/0x760
> > [ 4011.795709]        do_IRQ+0x9a/0xe0
> > [ 4011.795712]        do_irq_async+0xf6/0x160
> > 
> > Normally raw_spinlock to spinlock dependency is not legit
> > and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
> > but debug_objects_fill_pool() is an exception as it explicitly
> > allows this dependency for non-PREEMPT_RT kernel without causing
> > PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
> > legit and not a bug.
> > 
> > Anyway, semaphore is the only locking primitive left that is still
> > using try_to_wake_up() to do wakeup inside critical section, all the
> > other locking primitives had been migrated to use wake_q to do wakeup
> > outside of the critical section. It is also possible that there are
> > other circular locking dependencies involving printk/console_sem or
> > other existing/new semaphores lurking somewhere which may show up in
> > the future. Let just do the migration now to wake_q to avoid headache
> > like this.
> 
> I can also add the following as another instance where deadlock can happen.
> 
> Reported-by:syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
> 

FWIW, I already queued in my lockdep-for-tip branch, will send it in a
PR to Peter in one or two weeks (in case he hasn't taken it before
then).

BTW, do we need a "Fixes" tag for stable kernels?

Regards,
Boqun

> Cheers,
> Longman
> 
[...]
Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Waiman Long 10 months, 1 week ago
On 2/12/25 12:45 AM, Boqun Feng wrote:
> On Tue, Feb 11, 2025 at 09:18:56PM -0500, Waiman Long wrote:
>> On 1/26/25 8:31 PM, Waiman Long wrote:
>>> A circular lock dependency splat has been seen involving down_trylock().
>>>
>>> [ 4011.795602] ======================================================
>>> [ 4011.795603] WARNING: possible circular locking dependency detected
>>> [ 4011.795607] 6.12.0-41.el10.s390x+debug
>>> [ 4011.795612] ------------------------------------------------------
>>> [ 4011.795613] dd/32479 is trying to acquire lock:
>>> [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
>>> [ 4011.795636]
>>> [ 4011.795636] but task is already holding lock:
>>> [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
>>>
>>>     the existing dependency chain (in reverse order) is:
>>>     -> #4 (&zone->lock){-.-.}-{2:2}:
>>>     -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
>>>     -> #2 (&rq->__lock){-.-.}-{2:2}:
>>>     -> #1 (&p->pi_lock){-.-.}-{2:2}:
>>>     -> #0 ((console_sem).lock){-.-.}-{2:2}:
>>>
>>> The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
>>> while holding the console.sem raw_spinlock. This dependency can be broken
>>> by using wake_q to do the wakeup instead of calling try_to_wake_up()
>>> under the console_sem lock. This will also make the semaphore's
>>> raw_spinlock become a terminal lock without taking any further locks
>>> underneath it.
>>>
>>> The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
>>> spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
>>> the debug_objects_fill_pool() helper function in the debugobjects code.
>>>
>>> [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
>>> [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
>>> [ 4011.795655]        lock_acquire.part.0+0x258/0x630
>>> [ 4011.795657]        lock_acquire+0xb8/0xe0
>>> [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
>>> [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
>>> [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
>>> [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
>>> [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
>>> [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
>>> [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
>>> [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
>>> [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
>>> [ 4011.795684]        allocate_slab+0x320/0x460
>>> [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
>>> [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
>>> [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
>>> [ 4011.795692]        fill_pool+0xf6/0x450
>>> [ 4011.795697]        debug_object_activate+0xfe/0x360
>>> [ 4011.795700]        enqueue_hrtimer+0x34/0x190
>>> [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
>>> [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
>>> [ 4011.795707]        hrtimer_interrupt+0x316/0x760
>>> [ 4011.795709]        do_IRQ+0x9a/0xe0
>>> [ 4011.795712]        do_irq_async+0xf6/0x160
>>>
>>> Normally raw_spinlock to spinlock dependency is not legit
>>> and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
>>> but debug_objects_fill_pool() is an exception as it explicitly
>>> allows this dependency for non-PREEMPT_RT kernel without causing
>>> PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
>>> legit and not a bug.
>>>
>>> Anyway, semaphore is the only locking primitive left that is still
>>> using try_to_wake_up() to do wakeup inside critical section, all the
>>> other locking primitives had been migrated to use wake_q to do wakeup
>>> outside of the critical section. It is also possible that there are
>>> other circular locking dependencies involving printk/console_sem or
>>> other existing/new semaphores lurking somewhere which may show up in
>>> the future. Let just do the migration now to wake_q to avoid headache
>>> like this.
>> I can also add the following as another instance where deadlock can happen.
>>
>> Reported-by:syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
>>
> FWIW, I already queued in my lockdep-for-tip branch, will send it in a
> PR to Peter in one or two weeks (in case he hasn't taken it before
> then).
>
> BTW, do we need a "Fixes" tag for stable kernels?

After some more thought, I realize that this patch doesn't really fix 
the circular lock dependency problem, it just removes console_sem.lock 
from it. The problem is that printk() can be called in any context. To 
really solve the problem, we will need some kind of deferred wakeup 
using workqueue, for instance. As printing to the console is inherently 
slow, adding some more latency to the wakeup process shouldn't really be 
a problem. This patch will be the first step, I will work on additional 
patches to complete this deferred wakeup functionality. So I don't need 
to add a Fixes tag for now. You can either take this patch out or just 
leave it there.

Cheers,
Longman
Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Boqun Feng 10 months, 1 week ago
On Wed, Feb 12, 2025 at 09:10:25AM -0500, Waiman Long wrote:
> On 2/12/25 12:45 AM, Boqun Feng wrote:
> > On Tue, Feb 11, 2025 at 09:18:56PM -0500, Waiman Long wrote:
> > > On 1/26/25 8:31 PM, Waiman Long wrote:
> > > > A circular lock dependency splat has been seen involving down_trylock().
> > > > 
> > > > [ 4011.795602] ======================================================
> > > > [ 4011.795603] WARNING: possible circular locking dependency detected
> > > > [ 4011.795607] 6.12.0-41.el10.s390x+debug
> > > > [ 4011.795612] ------------------------------------------------------
> > > > [ 4011.795613] dd/32479 is trying to acquire lock:
> > > > [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
> > > > [ 4011.795636]
> > > > [ 4011.795636] but task is already holding lock:
> > > > [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
> > > > 
> > > >     the existing dependency chain (in reverse order) is:
> > > >     -> #4 (&zone->lock){-.-.}-{2:2}:
> > > >     -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
> > > >     -> #2 (&rq->__lock){-.-.}-{2:2}:
> > > >     -> #1 (&p->pi_lock){-.-.}-{2:2}:
> > > >     -> #0 ((console_sem).lock){-.-.}-{2:2}:
> > > > 
> > > > The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
> > > > while holding the console.sem raw_spinlock. This dependency can be broken
> > > > by using wake_q to do the wakeup instead of calling try_to_wake_up()
> > > > under the console_sem lock. This will also make the semaphore's
> > > > raw_spinlock become a terminal lock without taking any further locks
> > > > underneath it.
> > > > 
> > > > The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
> > > > spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
> > > > the debug_objects_fill_pool() helper function in the debugobjects code.
> > > > 
> > > > [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
> > > > [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
> > > > [ 4011.795655]        lock_acquire.part.0+0x258/0x630
> > > > [ 4011.795657]        lock_acquire+0xb8/0xe0
> > > > [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
> > > > [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
> > > > [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
> > > > [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
> > > > [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
> > > > [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
> > > > [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
> > > > [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
> > > > [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
> > > > [ 4011.795684]        allocate_slab+0x320/0x460
> > > > [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
> > > > [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
> > > > [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
> > > > [ 4011.795692]        fill_pool+0xf6/0x450
> > > > [ 4011.795697]        debug_object_activate+0xfe/0x360
> > > > [ 4011.795700]        enqueue_hrtimer+0x34/0x190
> > > > [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
> > > > [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
> > > > [ 4011.795707]        hrtimer_interrupt+0x316/0x760
> > > > [ 4011.795709]        do_IRQ+0x9a/0xe0
> > > > [ 4011.795712]        do_irq_async+0xf6/0x160
> > > > 
> > > > Normally raw_spinlock to spinlock dependency is not legit
> > > > and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
> > > > but debug_objects_fill_pool() is an exception as it explicitly
> > > > allows this dependency for non-PREEMPT_RT kernel without causing
> > > > PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
> > > > legit and not a bug.
> > > > 
> > > > Anyway, semaphore is the only locking primitive left that is still
> > > > using try_to_wake_up() to do wakeup inside critical section, all the
> > > > other locking primitives had been migrated to use wake_q to do wakeup
> > > > outside of the critical section. It is also possible that there are
> > > > other circular locking dependencies involving printk/console_sem or
> > > > other existing/new semaphores lurking somewhere which may show up in
> > > > the future. Let just do the migration now to wake_q to avoid headache
> > > > like this.
> > > I can also add the following as another instance where deadlock can happen.
> > > 
> > > Reported-by:syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
> > > 
> > FWIW, I already queued in my lockdep-for-tip branch, will send it in a
> > PR to Peter in one or two weeks (in case he hasn't taken it before
> > then).
> > 
> > BTW, do we need a "Fixes" tag for stable kernels?
> 
> After some more thought, I realize that this patch doesn't really fix the
> circular lock dependency problem, it just remove console_sem.lock from it.
> The problem is that printk() can be called in any context. To really solve
> the problem, we will need some kind of deferred wakeup using workqueue, for
> instance. As printing to the console is inherently slow, adding some more

Hmm... but your patch does remove the dependency console_sem.lock ->
pi_lock, right? So it does fix the circular lock dependency problem. Or
do you mean that it doesn't fix all the deadlocks that may be caused by
printk()?

Regards,
Boqun

> latency to the wakeup process shouldn't really be a problem. This patch will
> be the first step, I will work on additional patches to complete this
> deferred wakeup functionality. So I don't need to add a Fixes tag for now.
> You can either take this patch out or just leave it there.
> 
> Cheers,
> Longman
>
Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Waiman Long 10 months, 1 week ago
On 2/12/25 11:38 AM, Boqun Feng wrote:
> On Wed, Feb 12, 2025 at 09:10:25AM -0500, Waiman Long wrote:
>> On 2/12/25 12:45 AM, Boqun Feng wrote:
>>> On Tue, Feb 11, 2025 at 09:18:56PM -0500, Waiman Long wrote:
>>>> On 1/26/25 8:31 PM, Waiman Long wrote:
>>>>> A circular lock dependency splat has been seen involving down_trylock().
>>>>>
>>>>> [ 4011.795602] ======================================================
>>>>> [ 4011.795603] WARNING: possible circular locking dependency detected
>>>>> [ 4011.795607] 6.12.0-41.el10.s390x+debug
>>>>> [ 4011.795612] ------------------------------------------------------
>>>>> [ 4011.795613] dd/32479 is trying to acquire lock:
>>>>> [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
>>>>> [ 4011.795636]
>>>>> [ 4011.795636] but task is already holding lock:
>>>>> [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
>>>>>
>>>>>      the existing dependency chain (in reverse order) is:
>>>>>      -> #4 (&zone->lock){-.-.}-{2:2}:
>>>>>      -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
>>>>>      -> #2 (&rq->__lock){-.-.}-{2:2}:
>>>>>      -> #1 (&p->pi_lock){-.-.}-{2:2}:
>>>>>      -> #0 ((console_sem).lock){-.-.}-{2:2}:
>>>>>
>>>>> The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
>>>>> while holding the console.sem raw_spinlock. This dependency can be broken
>>>>> by using wake_q to do the wakeup instead of calling try_to_wake_up()
>>>>> under the console_sem lock. This will also make the semaphore's
>>>>> raw_spinlock become a terminal lock without taking any further locks
>>>>> underneath it.
>>>>>
>>>>> The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
>>>>> spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
>>>>> the debug_objects_fill_pool() helper function in the debugobjects code.
>>>>>
>>>>> [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
>>>>> [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
>>>>> [ 4011.795655]        lock_acquire.part.0+0x258/0x630
>>>>> [ 4011.795657]        lock_acquire+0xb8/0xe0
>>>>> [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
>>>>> [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
>>>>> [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
>>>>> [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
>>>>> [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
>>>>> [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
>>>>> [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
>>>>> [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
>>>>> [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
>>>>> [ 4011.795684]        allocate_slab+0x320/0x460
>>>>> [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
>>>>> [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
>>>>> [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
>>>>> [ 4011.795692]        fill_pool+0xf6/0x450
>>>>> [ 4011.795697]        debug_object_activate+0xfe/0x360
>>>>> [ 4011.795700]        enqueue_hrtimer+0x34/0x190
>>>>> [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
>>>>> [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
>>>>> [ 4011.795707]        hrtimer_interrupt+0x316/0x760
>>>>> [ 4011.795709]        do_IRQ+0x9a/0xe0
>>>>> [ 4011.795712]        do_irq_async+0xf6/0x160
>>>>>
>>>>> Normally raw_spinlock to spinlock dependency is not legit
>>>>> and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
>>>>> but debug_objects_fill_pool() is an exception as it explicitly
>>>>> allows this dependency for non-PREEMPT_RT kernel without causing
>>>>> PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
>>>>> legit and not a bug.
>>>>>
>>>>> Anyway, semaphore is the only locking primitive left that is still
>>>>> using try_to_wake_up() to do wakeup inside critical section, all the
>>>>> other locking primitives had been migrated to use wake_q to do wakeup
>>>>> outside of the critical section. It is also possible that there are
>>>>> other circular locking dependencies involving printk/console_sem or
>>>>> other existing/new semaphores lurking somewhere which may show up in
>>>>> the future. Let just do the migration now to wake_q to avoid headache
>>>>> like this.
>>>> I can also add the following as another instance where deadlock can happen.
>>>>
>>>> Reported-by:syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
>>>>
>>> FWIW, I already queued in my lockdep-for-tip branch, will send it in a
>>> PR to Peter in one or two weeks (in case he hasn't taken it before
>>> then).
>>>
>>> BTW, do we need a "Fixes" tag for stable kernels?
>> After some more thought, I realize that this patch doesn't really fix the
>> circular lock dependency problem, it just remove console_sem.lock from it.
>> The problem is that printk() can be called in any context. To really solve
>> the problem, we will need some kind of deferred wakeup using workqueue, for
>> instance. As printing to the console is inherently slow, adding some more
> Hmm... but your patch does remove the dependency console_sem.lock ->
> pi_lock, right? So it does fix the circular lock dependency problem. Or
> do you mean that it doesn't fix all the deadlocks that may cause by
> printk()?

Right, it doesn't fix all the deadlocks that may be caused by printk().
A similar circular lock dependency splat will still happen. It will start
with pi_lock instead of console_sem.lock and will have one less lock in
the circular list. It is caused by waking up a process within the printk()
context. That is why I think the proper solution is to have a deferred
wakeup. It will be specific to the printk() use cases.

Cheers,
Longman

Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Boqun Feng 10 months, 1 week ago
On Wed, Feb 12, 2025 at 11:45:31AM -0500, Waiman Long wrote:
> 
> On 2/12/25 11:38 AM, Boqun Feng wrote:
> > On Wed, Feb 12, 2025 at 09:10:25AM -0500, Waiman Long wrote:
> > > On 2/12/25 12:45 AM, Boqun Feng wrote:
> > > > On Tue, Feb 11, 2025 at 09:18:56PM -0500, Waiman Long wrote:
> > > > > On 1/26/25 8:31 PM, Waiman Long wrote:
> > > > > > A circular lock dependency splat has been seen involving down_trylock().
> > > > > > 
> > > > > > [ 4011.795602] ======================================================
> > > > > > [ 4011.795603] WARNING: possible circular locking dependency detected
> > > > > > [ 4011.795607] 6.12.0-41.el10.s390x+debug
> > > > > > [ 4011.795612] ------------------------------------------------------
> > > > > > [ 4011.795613] dd/32479 is trying to acquire lock:
> > > > > > [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
> > > > > > [ 4011.795636]
> > > > > > [ 4011.795636] but task is already holding lock:
> > > > > > [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
> > > > > > 
> > > > > >      the existing dependency chain (in reverse order) is:
> > > > > >      -> #4 (&zone->lock){-.-.}-{2:2}:
> > > > > >      -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
> > > > > >      -> #2 (&rq->__lock){-.-.}-{2:2}:
> > > > > >      -> #1 (&p->pi_lock){-.-.}-{2:2}:
> > > > > >      -> #0 ((console_sem).lock){-.-.}-{2:2}:
> > > > > > 
> > > > > > The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
> > > > > > while holding the console.sem raw_spinlock. This dependency can be broken
> > > > > > by using wake_q to do the wakeup instead of calling try_to_wake_up()
> > > > > > under the console_sem lock. This will also make the semaphore's
> > > > > > raw_spinlock become a terminal lock without taking any further locks
> > > > > > underneath it.
> > > > > > 
> > > > > > The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
> > > > > > spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
> > > > > > the debug_objects_fill_pool() helper function in the debugobjects code.
> > > > > > 
> > > > > > [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
> > > > > > [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
> > > > > > [ 4011.795655]        lock_acquire.part.0+0x258/0x630
> > > > > > [ 4011.795657]        lock_acquire+0xb8/0xe0
> > > > > > [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
> > > > > > [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
> > > > > > [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
> > > > > > [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
> > > > > > [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
> > > > > > [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
> > > > > > [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
> > > > > > [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
> > > > > > [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
> > > > > > [ 4011.795684]        allocate_slab+0x320/0x460
> > > > > > [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
> > > > > > [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
> > > > > > [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
> > > > > > [ 4011.795692]        fill_pool+0xf6/0x450
> > > > > > [ 4011.795697]        debug_object_activate+0xfe/0x360
> > > > > > [ 4011.795700]        enqueue_hrtimer+0x34/0x190
> > > > > > [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
> > > > > > [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
> > > > > > [ 4011.795707]        hrtimer_interrupt+0x316/0x760
> > > > > > [ 4011.795709]        do_IRQ+0x9a/0xe0
> > > > > > [ 4011.795712]        do_irq_async+0xf6/0x160
> > > > > > 
> > > > > > Normally raw_spinlock to spinlock dependency is not legit
> > > > > > and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
> > > > > > but debug_objects_fill_pool() is an exception as it explicitly
> > > > > > allows this dependency for non-PREEMPT_RT kernel without causing
> > > > > > PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
> > > > > > legit and not a bug.
> > > > > > 
> > > > > > Anyway, semaphore is the only locking primitive left that is still
> > > > > > using try_to_wake_up() to do wakeup inside critical section, all the
> > > > > > other locking primitives had been migrated to use wake_q to do wakeup
> > > > > > outside of the critical section. It is also possible that there are
> > > > > > other circular locking dependencies involving printk/console_sem or
> > > > > > other existing/new semaphores lurking somewhere which may show up in
> > > > > > the future. Let just do the migration now to wake_q to avoid headache
> > > > > > like this.
> > > > > I can also add the following as another instance where deadlock can happen.
> > > > > 
> > > > > Reported-by:syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
> > > > > 
> > > > FWIW, I already queued in my lockdep-for-tip branch, will send it in a
> > > > PR to Peter in one or two weeks (in case he hasn't taken it before
> > > > then).
> > > > 
> > > > BTW, do we need a "Fixes" tag for stable kernels?
> > > After some more thought, I realize that this patch doesn't really fix the
> > > circular lock dependency problem, it just remove console_sem.lock from it.
> > > The problem is that printk() can be called in any context. To really solve
> > > the problem, we will need some kind of deferred wakeup using workqueue, for
> > > instance. As printing to the console is inherently slow, adding some more
> > Hmm... but your patch does remove the dependency console_sem.lock ->
> > pi_lock, right? So it does fix the circular lock dependency problem. Or
> > do you mean that it doesn't fix all the deadlocks that may cause by
> > printk()?
> 
> Right, it doesn't  fix all the deadlocks that may be caused by printk().
> Similar circular lock dependency splat will still happen. It will start with
> pi_lock instead of console_sem.lock and will have one less lock in the
> circular list. It is caused by waking up process within the printk()
> context. That is why I think the proper solution is to have a deferred
> wakeup. It will be specific to the printk() use cases.
> 

Sounds good. Then I will take this patch, because 1) it fixes an issue
(although partially) and 2) the fix is in the right direction (i.e.
avoiding wakeups inside the critical section). Thanks!

Regards,
Boqun

> Cheers,
> Longman
> 
Re: [PATCH v3] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by Waiman Long 10 months, 1 week ago
On 2/12/25 12:09 PM, Boqun Feng wrote:
> On Wed, Feb 12, 2025 at 11:45:31AM -0500, Waiman Long wrote:
>> On 2/12/25 11:38 AM, Boqun Feng wrote:
>>> On Wed, Feb 12, 2025 at 09:10:25AM -0500, Waiman Long wrote:
>>>> On 2/12/25 12:45 AM, Boqun Feng wrote:
>>>>> On Tue, Feb 11, 2025 at 09:18:56PM -0500, Waiman Long wrote:
>>>>>> On 1/26/25 8:31 PM, Waiman Long wrote:
>>>>>>> A circular lock dependency splat has been seen involving down_trylock().
>>>>>>>
>>>>>>> [ 4011.795602] ======================================================
>>>>>>> [ 4011.795603] WARNING: possible circular locking dependency detected
>>>>>>> [ 4011.795607] 6.12.0-41.el10.s390x+debug
>>>>>>> [ 4011.795612] ------------------------------------------------------
>>>>>>> [ 4011.795613] dd/32479 is trying to acquire lock:
>>>>>>> [ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
>>>>>>> [ 4011.795636]
>>>>>>> [ 4011.795636] but task is already holding lock:
>>>>>>> [ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
>>>>>>>
>>>>>>>       the existing dependency chain (in reverse order) is:
>>>>>>>       -> #4 (&zone->lock){-.-.}-{2:2}:
>>>>>>>       -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
>>>>>>>       -> #2 (&rq->__lock){-.-.}-{2:2}:
>>>>>>>       -> #1 (&p->pi_lock){-.-.}-{2:2}:
>>>>>>>       -> #0 ((console_sem).lock){-.-.}-{2:2}:
>>>>>>>
>>>>>>> The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
>>>>>>> while holding the console.sem raw_spinlock. This dependency can be broken
>>>>>>> by using wake_q to do the wakeup instead of calling try_to_wake_up()
>>>>>>> under the console_sem lock. This will also make the semaphore's
>>>>>>> raw_spinlock become a terminal lock without taking any further locks
>>>>>>> underneath it.
>>>>>>>
>>>>>>> The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
>>>>>>> spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
>>>>>>> the debug_objects_fill_pool() helper function in the debugobjects code.
>>>>>>>
>>>>>>> [ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
>>>>>>> [ 4011.795650]        __lock_acquire+0xe86/0x1cc0
>>>>>>> [ 4011.795655]        lock_acquire.part.0+0x258/0x630
>>>>>>> [ 4011.795657]        lock_acquire+0xb8/0xe0
>>>>>>> [ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
>>>>>>> [ 4011.795663]        rmqueue_bulk+0xac/0x8f0
>>>>>>> [ 4011.795665]        __rmqueue_pcplist+0x580/0x830
>>>>>>> [ 4011.795667]        rmqueue_pcplist+0xfc/0x470
>>>>>>> [ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
>>>>>>> [ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
>>>>>>> [ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
>>>>>>> [ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
>>>>>>> [ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
>>>>>>> [ 4011.795684]        allocate_slab+0x320/0x460
>>>>>>> [ 4011.795686]        ___slab_alloc+0xa58/0x12b0
>>>>>>> [ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
>>>>>>> [ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
>>>>>>> [ 4011.795692]        fill_pool+0xf6/0x450
>>>>>>> [ 4011.795697]        debug_object_activate+0xfe/0x360
>>>>>>> [ 4011.795700]        enqueue_hrtimer+0x34/0x190
>>>>>>> [ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
>>>>>>> [ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
>>>>>>> [ 4011.795707]        hrtimer_interrupt+0x316/0x760
>>>>>>> [ 4011.795709]        do_IRQ+0x9a/0xe0
>>>>>>> [ 4011.795712]        do_irq_async+0xf6/0x160
>>>>>>>
>>>>>>> Normally raw_spinlock to spinlock dependency is not legit
>>>>>>> and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
>>>>>>> but debug_objects_fill_pool() is an exception as it explicitly
>>>>>>> allows this dependency for non-PREEMPT_RT kernel without causing
>>>>>>> PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
>>>>>>> legit and not a bug.
>>>>>>>
>>>>>>> Anyway, semaphore is the only locking primitive left that is still
>>>>>>> using try_to_wake_up() to do wakeup inside critical section, all the
>>>>>>> other locking primitives had been migrated to use wake_q to do wakeup
>>>>>>> outside of the critical section. It is also possible that there are
>>>>>>> other circular locking dependencies involving printk/console_sem or
>>>>>>> other existing/new semaphores lurking somewhere which may show up in
>>>>>>> the future. Let just do the migration now to wake_q to avoid headache
>>>>>>> like this.
>>>>>> I can also add the following as another instance where deadlock can happen.
>>>>>>
>>>>>> Reported-by:syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
>>>>>>
>>>>> FWIW, I already queued in my lockdep-for-tip branch, will send it in a
>>>>> PR to Peter in one or two weeks (in case he hasn't taken it before
>>>>> then).
>>>>>
>>>>> BTW, do we need a "Fixes" tag for stable kernels?
>>>> After some more thought, I realize that this patch doesn't really fix the
>>>> circular lock dependency problem, it just remove console_sem.lock from it.
>>>> The problem is that printk() can be called in any context. To really solve
>>>> the problem, we will need some kind of deferred wakeup using workqueue, for
>>>> instance. As printing to the console is inherently slow, adding some more
>>> Hmm... but your patch does remove the dependency console_sem.lock ->
>>> pi_lock, right? So it does fix the circular lock dependency problem. Or
>>> do you mean that it doesn't fix all the deadlocks that may cause by
>>> printk()?
>> Right, it doesn't  fix all the deadlocks that may be caused by printk().
>> Similar circular lock dependency splat will still happen. It will start with
>> pi_lock instead of console_sem.lock and will have one less lock in the
>> circular list. It is caused by waking up process within the printk()
>> context. That is why I think the proper solution is to have a deferred
>> wakeup. It will be specific to the printk() use cases.
>>
> Sounds good. Then I will take this patch, because 1) it fixes an issue
> (although partially) and 2) the fix is toward the right direction (i.e.
> avoiding wakeup inside critical section). Thanks!

This patch does fix some deadlock cases. The process wakeup will happen 
in console_unlock(). Its comment says that "console_unlock(); may be 
called from any context". So there may still be other deadlock cases 
hidden somewhere.

Cheers,
Longman


[tip: locking/core] locking/semaphore: Use wake_q to wake up processes outside lock critical section
Posted by tip-bot2 for Waiman Long 9 months, 2 weeks ago
The following commit has been merged into the locking/core branch of tip:

Commit-ID:     3a5138209d21cd379d9bf64918060cec4fbc3c43
Gitweb:        https://git.kernel.org/tip/3a5138209d21cd379d9bf64918060cec4fbc3c43
Author:        Waiman Long <longman@redhat.com>
AuthorDate:    Sun, 26 Jan 2025 20:31:27 -05:00
Committer:     Boqun Feng <boqun.feng@gmail.com>
CommitterDate: Sun, 23 Feb 2025 18:24:46 -08:00

locking/semaphore: Use wake_q to wake up processes outside lock critical section

A circular lock dependency splat has been seen involving down_trylock().

[ 4011.795602] ======================================================
[ 4011.795603] WARNING: possible circular locking dependency detected
[ 4011.795607] 6.12.0-41.el10.s390x+debug
[ 4011.795612] ------------------------------------------------------
[ 4011.795613] dd/32479 is trying to acquire lock:
[ 4011.795617] 0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
[ 4011.795636]
[ 4011.795636] but task is already holding lock:
[ 4011.795637] 000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0

  the existing dependency chain (in reverse order) is:
  -> #4 (&zone->lock){-.-.}-{2:2}:
  -> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
  -> #2 (&rq->__lock){-.-.}-{2:2}:
  -> #1 (&p->pi_lock){-.-.}-{2:2}:
  -> #0 ((console_sem).lock){-.-.}-{2:2}:

The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
while holding the console.sem raw_spinlock. This dependency can be broken
by using wake_q to do the wakeup instead of calling try_to_wake_up()
under the console_sem lock. This will also make the semaphore's
raw_spinlock become a terminal lock without taking any further locks
underneath it.

The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
the debug_objects_fill_pool() helper function in the debugobjects code.

[ 4011.795646] -> #4 (&zone->lock){-.-.}-{2:2}:
[ 4011.795650]        __lock_acquire+0xe86/0x1cc0
[ 4011.795655]        lock_acquire.part.0+0x258/0x630
[ 4011.795657]        lock_acquire+0xb8/0xe0
[ 4011.795659]        _raw_spin_lock_irqsave+0xb4/0x120
[ 4011.795663]        rmqueue_bulk+0xac/0x8f0
[ 4011.795665]        __rmqueue_pcplist+0x580/0x830
[ 4011.795667]        rmqueue_pcplist+0xfc/0x470
[ 4011.795669]        rmqueue.isra.0+0xdec/0x11b0
[ 4011.795671]        get_page_from_freelist+0x2ee/0xeb0
[ 4011.795673]        __alloc_pages_noprof+0x2c2/0x520
[ 4011.795676]        alloc_pages_mpol_noprof+0x1fc/0x4d0
[ 4011.795681]        alloc_pages_noprof+0x8c/0xe0
[ 4011.795684]        allocate_slab+0x320/0x460
[ 4011.795686]        ___slab_alloc+0xa58/0x12b0
[ 4011.795688]        __slab_alloc.isra.0+0x42/0x60
[ 4011.795690]        kmem_cache_alloc_noprof+0x304/0x350
[ 4011.795692]        fill_pool+0xf6/0x450
[ 4011.795697]        debug_object_activate+0xfe/0x360
[ 4011.795700]        enqueue_hrtimer+0x34/0x190
[ 4011.795703]        __run_hrtimer+0x3c8/0x4c0
[ 4011.795705]        __hrtimer_run_queues+0x1b2/0x260
[ 4011.795707]        hrtimer_interrupt+0x316/0x760
[ 4011.795709]        do_IRQ+0x9a/0xe0
[ 4011.795712]        do_irq_async+0xf6/0x160

Normally raw_spinlock to spinlock dependency is not legit
and will be warned if PROVE_RAW_LOCK_NESTING is enabled,
but debug_objects_fill_pool() is an exception as it explicitly
allows this dependency for non-PREEMPT_RT kernel without causing
PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
legit and not a bug.

Anyway, semaphore is the only locking primitive left that is still
using try_to_wake_up() to do wakeup inside the critical section; all the
other locking primitives have been migrated to use wake_q to do wakeup
outside of the critical section. It is also possible that there are
other circular locking dependencies involving printk/console_sem or
other existing/new semaphores lurking somewhere which may show up in
the future. Let's just do the migration to wake_q now to avoid headaches
like this.

Reported-by: syzbot+ed801a886dfdbfe7136d@syzkaller.appspotmail.com
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/r/20250127013127.3913153-1-longman@redhat.com
---
 kernel/locking/semaphore.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 34bfae7..de9117c 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
@@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
 static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
 
 /**
  * down - acquire the semaphore
@@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout);
 void __sched up(struct semaphore *sem)
 {
 	unsigned long flags;
+	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(list_empty(&sem->wait_list)))
 		sem->count++;
 	else
-		__up(sem);
+		__up(sem, &wake_q);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 }
 EXPORT_SYMBOL(up);
 
@@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
 
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+				  struct wake_q_head *wake_q)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
 	list_del(&waiter->list);
 	waiter->up = true;
-	wake_up_process(waiter->task);
+	wake_q_add(wake_q, waiter->task);
 }