kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
Currently, nr_running can be modified from the timer tick, which means
the timer tick can run inside a critical section that is not IRQ-protected
and modify nr_running. Consider the following scenario:
CPU0
kworker/0:2 (events)
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
->pool->nr_running++; (1)
process_one_work()
->worker->current_func(work);
->schedule()
->wq_worker_sleeping()
->pool->nr_running--; (0)
....
->wq_worker_running()
....
CPU0 by interrupt:
wq_worker_tick()
->worker_set_flags(worker, WORKER_CPU_INTENSIVE);
->pool->nr_running--; (-1)
->worker->flags |= WORKER_CPU_INTENSIVE;
....
->if (!(worker->flags & WORKER_NOT_RUNNING))
->pool->nr_running++; (will not execute)
....
->worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
->pool->nr_running++; (0)
....
worker_set_flags(worker, WORKER_PREP);
->pool->nr_running--; (-1)
....
worker_enter_idle()
->WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
If nr_workers is equal to nr_idle, then because nr_running is not zero,
WARN_ON_ONCE() will be triggered.
[ 2.460602] WARNING: CPU: 0 PID: 63 at kernel/workqueue.c:1999 worker_enter_idle+0xb2/0xc0
[ 2.462163] Modules linked in:
[ 2.463401] CPU: 0 PID: 63 Comm: kworker/0:2 Not tainted 6.4.0-rc2-next-20230519 #1
[ 2.463771] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
[ 2.465127] Workqueue: 0x0 (events)
[ 2.465678] RIP: 0010:worker_enter_idle+0xb2/0xc0
...
[ 2.472614] Call Trace:
[ 2.473152] <TASK>
[ 2.474182] worker_thread+0x71/0x430
[ 2.474992] ? _raw_spin_unlock_irqrestore+0x28/0x50
[ 2.475263] kthread+0x103/0x120
[ 2.475493] ? __pfx_worker_thread+0x10/0x10
[ 2.476355] ? __pfx_kthread+0x10/0x10
[ 2.476635] ret_from_fork+0x2c/0x50
[ 2.477051] </TASK>
This commit therefore adds IRQ protection in wq_worker_running() to
prevent the timer tick from modifying nr_running.
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Closes: https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20230519/testrun/17078554/suite/boot/test/clang-nightly-lkftconfig/log
Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
---
kernel/workqueue.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9c5c1cfa478f..f8d739fef311 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1060,10 +1060,9 @@ void wq_worker_running(struct task_struct *task)
* and leave with an unexpected pool->nr_running == 1 on the newly unbound
* pool. Protect against such race.
*/
- preempt_disable();
+ local_irq_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
worker->pool->nr_running++;
- preempt_enable();
/*
* CPU intensive auto-detection cares about how long a work item hogged
@@ -1072,6 +1071,7 @@ void wq_worker_running(struct task_struct *task)
worker->current_at = worker->task->se.sum_exec_runtime;
worker->sleeping = 0;
+ local_irq_enable();
}
/**
--
2.17.1
> > Currently, the nr_running can be modified from timer tick, that means > the timer tick can run in not-irq-protected critical section to modify > nr_runnig, consider the following scenario: > > CPU0 > kworker/0:2 (events) > worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); > ->pool->nr_running++; (1) > > process_one_work() > ->worker->current_func(work); > ->schedule() > ->wq_worker_sleeping() > ->pool->nr_running--; (0) > .... > ->wq_worker_running() > .... > CPU0 by interrupt: > wq_worker_tick() > ->worker_set_flags(worker, WORKER_CPU_INTENSIVE); > ->pool->nr_running--; (-1) > ->worker->flags |= WORKER_CPU_INTENSIVE; > .... > ->if (!(worker->flags & WORKER_NOT_RUNNING)) > ->pool->nr_running++; (will not execute) > .... > ->worker_clr_flags(worker, WORKER_CPU_INTENSIVE); > ->pool->nr_running++; (0) > .... > worker_set_flags(worker, WORKER_PREP); > ->pool->nr_running--; (-1) > .... > worker_enter_idle() > ->WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); > > if the nr_workers is equal to nr_idle, due to the nr_running is not zero, > will trigger WARN_ON_ONCE(). > > [ 2.460602] WARNING: CPU: 0 PID: 63 at kernel/workqueue.c:1999 worker_enter_idle+0xb2/0xc0 > [ 2.462163] Modules linked in: > [ 2.463401] CPU: 0 PID: 63 Comm: kworker/0:2 Not tainted 6.4.0-rc2-next-20230519 #1 > [ 2.463771] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 > [ 2.465127] Workqueue: 0x0 (events) > [ 2.465678] RIP: 0010:worker_enter_idle+0xb2/0xc0 > ... > [ 2.472614] Call Trace: > [ 2.473152] <TASK> > [ 2.474182] worker_thread+0x71/0x430 > [ 2.474992] ? _raw_spin_unlock_irqrestore+0x28/0x50 > [ 2.475263] kthread+0x103/0x120 > [ 2.475493] ? __pfx_worker_thread+0x10/0x10 > [ 2.476355] ? __pfx_kthread+0x10/0x10 > [ 2.476635] ret_from_fork+0x2c/0x50 > [ 2.477051] </TASK> > > This commit therefore add irq protection in wq_worker_running() to > block timer tick modify nr_running. 
> > Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> > Closes: https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20230519/testrun/17078554/suite/boot/test/clang-nightly-lkftconfig/log > Signed-off-by: Zqiang <qiang.zhang1211@gmail.com> > --- > kernel/workqueue.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/kernel/workqueue.c b/kernel/workqueue.c > index 9c5c1cfa478f..f8d739fef311 100644 > --- a/kernel/workqueue.c > +++ b/kernel/workqueue.c > @@ -1060,10 +1060,9 @@ void wq_worker_running(struct task_struct *task) > * and leave with an unexpected pool->nr_running == 1 on the newly unbound > * pool. Protect against such race. > */ > - preempt_disable(); Sorry, Here can still be interrupted by wq_worker_tick() before invoke local_irq_disbale(). will resend v2. > + local_irq_disable(); > if (!(worker->flags & WORKER_NOT_RUNNING)) > worker->pool->nr_running++; > - preempt_enable(); > > /* > * CPU intensive auto-detection cares about how long a work item hogged > @@ -1072,6 +1071,7 @@ void wq_worker_running(struct task_struct *task) > worker->current_at = worker->task->se.sum_exec_runtime; > > worker->sleeping = 0; > + local_irq_enable(); > } > > /** > -- > 2.17.1 >
Hi Zqiang, On Tue, 23 May 2023 at 08:49, Zqiang <qiang.zhang1211@gmail.com> wrote: > > Currently, the nr_running can be modified from timer tick, that means > the timer tick can run in not-irq-protected critical section to modify > nr_runnig, consider the following scenario: > > CPU0 > kworker/0:2 (events) > worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); > ->pool->nr_running++; (1) > > process_one_work() > ->worker->current_func(work); > ->schedule() > ->wq_worker_sleeping() > ->pool->nr_running--; (0) > .... > ->wq_worker_running() > .... > CPU0 by interrupt: > wq_worker_tick() > ->worker_set_flags(worker, WORKER_CPU_INTENSIVE); > ->pool->nr_running--; (-1) > ->worker->flags |= WORKER_CPU_INTENSIVE; > .... > ->if (!(worker->flags & WORKER_NOT_RUNNING)) > ->pool->nr_running++; (will not execute) > .... > ->worker_clr_flags(worker, WORKER_CPU_INTENSIVE); > ->pool->nr_running++; (0) > .... > worker_set_flags(worker, WORKER_PREP); > ->pool->nr_running--; (-1) > .... > worker_enter_idle() > ->WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); > > if the nr_workers is equal to nr_idle, due to the nr_running is not zero, > will trigger WARN_ON_ONCE(). > > [ 2.460602] WARNING: CPU: 0 PID: 63 at kernel/workqueue.c:1999 worker_enter_idle+0xb2/0xc0 > [ 2.462163] Modules linked in: > [ 2.463401] CPU: 0 PID: 63 Comm: kworker/0:2 Not tainted 6.4.0-rc2-next-20230519 #1 > [ 2.463771] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 > [ 2.465127] Workqueue: 0x0 (events) > [ 2.465678] RIP: 0010:worker_enter_idle+0xb2/0xc0 > ... > [ 2.472614] Call Trace: > [ 2.473152] <TASK> > [ 2.474182] worker_thread+0x71/0x430 > [ 2.474992] ? _raw_spin_unlock_irqrestore+0x28/0x50 > [ 2.475263] kthread+0x103/0x120 > [ 2.475493] ? __pfx_worker_thread+0x10/0x10 > [ 2.476355] ? 
__pfx_kthread+0x10/0x10 > [ 2.476635] ret_from_fork+0x2c/0x50 > [ 2.477051] </TASK> > > This commit therefore add irq protection in wq_worker_running() to > block timer tick modify nr_running. > > Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> > Closes: https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20230519/testrun/17078554/suite/boot/test/clang-nightly-lkftconfig/log > Signed-off-by: Zqiang <qiang.zhang1211@gmail.com> Reported-by: Linux Kernel Functional Testing <lkft@linaro.org> Tested-by: Anders Roxell <anders.roxell@linaro.org> This proposed fix was applied on top of Linux next and tested more than 100 times; the reported issue has been fixed. Thank you. - Naresh > --- > kernel/workqueue.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/kernel/workqueue.c b/kernel/workqueue.c > index 9c5c1cfa478f..f8d739fef311 100644 > --- a/kernel/workqueue.c > +++ b/kernel/workqueue.c > @@ -1060,10 +1060,9 @@ void wq_worker_running(struct task_struct *task) > * and leave with an unexpected pool->nr_running == 1 on the newly unbound > * pool. Protect against such race. > */ > - preempt_disable(); > + local_irq_disable(); > if (!(worker->flags & WORKER_NOT_RUNNING)) > worker->pool->nr_running++; > - preempt_enable(); > > /* > * CPU intensive auto-detection cares about how long a work item hogged > @@ -1072,6 +1071,7 @@ void wq_worker_running(struct task_struct *task) > worker->current_at = worker->task->se.sum_exec_runtime; > > worker->sleeping = 0; > + local_irq_enable(); > } > > /** > -- > 2.17.1 >
© 2016 - 2026 Red Hat, Inc.