From: Yicong Yang <yangyicong@hisilicon.com>
On CPU offline the kernel stalled with the call trace below:
INFO: task kworker/0:1:11 blocked for more than 120 seconds.
Tainted: G O 6.15.0-rc4+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/0:1 state:D stack:0 pid:11 tgid:11 ppid:2 task_flags:0x4208060 flags:0x00000008
Workqueue: events vmstat_shepherd
Call trace:
__switch_to+0x118/0x188 (T)
__schedule+0x31c/0x1300
schedule+0x3c/0x120
percpu_rwsem_wait+0x12c/0x1b0
__percpu_down_read+0x78/0x188
cpus_read_lock+0xc4/0xe8
vmstat_shepherd+0x30/0x138
process_one_work+0x154/0x3c8
worker_thread+0x2e8/0x400
kthread+0x154/0x230
ret_from_fork+0x10/0x20
INFO: task kworker/1:1:116 blocked for more than 120 seconds.
Tainted: G O 6.15.0-rc4+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/1:1 state:D stack:0 pid:116 tgid:116 ppid:2 task_flags:0x4208060 flags:0x00000008
Workqueue: events work_for_cpu_fn
Call trace:
__switch_to+0x118/0x188 (T)
__schedule+0x31c/0x1300
schedule+0x3c/0x120
schedule_timeout+0x10c/0x120
__wait_for_common+0xc4/0x1b8
wait_for_completion+0x28/0x40
cpuhp_kick_ap_work+0x114/0x3c8
_cpu_down+0x130/0x4b8
__cpu_down_maps_locked+0x20/0x38
work_for_cpu_fn+0x24/0x40
process_one_work+0x154/0x3c8
worker_thread+0x2e8/0x400
kthread+0x154/0x230
ret_from_fork+0x10/0x20
cpuhp holds the cpu hotplug lock indefinitely and stalls vmstat_shepherd.
This is because nr_running is counted twice when the cpuhp task is
enqueued, so the cpuhp wait condition can never be satisfied:
enqueue_task_fair() // pick cpuhp from idle, rq->nr_running = 0
dl_server_start()
[...]
add_nr_running() // rq->nr_running = 1
add_nr_running() // rq->nr_running = 2
[switch to cpuhp, waiting on balance_hotplug_wait()]
rcuwait_wait_event(rq->nr_running == 1 && ...) // failed, rq->nr_running=2
schedule() // wait again
It doesn't make sense to count a single task twice in rq->nr_running,
so fix this.
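For illustration only, a minimal userspace sketch of the broken accounting
(plain C, not kernel code; the struct, helpers and the wait check are
simplified stand-ins for the real scheduler internals):

#include <stdio.h>

/* Simplified stand-in for struct rq: only the field of interest here. */
struct rq {
	unsigned int nr_running;
};

static void add_nr_running(struct rq *rq, unsigned int count)
{
	rq->nr_running += count;
}

int main(void)
{
	struct rq rq = { .nr_running = 0 };	/* CPU was idle */

	/*
	 * enqueue_task_fair() puts the cpuhp task on the rq and, through
	 * dl_server_start(), also enqueues the fair dl_server entity.
	 * Before the fix both paths bump rq->nr_running.
	 */
	add_nr_running(&rq, 1);	/* fair-class accounting for cpuhp  */
	add_nr_running(&rq, 1);	/* inc_dl_tasks() for the dl_server */

	/*
	 * balance_hotplug_wait() effectively waits for rq->nr_running == 1
	 * (only the hotplug thread left), which can never become true with
	 * the extra count.
	 */
	printf("nr_running = %u, wait condition (== 1) %s\n",
	       rq.nr_running, rq.nr_running == 1 ? "holds" : "fails");
	return 0;
}

With the double count this prints "nr_running = 2, wait condition (== 1)
fails", which is why balance_hotplug_wait() spins forever in the traces
above.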
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
kernel/sched/deadline.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ad45a8fea245..59fb178762ee 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1894,7 +1894,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
u64 deadline = dl_se->deadline;
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
}
@@ -1904,7 +1906,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
}
--
2.24.0
Hello,

On 27/06/25 11:54, Yicong Yang wrote:

> [... full patch quoted ...]

This seems to make sense to me. Thanks for the analysis and the patch.

Peter, what do you think?

Thanks,
Juri
Hi Peter,

On 2025/6/27 16:05, Juri Lelli wrote:

> [... full patch quoted ...]
>
> This seems to make sense to me. Thanks for the analysis and the patch.
>
> Peter, what do you think?

Does this also make sense to you, or are there any other solutions you'd
like me to try? Thanks.
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 52d15521eb75f9b521744db675bee61025d2fa52
Gitweb: https://git.kernel.org/tip/52d15521eb75f9b521744db675bee61025d2fa52
Author: Yicong Yang <yangyicong@hisilicon.com>
AuthorDate: Fri, 27 Jun 2025 11:54:20 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 26 Aug 2025 10:46:01 +02:00
sched/deadline: Don't count nr_running for dl_server proxy tasks
On CPU offline the kernel stalled with the call trace below:
INFO: task kworker/0:1:11 blocked for more than 120 seconds.
cpuhp holds the cpu hotplug lock indefinitely and stalls vmstat_shepherd.
This is because nr_running is counted twice when the cpuhp task is
enqueued, so the cpuhp wait condition can never be satisfied:
enqueue_task_fair() // pick cpuhp from idle, rq->nr_running = 0
dl_server_start()
[...]
add_nr_running() // rq->nr_running = 1
add_nr_running() // rq->nr_running = 2
[switch to cpuhp, waiting on balance_hotplug_wait()]
rcuwait_wait_event(rq->nr_running == 1 && ...) // failed, rq->nr_running=2
schedule() // wait again
It doesn't make sense to count the dl_server towards runnable tasks,
since it runs other tasks (a small sketch of the resulting accounting
follows the diff below).
Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers")
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250627035420.37712-1-yangyicong@huawei.com
---
kernel/sched/deadline.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 88c3bd6..f253012 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1851,7 +1851,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
u64 deadline = dl_se->deadline;
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
}
@@ -1861,7 +1863,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
}
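For completeness, the same toy model with the guard from the diff above
applied: the dl_server entity is no longer counted, so only the cpuhp task
contributes to rq->nr_running (again a userspace mock-up; dl_server-ness is
passed as a plain flag rather than via the kernel's dl_server() helper):

#include <stdbool.h>
#include <stdio.h>

struct rq {
	unsigned int nr_running;
};

static void add_nr_running(struct rq *rq, unsigned int count)
{
	rq->nr_running += count;
}

/* Mimics the patched inc_dl_tasks(): skip nr_running for server entities. */
static void inc_dl_tasks(struct rq *rq, bool is_dl_server)
{
	if (!is_dl_server)
		add_nr_running(rq, 1);
}

int main(void)
{
	struct rq rq = { .nr_running = 0 };

	add_nr_running(&rq, 1);		/* fair-class accounting for cpuhp */
	inc_dl_tasks(&rq, true);	/* dl_server entity: not counted   */

	/* The hotplug wait condition rq->nr_running == 1 now holds. */
	printf("nr_running = %u\n", rq.nr_running);
	return 0;
}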