When dequeue_task() is delayed it becomes possible to exit a task (or
cgroup) that is still enqueued. Ensure things are dequeued before
freeing.
NOTE: switched_from_fair() causes spurious wakeups due to clearing
sched_delayed after enqueueing a task in another class that should've
been dequeued. This *should* be harmless.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 61 ++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 48 insertions(+), 13 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8318,7 +8318,20 @@ static void migrate_task_rq_fair(struct
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}
/*
@@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ /*
+ * Since this is called after changing class, this isn't quite right.
+ * Specifically, this causes the task to get queued in the target class
+ * and experience a 'spurious' wakeup.
+ *
+ * However, since 'spurious' wakeups are harmless, this shouldn't be a
+ * problem.
+ */
+ p->se.sched_delayed = 0;
+ /*
+ * While here, also clear the vlag, it makes little sense to carry that
+ * over the excursion into the new class.
+ */
+ p->se.vlag = 0;
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);
set_task_max_allowed_capacity(p);
@@ -12971,28 +13000,33 @@ void online_fair_sched_group(struct task
void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed)
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}
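The guard(rq_lock_irqsave)(rq) construct used above comes from the scope-based
cleanup helpers (<linux/cleanup.h> together with the rq lock guard definitions
in kernel/sched/sched.h). Conceptually it behaves roughly like the explicit
form below; this is a simplified sketch, not the actual macro expansion:

	{
		struct rq_flags rf;

		rq_lock_irqsave(rq, &rf);
		/* ... statements in the guarded scope ... */
		rq_unlock_irqrestore(rq, &rf);	/* emitted automatically at scope exit */
	}

Because the unlock runs automatically when the scope ends, the new code above
needs no explicit unlock or early-exit handling.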
On 2024-07-27 at 12:27:44 +0200, Peter Zijlstra wrote:
> When dequeue_task() is delayed it becomes possible to exit a task (or
> cgroup) that is still enqueued. Ensure things are dequeued before
> freeing.
>
> NOTE: switched_from_fair() causes spurious wakeups due to clearing
> sched_delayed after enqueueing a task in another class that should've
> been dequeued. This *should* be harmless.
>
It might cause unexpected behavior in some corner cases, as reported here:
https://lore.kernel.org/lkml/202408161619.9ed8b83e-lkp@intel.com/
The blocked task might return from schedule() with TASK_INTERRUPTIBLE still set.
We cooked a patch to work around it (as below).
thanks,
Chenyu
From 9251b25073d43aeac04a6ee69b590fbfa1b8e1a5 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Mon, 26 Aug 2024 22:16:38 +0800
Subject: [PATCH] sched/eevdf: Dequeue the delayed task when changing its
schedule policy
[Problem Statement]
The following warning was reported:
do not call blocking ops when !TASK_RUNNING; state=1 set at kthread_worker_fn (kernel/kthread.c:?)
WARNING: CPU: 1 PID: 674 at kernel/sched/core.c:8469 __might_sleep
handle_bug
exc_invalid_op
asm_exc_invalid_op
__might_sleep
__might_sleep
kthread_worker_fn
kthread_worker_fn
kthread
__cfi_kthread_worker_fn
ret_from_fork
__cfi_kthread
ret_from_fork_asm
[Symptom]
kthread_worker_fn()
...
repeat:
set_current_state(TASK_INTERRUPTIBLE);
...
if (work) { // false
__set_current_state(TASK_RUNNING);
...
} else if (!freezing(current)) {
schedule();
// after schedule, the state is still *TASK_INTERRUPTIBLE*
}
try_to_freeze()
might_sleep() <--- trigger the warning
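For reference, the warning fires because might_sleep() is reached while the
caller's task state is still TASK_INTERRUPTIBLE (state=1). Paraphrasing the
check in __might_sleep() in kernel/sched/core.c (a simplified sketch; the real
code also depends on CONFIG_DEBUG_ATOMIC_SLEEP state tracking):

	/*
	 * Blocking primitives set (and therefore destroy) current->__state,
	 * so warn if a caller enters one without TASK_RUNNING.
	 */
	WARN_ONCE(get_current_state() != TASK_RUNNING,
		  "do not call blocking ops when !TASK_RUNNING; ...");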
[Analysis]
The question is why, after schedule(), the state remains TASK_INTERRUPTIBLE
rather than TASK_RUNNING. The short answer is that someone has incorrectly
picked the TASK_INTERRUPTIBLE task from the tree. The scenario is described
below, and all steps happen on 1 CPU:
time
|
|
|
v
kthread_worker_fn() <--- t1
set_current_state(TASK_INTERRUPTIBLE)
schedule()
block_task(t1)
dequeue_entity(t1)
t1->sched_delayed = 1
t2 = pick_next_task()
put_prev_task(t1)
enqueue_entity(t1) <--- TASK_INTERRUPTIBLE in the tree
t1 switches to t2
erofs_init_percpu_worker() <--- t2
sched_set_fifo_low(t1)
sched_setscheduler_nocheck(t1)
__sched_setscheduler(t1)
t1->sched_class = &rt_sched_class
check_class_changed(t1)
switched_from_fair(t1)
t1->sched_delayed = 0 <--- gotcha
** from now on, t1 in the tree is TASK_INTERRUPTIBLE **
** and sched_delayed = 0 **
preempt_enable()
preempt_schedule()
t1 = pick_next_task() <--- because sched_delayed = 0, eligible
t2 switches back to t1, now t1 is TASK_INTERRUPTIBLE.
The cause is that switched_from_fair() incorrectly clears the sched_delayed
flag and confuses pick_next_task() into treating the delayed task as an
eligible task (without dequeueing it).
[Proposal]
In __sched_setscheduler(), when changing the policy of a delayed task, do
not re-enqueue it, so it cannot be picked again. The side effect is that
the delayed task can no longer wait out its 0-vlag time before being
dequeued, but that effect should be negligible.
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202408161619.9ed8b83e-lkp@intel.com
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
kernel/sched/syscalls.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 4fae3cf25a3a..10859536e509 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -818,7 +818,8 @@ int __sched_setscheduler(struct task_struct *p,
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;
- enqueue_task(rq, p, queue_flags);
+ if (!p->se.sched_delayed)
+ enqueue_task(rq, p, queue_flags);
}
if (running)
set_next_task(rq, p);
--
2.25.1
On 2024-08-27 at 17:17:20 +0800, Chen Yu wrote:
> On 2024-07-27 at 12:27:44 +0200, Peter Zijlstra wrote:
> > When dequeue_task() is delayed it becomes possible to exit a task (or
> > cgroup) that is still enqueued. Ensure things are dequeued before
> > freeing.
> >
> > NOTE: switched_from_fair() causes spurious wakeups due to clearing
> > sched_delayed after enqueueing a task in another class that should've
> > been dequeued. This *should* be harmless.
> >
>
> It might cause unexpected behavior in some corner cases, as reported here:
> https://lore.kernel.org/lkml/202408161619.9ed8b83e-lkp@intel.com/
> The blocked task might return from schedule() with TASK_INTERRUPTIBLE still set.
>
> We cooked a patch to work around it (as below).
>
> thanks,
> Chenyu
>
> >From 9251b25073d43aeac04a6ee69b590fbfa1b8e1a5 Mon Sep 17 00:00:00 2001
> From: Chen Yu <yu.c.chen@intel.com>
> Date: Mon, 26 Aug 2024 22:16:38 +0800
> Subject: [PATCH] sched/eevdf: Dequeue the delayed task when changing its
> schedule policy
>
> [Problem Statement]
> The following warning was reported:
>
> do not call blocking ops when !TASK_RUNNING; state=1 set at kthread_worker_fn (kernel/kthread.c:?)
> WARNING: CPU: 1 PID: 674 at kernel/sched/core.c:8469 __might_sleep
>
> handle_bug
> exc_invalid_op
> asm_exc_invalid_op
> __might_sleep
> __might_sleep
> kthread_worker_fn
> kthread_worker_fn
> kthread
> __cfi_kthread_worker_fn
> ret_from_fork
> __cfi_kthread
> ret_from_fork_asm
>
> [Symptom]
> kthread_worker_fn()
> ...
> repeat:
> set_current_state(TASK_INTERRUPTIBLE);
> ...
> if (work) { // false
> __set_current_state(TASK_RUNNING);
> ...
> } else if (!freezing(current)) {
> schedule();
> // after schedule, the state is still *TASK_INTERRUPTIBLE*
> }
>
> try_to_freeze()
> might_sleep() <--- trigger the warning
>
> [Analysis]
> The question is why, after schedule(), the state remains TASK_INTERRUPTIBLE
> rather than TASK_RUNNING. The short answer is that someone has incorrectly
> picked the TASK_INTERRUPTIBLE task from the tree. The scenario is described
> below, and all steps happen on 1 CPU:
>
> time
> |
> |
> |
> v
>
> kthread_worker_fn() <--- t1
> set_current_state(TASK_INTERRUPTIBLE)
> schedule()
> block_task(t1)
> dequeue_entity(t1)
> t1->sched_delayed = 1
>
> t2 = pick_next_task()
> put_prev_task(t1)
> enqueue_entity(t1) <--- TASK_INTERRUPTIBLE in the tree
>
> t1 switches to t2
>
> erofs_init_percpu_worker() <--- t2
> sched_set_fifo_low(t1)
> sched_setscheduler_nocheck(t1)
>
> __sched_setscheduler(t1)
> t1->sched_class = &rt_sched_class
>
> check_class_changed(t1)
> switched_from_fair(t1)
> t1->sched_delayed = 0 <--- gotcha
>
> ** from now on, t1 in the tree is TASK_INTERRUPTIBLE **
> ** and sched_delayed = 0 **
>
> preempt_enable()
> preempt_schedule()
> t1 = pick_next_task() <--- because sched_delayed = 0, eligible
>
> t2 switches back to t1, now t1 is TASK_INTERRUPTIBLE.
>
> The cause is that switched_from_fair() incorrectly clears the sched_delayed
> flag and confuses pick_next_task() into treating the delayed task as an
> eligible task (without dequeueing it).
>
Valentin pointed out that after the requeue, t1 is in the new RT priolist,
so the value of sched_delayed does not matter much. The problem is that the
RT priolist then contains a TASK_INTERRUPTIBLE task that can be picked by the
next schedule(). There is a fix from Peter that dequeues this task in
switched_from_fair(), which addresses the problem. But I think the current
proposal can save one extra enqueue/dequeue operation, no?
thanks,
Chenyu
> [Proposal]
> In __sched_setscheduler(), when changing the policy of a delayed task, do
> not re-enqueue it, so it cannot be picked again. The side effect is that
> the delayed task can no longer wait out its 0-vlag time before being
> dequeued, but that effect should be negligible.
>
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202408161619.9ed8b83e-lkp@intel.com
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
> kernel/sched/syscalls.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index 4fae3cf25a3a..10859536e509 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -818,7 +818,8 @@ int __sched_setscheduler(struct task_struct *p,
> if (oldprio < p->prio)
> queue_flags |= ENQUEUE_HEAD;
>
> - enqueue_task(rq, p, queue_flags);
> + if (!p->se.sched_delayed)
> + enqueue_task(rq, p, queue_flags);
> }
> if (running)
> set_next_task(rq, p);
> --
> 2.25.1
>
On 27/07/24 12:27, Peter Zijlstra wrote:
> @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> detach_task_cfs_rq(p);
> + /*
> + * Since this is called after changing class, this isn't quite right.
> + * Specifically, this causes the task to get queued in the target class
> + * and experience a 'spurious' wakeup.
> + *
> + * However, since 'spurious' wakeups are harmless, this shouldn't be a
> + * problem.
> + */
> + p->se.sched_delayed = 0;
> + /*
> + * While here, also clear the vlag, it makes little sense to carry that
> + * over the excursion into the new class.
> + */
> + p->se.vlag = 0;
RQ lock is held, the task can't be current if it's ->sched_delayed; is a
dequeue_task() not possible at this point? Or just not worth it?
On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> On 27/07/24 12:27, Peter Zijlstra wrote:
> > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > {
> > detach_task_cfs_rq(p);
> > + /*
> > + * Since this is called after changing class, this isn't quite right.
> > + * Specifically, this causes the task to get queued in the target class
> > + * and experience a 'spurious' wakeup.
> > + *
> > + * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > + * problem.
> > + */
> > + p->se.sched_delayed = 0;
> > + /*
> > + * While here, also clear the vlag, it makes little sense to carry that
> > + * over the excursion into the new class.
> > + */
> > + p->se.vlag = 0;
>
> RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> dequeue_task() not possible at this point? Or just not worth it?
Hurmph, I really can't remember why I did it like this :-(
Also, I remember thinking this vlag reset might not be ideal; PI-induced
class excursions might be very short and would benefit from retaining
vlag.
Let me make this something like:
if (se->sched_delayed)
dequeue_entities(rq, &p->se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> > On 27/07/24 12:27, Peter Zijlstra wrote:
> > > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > > static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > > {
> > > detach_task_cfs_rq(p);
> > > + /*
> > > + * Since this is called after changing class, this isn't quite right.
> > > + * Specifically, this causes the task to get queued in the target class
> > > + * and experience a 'spurious' wakeup.
> > > + *
> > > + * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > > + * problem.
> > > + */
> > > + p->se.sched_delayed = 0;
> > > + /*
> > > + * While here, also clear the vlag, it makes little sense to carry that
> > > + * over the excursion into the new class.
> > > + */
> > > + p->se.vlag = 0;
> >
> > RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> > dequeue_task() not possible at this point? Or just not worth it?
>
> Hurmph, I really can't remember why I did it like this :-(
Obviously I remember it right after hitting send...
We've just done:
dequeue_task();
p->sched_class = some_other_class;
enqueue_task();
IOW, we're enqueued as some other class at this point. There is no way
we can fix it up at this point.
Perhaps I can use the sched_class::switching_to thing sched_ext will
bring.
For now, let's keep things as is. I'll look at things later.
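For completeness, the ordering in question is roughly the following (a
simplified sketch of the __sched_setscheduler() / check_class_changed() path;
locking, error handling and most details elided):

	queued = task_on_rq_queued(p);		/* true for a sched_delayed task */
	if (queued)
		dequeue_task(rq, p, queue_flags);	/* off the CFS rb-tree */

	__setscheduler_prio(p, newprio);	/* e.g. p->sched_class = &rt_sched_class */

	if (queued)
		enqueue_task(rq, p, queue_flags);	/* onto the RT priolist */

	check_class_changed(rq, p, prev_class, oldprio);
		/* -> prev_class->switched_from(rq, p), i.e. switched_from_fair() */

So by the time switched_from_fair() runs, the task is already queued in the
new class, and any fixup done there has to cope with that.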
On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> > > On 27/07/24 12:27, Peter Zijlstra wrote:
> > > > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > > > static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > > > {
> > > > detach_task_cfs_rq(p);
> > > > + /*
> > > > + * Since this is called after changing class, this isn't quite right.
> > > > + * Specifically, this causes the task to get queued in the target class
> > > > + * and experience a 'spurious' wakeup.
> > > > + *
> > > > + * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > > > + * problem.
> > > > + */
> > > > + p->se.sched_delayed = 0;
> > > > + /*
> > > > + * While here, also clear the vlag, it makes little sense to carry that
> > > > + * over the excursion into the new class.
> > > > + */
> > > > + p->se.vlag = 0;
> > >
> > > RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> > > dequeue_task() not possible at this point? Or just not worth it?
> >
> > Hurmph, I really can't remember why I did it like this :-(
>
> Obviously I remember it right after hitting send...
>
> We've just done:
>
> dequeue_task();
> p->sched_class = some_other_class;
> enqueue_task();
>
> IOW, we're enqueued as some other class at this point. There is no way
> we can fix it up at this point.
With just a little more sleep than last night, perhaps you're right
after all. Yes we're on a different class, but we can *still* dequeue it
again.
That is, something like the below ... I'll stick it on and see if
anything falls over.
---
kernel/sched/fair.c | 22 +++++++++-------------
1 file changed, 9 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 714826d97ef2..53c8f3ccfd0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13105,20 +13105,16 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
/*
- * Since this is called after changing class, this isn't quite right.
- * Specifically, this causes the task to get queued in the target class
- * and experience a 'spurious' wakeup.
- *
- * However, since 'spurious' wakeups are harmless, this shouldn't be a
- * problem.
- */
- p->se.sched_delayed = 0;
- /*
- * While here, also clear the vlag, it makes little sense to carry that
- * over the excursion into the new class.
+ * Since this is called after changing class, this is a little weird
+ * and we cannot use DEQUEUE_DELAYED.
*/
- p->se.vlag = 0;
- p->se.rel_deadline = 0;
+ if (p->se.sched_delayed) {
+ dequeue_task(DEQUEUE_NOCLOCK, DEQUEUE_SLEEP);
+ p->se.sched_delayed = 0;
+ p->se.rel_deadline = 0;
+ if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
+ p->se.vlag = 0;
+ }
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
On 2024-08-14 at 07:53:30 +0200, Peter Zijlstra wrote:
> On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> > > On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> > > > On 27/07/24 12:27, Peter Zijlstra wrote:
> > > > > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > > > > static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > > > > {
> > > > > detach_task_cfs_rq(p);
> > > > > + /*
> > > > > + * Since this is called after changing class, this isn't quite right.
> > > > > + * Specifically, this causes the task to get queued in the target class
> > > > > + * and experience a 'spurious' wakeup.
> > > > > + *
> > > > > + * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > > > > + * problem.
> > > > > + */
> > > > > + p->se.sched_delayed = 0;
> > > > > + /*
> > > > > + * While here, also clear the vlag, it makes little sense to carry that
> > > > > + * over the excursion into the new class.
> > > > > + */
> > > > > + p->se.vlag = 0;
> > > >
> > > > RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> > > > dequeue_task() not possible at this point? Or just not worth it?
> > >
> > > Hurmph, I really can't remember why I did it like this :-(
> >
> > Obviously I remember it right after hitting send...
> >
> > We've just done:
> >
> > dequeue_task();
> > p->sched_class = some_other_class;
> > enqueue_task();
> >
> > IOW, we're enqueued as some other class at this point. There is no way
> > we can fix it up at this point.
>
> With just a little more sleep than last night, perhaps you're right
> after all. Yes we're on a different class, but we can *still* dequeue it
> again.
I don't quite get this. If the old class is CFS, the task is in an rb-tree,
and if the new class is RT then the task is in the prio list. I just wonder
whether the RT dequeue would break the rb-tree data?
thanks,
Chenyu
On 27/08/24 17:35, Chen Yu wrote:
> On 2024-08-14 at 07:53:30 +0200, Peter Zijlstra wrote:
>> On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
>> > On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
>> >
>> > Obviously I remember it right after hitting send...
>> >
>> > We've just done:
>> >
>> >	dequeue_task();
>> >	p->sched_class = some_other_class;
>> >	enqueue_task();
>> >
>> > IOW, we're enqueued as some other class at this point. There is no way
>> > we can fix it up at this point.
>>
>> With just a little more sleep than last night, perhaps you're right
>> after all. Yes we're on a different class, but we can *still* dequeue it
>> again.
>
> I don't quite get this. If the old class is CFS, the task is in an rb-tree,
> and if the new class is RT then the task is in the prio list. I just wonder
> whether the RT dequeue would break the rb-tree data?
>

On a class change e.g. CFS to RT, __sched_setscheduler() would
dequeue_task() (take it out of the RB tree), change the class,
enqueue_task() (put it in the RT priolist).

Then check_class_changed()->switched_from_fair() happens, dequeue_task(),
and that takes it out of the RT priolist. At least that's the theory, since
that currently explodes...

> thanks,
> Chenyu
On 2024-08-27 at 22:29:50 +0200, Valentin Schneider wrote:
> On 27/08/24 17:35, Chen Yu wrote:
> > On 2024-08-14 at 07:53:30 +0200, Peter Zijlstra wrote:
> >> On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
> >> > On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> >> >
> >> > Obviously I remember it right after hitting send...
> >> >
> >> > We've just done:
> >> >
> >> >	dequeue_task();
> >> >	p->sched_class = some_other_class;
> >> >	enqueue_task();
> >> >
> >> > IOW, we're enqueued as some other class at this point. There is no way
> >> > we can fix it up at this point.
> >>
> >> With just a little more sleep than last night, perhaps you're right
> >> after all. Yes we're on a different class, but we can *still* dequeue it
> >> again.
> >
> > I don't quite get this. If the old class is CFS, the task is in an rb-tree,
> > and if the new class is RT then the task is in the prio list. I just wonder
> > whether the RT dequeue would break the rb-tree data?
> >
>
> On a class change e.g. CFS to RT, __sched_setscheduler() would
> dequeue_task() (take it out of the RB tree), change the class,
> enqueue_task() (put it in the RT priolist).
>
> Then check_class_changed()->switched_from_fair() happens, dequeue_task(),
> and that takes it out of the RT priolist. At least that's the theory, since
> that currently explodes...
>

I see, thanks for this explanation! I overlooked that the task is already
on the RT priolist. I applied Peter's dequeue patch with a minor fix and did
not see the warning [1] after several test cycles (previously it was 100%
reproducible).

[1] https://lore.kernel.org/lkml/Zs2ZoAcUsZMX2B%2FI@chenyu5-mobl2/

thanks,
Chenyu
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 2e0199df252a536a03f4cb0810324dff523d1e79
Gitweb: https://git.kernel.org/tip/2e0199df252a536a03f4cb0810324dff523d1e79
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 23 May 2024 11:03:42 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:43 +02:00
sched/fair: Prepare exit/cleanup paths for delayed_dequeue
When dequeue_task() is delayed it becomes possible to exit a task (or
cgroup) that is still enqueued. Ensure things are dequeued before
freeing.
Thanks to Valentin for asking the obvious questions and making
switched_from_fair() less weird.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105029.631948434@infradead.org
---
kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++----------
1 file changed, 46 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 37acd53..9a84903 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8342,7 +8342,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}
/*
@@ -12854,10 +12868,22 @@ static void attach_task_cfs_rq(struct task_struct *p)
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ /*
+ * Since this is called after changing class, this is a little weird
+ * and we cannot use DEQUEUE_DELAYED.
+ */
+ if (p->se.sched_delayed) {
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
+ p->se.sched_delayed = 0;
+ if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
+ p->se.vlag = 0;
+ }
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);
set_task_max_allowed_capacity(p);
@@ -13008,28 +13034,35 @@ void online_fair_sched_group(struct task_group *tg)
void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}