[PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue

Peter Zijlstra posted 24 patches 1 year, 4 months ago
[PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
When dequeue_task() is delayed it becomes possible to exit a task (or
cgroup) that is still enqueued. Ensure things are dequeued before
freeing.

NOTE: switched_from_fair() causes spurious wakeups due to clearing
sched_delayed after enqueueing a task in another class that should've
been dequeued. This *should* be harmless.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |   61 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 13 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8318,7 +8318,20 @@ static void migrate_task_rq_fair(struct
 
 static void task_dead_fair(struct task_struct *p)
 {
-	remove_entity_load_avg(&p->se);
+	struct sched_entity *se = &p->se;
+
+	if (se->sched_delayed) {
+		struct rq_flags rf;
+		struct rq *rq;
+
+		rq = task_rq_lock(p, &rf);
+		update_rq_clock(rq);
+		if (se->sched_delayed)
+			dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		task_rq_unlock(rq, p, &rf);
+	}
+
+	remove_entity_load_avg(se);
 }
 
 /*
@@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
+	/*
+	 * Since this is called after changing class, this isn't quite right.
+	 * Specifically, this causes the task to get queued in the target class
+	 * and experience a 'spurious' wakeup.
+	 *
+	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
+	 * problem.
+	 */
+	p->se.sched_delayed = 0;
+	/*
+	 * While here, also clear the vlag, it makes little sense to carry that
+	 * over the excursion into the new class.
+	 */
+	p->se.vlag = 0;
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	SCHED_WARN_ON(p->se.sched_delayed);
+
 	attach_task_cfs_rq(p);
 
 	set_task_max_allowed_capacity(p);
@@ -12971,28 +13000,33 @@ void online_fair_sched_group(struct task
 
 void unregister_fair_sched_group(struct task_group *tg)
 {
-	unsigned long flags;
-	struct rq *rq;
 	int cpu;
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(cpu) {
-		if (tg->se[cpu])
-			remove_entity_load_avg(tg->se[cpu]);
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+		struct sched_entity *se = tg->se[cpu];
+		struct rq *rq = cpu_rq(cpu);
+
+		if (se) {
+			if (se->sched_delayed) {
+				guard(rq_lock_irqsave)(rq);
+				if (se->sched_delayed)
+					dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+				list_del_leaf_cfs_rq(cfs_rq);
+			}
+			remove_entity_load_avg(se);
+		}
 
 		/*
 		 * Only empty task groups can be destroyed; so we can speculatively
 		 * check on_list without danger of it being re-added.
 		 */
-		if (!tg->cfs_rq[cpu]->on_list)
-			continue;
-
-		rq = cpu_rq(cpu);
-
-		raw_spin_rq_lock_irqsave(rq, flags);
-		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-		raw_spin_rq_unlock_irqrestore(rq, flags);
+		if (cfs_rq->on_list) {
+			guard(rq_lock_irqsave)(rq);
+			list_del_leaf_cfs_rq(cfs_rq);
+		}
 	}
 }
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Chen Yu 1 year, 3 months ago
On 2024-07-27 at 12:27:44 +0200, Peter Zijlstra wrote:
> When dequeue_task() is delayed it becomes possible to exit a task (or
> cgroup) that is still enqueued. Ensure things are dequeued before
> freeing.
> 
> NOTE: switched_from_fair() causes spurious wakeups due to clearing
> sched_delayed after enqueueing a task in another class that should've
> been dequeued. This *should* be harmless.
>

It might bring some unexpected behavior in some corner cases reported here:
https://lore.kernel.org/lkml/202408161619.9ed8b83e-lkp@intel.com/
As the blocked task might return from schedule() with TASK_INTERRUPTIBLE.

We cooked a patch to work around it (as below).

thanks,
Chenyu
 
From 9251b25073d43aeac04a6ee69b590fbfa1b8e1a5 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Mon, 26 Aug 2024 22:16:38 +0800
Subject: [PATCH] sched/eevdf: Dequeue the delayed task when changing its
 schedule policy

[Problem Statement]
The following warning was reported:

 do not call blocking ops when !TASK_RUNNING; state=1 set at kthread_worker_fn (kernel/kthread.c:?)
 WARNING: CPU: 1 PID: 674 at kernel/sched/core.c:8469 __might_sleep

 handle_bug
 exc_invalid_op
 asm_exc_invalid_op
 __might_sleep
 __might_sleep
 kthread_worker_fn
 kthread_worker_fn
 kthread
 __cfi_kthread_worker_fn
 ret_from_fork
 __cfi_kthread
 ret_from_fork_asm

[Symptom]
kthread_worker_fn()
  ...
repeat:
  set_current_state(TASK_INTERRUPTIBLE);
  ...
  if (work) { // false
    __set_current_state(TASK_RUNNING);
    ...
  } else if (!freezing(current)) {
    schedule();
    // after schedule, the state is still *TASK_INTERRUPTIBLE*
  }

  try_to_freeze()
    might_sleep() <--- trigger the warning

[Analysis]
The question is after schedule(), the state remains TASK_INTERRUPTIBLE
rather than TASK_RUNNING. The short answer is, someone has incorrectly
picked the TASK_INTERRUPTIBLE task from the tree. The scenario is described
below, and all steps happen on 1 CPU:

time
 |
 |
 |
 v

  kthread_worker_fn()   <--- t1
    set_current_state(TASK_INTERRUPTIBLE)
      schedule()
        block_task(t1)
          dequeue_entity(t1)
            t1->sched_delayed = 1

        t2 = pick_next_task()
          put_prev_task(t1)
            enqueue_entity(t1)  <--- TASK_INTERRUPTIBLE in the tree

  t1 switches to t2

  erofs_init_percpu_worker()    <--- t2
    sched_set_fifo_low(t1)
      sched_setscheduler_nocheck(t1)

        __sched_setscheduler(t1)
          t1->sched_class = &rt_sched_class

          check_class_changed(t1)
            switched_from_fair(t1)
              t1->sched_delayed = 0 <--- gotcha

              ** from now on, t1 in the tree is TASK_INTERRUPTIBLE **
              ** and sched_delayed = 0 **

          preempt_enable()
            preempt_schedule()
              t1 = pick_next_task() <--- because sched_delayed = 0, eligible

  t2 switches back to t1, now t1 is TASK_INTERRUPTIBLE.

The cause is that switched_from_fair() incorrectly clears the sched_delayed
flag and confuses pick_next_task() into thinking that a delayed task is an
eligible task (without dequeuing it).

[Proposal]
In __sched_setscheduler(), when trying to change the policy of a
delayed task, do not re-enqueue the delayed task, thus avoiding it being
picked again. The side effect is that the delayed task cannot wait for
its 0-vlag time to be dequeued, but this effect should be negligible.

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202408161619.9ed8b83e-lkp@intel.com
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
 kernel/sched/syscalls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 4fae3cf25a3a..10859536e509 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -818,7 +818,8 @@ int __sched_setscheduler(struct task_struct *p,
 		if (oldprio < p->prio)
 			queue_flags |= ENQUEUE_HEAD;
 
-		enqueue_task(rq, p, queue_flags);
+		if (!p->se.sched_delayed)
+			enqueue_task(rq, p, queue_flags);
 	}
 	if (running)
 		set_next_task(rq, p);
-- 
2.25.1
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Chen Yu 1 year, 3 months ago
On 2024-08-27 at 17:17:20 +0800, Chen Yu wrote:
> On 2024-07-27 at 12:27:44 +0200, Peter Zijlstra wrote:
> > When dequeue_task() is delayed it becomes possible to exit a task (or
> > cgroup) that is still enqueued. Ensure things are dequeued before
> > freeing.
> > 
> > NOTE: switched_from_fair() causes spurious wakeups due to clearing
> > sched_delayed after enqueueing a task in another class that should've
> > been dequeued. This *should* be harmless.
> >
> 
> It might bring some expected behavior in some corner cases reported here:
> https://lore.kernel.org/lkml/202408161619.9ed8b83e-lkp@intel.com/
> As the block task might return from schedule() with TASK_INTERRUPTIBLE.
> 
> We cooked a patch to workaround it(as below).
> 
> thanks,
> Chenyu
>  
> >From 9251b25073d43aeac04a6ee69b590fbfa1b8e1a5 Mon Sep 17 00:00:00 2001
> From: Chen Yu <yu.c.chen@intel.com>
> Date: Mon, 26 Aug 2024 22:16:38 +0800
> Subject: [PATCH] sched/eevdf: Dequeue the delayed task when changing its
>  schedule policy
> 
> [Problem Statement]
> The following warning was reported:
> 
>  do not call blocking ops when !TASK_RUNNING; state=1 set at kthread_worker_fn (kernel/kthread.c:?)
>  WARNING: CPU: 1 PID: 674 at kernel/sched/core.c:8469 __might_sleep
> 
>  handle_bug
>  exc_invalid_op
>  asm_exc_invalid_op
>  __might_sleep
>  __might_sleep
>  kthread_worker_fn
>  kthread_worker_fn
>  kthread
>  __cfi_kthread_worker_fn
>  ret_from_fork
>  __cfi_kthread
>  ret_from_fork_asm
> 
> [Symptom]
> kthread_worker_fn()
>   ...
> repeat:
>   set_current_state(TASK_INTERRUPTIBLE);
>   ...
>   if (work) { // false
>     __set_current_state(TASK_RUNNING);
>     ...
>   } else if (!freezing(current)) {
>     schedule();
>     // after schedule, the state is still *TASK_INTERRUPTIBLE*
>   }
> 
>   try_to_freeze()
>     might_sleep() <--- trigger the warning
> 
> [Analysis]
> The question is after schedule(), the state remains TASK_INTERRUPTIBLE
> rather than TASK_RUNNING. The short answer is, someone has incorrectly
> picked the TASK_INTERRUPTIBLE task from the tree. The scenario is described
> below, and all steps happen on 1 CPU:
> 
> time
>  |
>  |
>  |
>  v
> 
>   kthread_worker_fn()   <--- t1
>     set_current_state(TASK_INTERRUPTIBLE)
>       schedule()
>         block_task(t1)
>           dequeue_entity(t1)
>             t1->sched_delayed = 1
> 
>         t2 = pick_next_task()
>           put_prev_task(t1)
>             enqueue_entity(t1)  <--- TASK_INTERRUPTIBLE in the tree
> 
>   t1 switches to t2
> 
>   erofs_init_percpu_worker()    <--- t2
>     sched_set_fifo_low(t1)
>       sched_setscheduler_nocheck(t1)
> 
>         __sched_setscheduler(t1)
>           t1->sched_class = &rt_sched_class
> 
>           check_class_changed(t1)
>             switched_from_fair(t1)
>               t1->sched_delayed = 0 <--- gotcha
> 
>               ** from now on, t1 in the tree is TASK_INTERRUPTIBLE **
>               ** and sched_delayed = 0 **
> 
>           preempt_enable()
>             preempt_schedule()
>               t1 = pick_next_task() <--- because sched_delayed = 0, eligible
> 
>   t2 switches back to t1, now t1 is TASK_INTERRUPTIBLE.
> 
> The cause is, switched_from_fair() incorrectly clear the sched_delayed
> flag and confuse the pick_next_task() that it thinks a delayed task is a
> eligible task(without dequeue it).
>

Valentin pointed out that after the requeue, t1 is in the new RT priolist.
So the value of sched_delayed does not matter much. The problem is
that rt priolist has a TASK_INTERRUPTIBLE task to be picked by next
schedule(). There is a fix from peter to dequeue this task in switched_from_fair(),
which can fix this problem. But I think the current proposal can save one extra
enqueue/dequeue operation, no?

thanks,
Chenyu 

> [Proposal]
> In the __sched_setscheduler() when trying to change the policy of that
> delayed task, do not re-enqueue the delayed task thus to avoid being
> picked again. The side effect that, the delayed task can not wait for
> its 0-vlag time to be dequeued, but its effect should be neglect.
> 
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202408161619.9ed8b83e-lkp@intel.com
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
>  kernel/sched/syscalls.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index 4fae3cf25a3a..10859536e509 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -818,7 +818,8 @@ int __sched_setscheduler(struct task_struct *p,
>  		if (oldprio < p->prio)
>  			queue_flags |= ENQUEUE_HEAD;
>  
> -		enqueue_task(rq, p, queue_flags);
> +		if (!p->se.sched_delayed)
> +			enqueue_task(rq, p, queue_flags);
>  	}
>  	if (running)
>  		set_next_task(rq, p);
> -- 
> 2.25.1
>
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Valentin Schneider 1 year, 4 months ago
On 27/07/24 12:27, Peter Zijlstra wrote:
> @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
>  static void switched_from_fair(struct rq *rq, struct task_struct *p)
>  {
>       detach_task_cfs_rq(p);
> +	/*
> +	 * Since this is called after changing class, this isn't quite right.
> +	 * Specifically, this causes the task to get queued in the target class
> +	 * and experience a 'spurious' wakeup.
> +	 *
> +	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
> +	 * problem.
> +	 */
> +	p->se.sched_delayed = 0;
> +	/*
> +	 * While here, also clear the vlag, it makes little sense to carry that
> +	 * over the excursion into the new class.
> +	 */
> +	p->se.vlag = 0;

RQ lock is held, the task can't be current if it's ->sched_delayed; is a
dequeue_task() not possible at this point?  Or just not worth it?
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> On 27/07/24 12:27, Peter Zijlstra wrote:
> > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> >  static void switched_from_fair(struct rq *rq, struct task_struct *p)
> >  {
> >       detach_task_cfs_rq(p);
> > +	/*
> > +	 * Since this is called after changing class, this isn't quite right.
> > +	 * Specifically, this causes the task to get queued in the target class
> > +	 * and experience a 'spurious' wakeup.
> > +	 *
> > +	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > +	 * problem.
> > +	 */
> > +	p->se.sched_delayed = 0;
> > +	/*
> > +	 * While here, also clear the vlag, it makes little sense to carry that
> > +	 * over the excursion into the new class.
> > +	 */
> > +	p->se.vlag = 0;
> 
> RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> dequeue_task() not possible at this point?  Or just not worth it?

Hurmph, I really can't remember why I did it like this :-(

Also, I remember thinking this vlag reset might not be ideal, PI induced
class excursions might be very short and would benefit from retaining
vlag.

Let me make this something like:

  if (se->sched_delayed)
    dequeue_entities(rq, &p->se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> > On 27/07/24 12:27, Peter Zijlstra wrote:
> > > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > >  static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > >  {
> > >       detach_task_cfs_rq(p);
> > > +	/*
> > > +	 * Since this is called after changing class, this isn't quite right.
> > > +	 * Specifically, this causes the task to get queued in the target class
> > > +	 * and experience a 'spurious' wakeup.
> > > +	 *
> > > +	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > > +	 * problem.
> > > +	 */
> > > +	p->se.sched_delayed = 0;
> > > +	/*
> > > +	 * While here, also clear the vlag, it makes little sense to carry that
> > > +	 * over the excursion into the new class.
> > > +	 */
> > > +	p->se.vlag = 0;
> > 
> > RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> > dequeue_task() not possible at this point?  Or just not worth it?
> 
> Hurmph, I really can't remember why I did it like this :-(

Obviously I remember it right after hitting send...

We've just done:

	dequeue_task();
	p->sched_class = some_other_class;
	enqueue_task();

IOW, we're enqueued as some other class at this point. There is no way
we can fix it up at this point.

Perhaps I can use the sched_class::switching_to thing sched_ext will
bring.

For now, lets keep things as is. I'll look at things later
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> > > On 27/07/24 12:27, Peter Zijlstra wrote:
> > > > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > > >  static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > > >  {
> > > >       detach_task_cfs_rq(p);
> > > > +	/*
> > > > +	 * Since this is called after changing class, this isn't quite right.
> > > > +	 * Specifically, this causes the task to get queued in the target class
> > > > +	 * and experience a 'spurious' wakeup.
> > > > +	 *
> > > > +	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > > > +	 * problem.
> > > > +	 */
> > > > +	p->se.sched_delayed = 0;
> > > > +	/*
> > > > +	 * While here, also clear the vlag, it makes little sense to carry that
> > > > +	 * over the excursion into the new class.
> > > > +	 */
> > > > +	p->se.vlag = 0;
> > > 
> > > RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> > > dequeue_task() not possible at this point?  Or just not worth it?
> > 
> > Hurmph, I really can't remember why I did it like this :-(
> 
> Obviously I remember it right after hitting send...
> 
> We've just done:
> 
> 	dequeue_task();
> 	p->sched_class = some_other_class;
> 	enqueue_task();
> 
> IOW, we're enqueued as some other class at this point. There is no way
> we can fix it up at this point.

With just a little more sleep than last night, perhaps you're right
after all. Yes we're on a different class, but we can *still* dequeue it
again.


That is, something like the below ... I'll stick it on and see if
anything falls over.

---
 kernel/sched/fair.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 714826d97ef2..53c8f3ccfd0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13105,20 +13105,16 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
 	/*
-	 * Since this is called after changing class, this isn't quite right.
-	 * Specifically, this causes the task to get queued in the target class
-	 * and experience a 'spurious' wakeup.
-	 *
-	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
-	 * problem.
-	 */
-	p->se.sched_delayed = 0;
-	/*
-	 * While here, also clear the vlag, it makes little sense to carry that
-	 * over the excursion into the new class.
+	 * Since this is called after changing class, this is a little weird
+	 * and we cannot use DEQUEUE_DELAYED.
 	 */
-	p->se.vlag = 0;
-	p->se.rel_deadline = 0;
+	if (p->se.sched_delayed) {
+		dequeue_task(DEQUEUE_NOCLOCK, DEQUEUE_SLEEP);
+		p->se.sched_delayed = 0;
+		p->se.rel_deadline = 0;
+		if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
+			p->se.vlag = 0;
+	}
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Chen Yu 1 year, 3 months ago
On 2024-08-14 at 07:53:30 +0200, Peter Zijlstra wrote:
> On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> > > On Tue, Aug 13, 2024 at 02:43:47PM +0200, Valentin Schneider wrote:
> > > > On 27/07/24 12:27, Peter Zijlstra wrote:
> > > > > @@ -12817,10 +12830,26 @@ static void attach_task_cfs_rq(struct ta
> > > > >  static void switched_from_fair(struct rq *rq, struct task_struct *p)
> > > > >  {
> > > > >       detach_task_cfs_rq(p);
> > > > > +	/*
> > > > > +	 * Since this is called after changing class, this isn't quite right.
> > > > > +	 * Specifically, this causes the task to get queued in the target class
> > > > > +	 * and experience a 'spurious' wakeup.
> > > > > +	 *
> > > > > +	 * However, since 'spurious' wakeups are harmless, this shouldn't be a
> > > > > +	 * problem.
> > > > > +	 */
> > > > > +	p->se.sched_delayed = 0;
> > > > > +	/*
> > > > > +	 * While here, also clear the vlag, it makes little sense to carry that
> > > > > +	 * over the excursion into the new class.
> > > > > +	 */
> > > > > +	p->se.vlag = 0;
> > > > 
> > > > RQ lock is held, the task can't be current if it's ->sched_delayed; is a
> > > > dequeue_task() not possible at this point?  Or just not worth it?
> > > 
> > > Hurmph, I really can't remember why I did it like this :-(
> > 
> > Obviously I remember it right after hitting send...
> > 
> > We've just done:
> > 
> > 	dequeue_task();
> > 	p->sched_class = some_other_class;
> > 	enqueue_task();
> > 
> > IOW, we're enqueued as some other class at this point. There is no way
> > we can fix it up at this point.
> 
> With just a little more sleep than last night, perhaps you're right
> after all. Yes we're on a different class, but we can *still* dequeue it
> again.

Not quite get this. If the old class is cfs, the task is in a rb-tree. And
if the new class is rt then the task is in the prio list. Just wonder
would the rt.dequeue break the data of rb-tree?

thanks,
Chenyu
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Valentin Schneider 1 year, 3 months ago
On 27/08/24 17:35, Chen Yu wrote:
> On 2024-08-14 at 07:53:30 +0200, Peter Zijlstra wrote:
>> On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
>> > On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
>> >
>> > Obviously I remember it right after hitting send...
>> >
>> > We've just done:
>> >
>> >    dequeue_task();
>> >    p->sched_class = some_other_class;
>> >    enqueue_task();
>> >
>> > IOW, we're enqueued as some other class at this point. There is no way
>> > we can fix it up at this point.
>>
>> With just a little more sleep than last night, perhaps you're right
>> after all. Yes we're on a different class, but we can *still* dequeue it
>> again.
>
> Not quite get this. If the old class is cfs, the task is in a rb-tree. And
> if the new class is rt then the task is in the prio list. Just wonder
> would the rt.dequeue break the data of rb-tree?
>

On a class change e.g. CFS to RT, __sched_setscheduler() would
dequeue_task() (take it out of the RB tree), change the class,
enqueue_task() (put it in the RT priolist).

Then check_class_changed()->switched_from_fair() happens, dequeue_task(),
and that takes it out of the RT priolist. At least that's the theory, since
that currently explodes...

> thanks,
> Chenyu
Re: [PATCH 12/24] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by Chen Yu 1 year, 3 months ago
On 2024-08-27 at 22:29:50 +0200, Valentin Schneider wrote:
> On 27/08/24 17:35, Chen Yu wrote:
> > On 2024-08-14 at 07:53:30 +0200, Peter Zijlstra wrote:
> >> On Wed, Aug 14, 2024 at 12:07:57AM +0200, Peter Zijlstra wrote:
> >> > On Tue, Aug 13, 2024 at 11:54:21PM +0200, Peter Zijlstra wrote:
> >> >
> >> > Obviously I remember it right after hitting send...
> >> >
> >> > We've just done:
> >> >
> >> >    dequeue_task();
> >> >    p->sched_class = some_other_class;
> >> >    enqueue_task();
> >> >
> >> > IOW, we're enqueued as some other class at this point. There is no way
> >> > we can fix it up at this point.
> >>
> >> With just a little more sleep than last night, perhaps you're right
> >> after all. Yes we're on a different class, but we can *still* dequeue it
> >> again.
> >
> > Not quite get this. If the old class is cfs, the task is in a rb-tree. And
> > if the new class is rt then the task is in the prio list. Just wonder
> > would the rt.dequeue break the data of rb-tree?
> >
> 
> On a class change e.g. CFS to RT, __sched_setscheduler() would
> dequeue_task() (take it out of the RB tree), change the class,
> enqueue_task() (put it in the RT priolist).
> 
> Then check_class_changed()->switched_from_fair() happens, dequeue_task(),
> and that takes it out of the RT priolist. At least that's the theory, since
> that currently explodes...
>

I see, thanks for this explanation! I overlooked that the task is already on
the rt priolist. I applied Peter's dequeue patch with a minor fix and did not see
the warning[1] after several test cycles (previously it was 100% reproducible).

[1] https://lore.kernel.org/lkml/Zs2ZoAcUsZMX2B%2FI@chenyu5-mobl2/

thanks,
Chenyu
[tip: sched/core] sched/fair: Prepare exit/cleanup paths for delayed_dequeue
Posted by tip-bot2 for Peter Zijlstra 1 year, 3 months ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     2e0199df252a536a03f4cb0810324dff523d1e79
Gitweb:        https://git.kernel.org/tip/2e0199df252a536a03f4cb0810324dff523d1e79
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 23 May 2024 11:03:42 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:43 +02:00

sched/fair: Prepare exit/cleanup paths for delayed_dequeue

When dequeue_task() is delayed it becomes possible to exit a task (or
cgroup) that is still enqueued. Ensure things are dequeued before
freeing.

Thanks to Valentin for asking the obvious questions and making
switched_from_fair() less weird.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105029.631948434@infradead.org
---
 kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 37acd53..9a84903 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8342,7 +8342,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 
 static void task_dead_fair(struct task_struct *p)
 {
-	remove_entity_load_avg(&p->se);
+	struct sched_entity *se = &p->se;
+
+	if (se->sched_delayed) {
+		struct rq_flags rf;
+		struct rq *rq;
+
+		rq = task_rq_lock(p, &rf);
+		if (se->sched_delayed) {
+			update_rq_clock(rq);
+			dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		}
+		task_rq_unlock(rq, p, &rf);
+	}
+
+	remove_entity_load_avg(se);
 }
 
 /*
@@ -12854,10 +12868,22 @@ static void attach_task_cfs_rq(struct task_struct *p)
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
+	/*
+	 * Since this is called after changing class, this is a little weird
+	 * and we cannot use DEQUEUE_DELAYED.
+	 */
+	if (p->se.sched_delayed) {
+		dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
+		p->se.sched_delayed = 0;
+		if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
+			p->se.vlag = 0;
+	}
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	SCHED_WARN_ON(p->se.sched_delayed);
+
 	attach_task_cfs_rq(p);
 
 	set_task_max_allowed_capacity(p);
@@ -13008,28 +13034,35 @@ void online_fair_sched_group(struct task_group *tg)
 
 void unregister_fair_sched_group(struct task_group *tg)
 {
-	unsigned long flags;
-	struct rq *rq;
 	int cpu;
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(cpu) {
-		if (tg->se[cpu])
-			remove_entity_load_avg(tg->se[cpu]);
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+		struct sched_entity *se = tg->se[cpu];
+		struct rq *rq = cpu_rq(cpu);
+
+		if (se) {
+			if (se->sched_delayed) {
+				guard(rq_lock_irqsave)(rq);
+				if (se->sched_delayed) {
+					update_rq_clock(rq);
+					dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+				}
+				list_del_leaf_cfs_rq(cfs_rq);
+			}
+			remove_entity_load_avg(se);
+		}
 
 		/*
 		 * Only empty task groups can be destroyed; so we can speculatively
 		 * check on_list without danger of it being re-added.
 		 */
-		if (!tg->cfs_rq[cpu]->on_list)
-			continue;
-
-		rq = cpu_rq(cpu);
-
-		raw_spin_rq_lock_irqsave(rq, flags);
-		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-		raw_spin_rq_unlock_irqrestore(rq, flags);
+		if (cfs_rq->on_list) {
+			guard(rq_lock_irqsave)(rq);
+			list_del_leaf_cfs_rq(cfs_rq);
+		}
 	}
 }