Delayed dequeue has tasks sit around on the runqueue that are not
actually runnable -- specifically, they will be dequeued the moment
they get picked.
One side-effect is that such a task can get migrated, which leads to a
'nested' dequeue_task() scenario that messes up uclamp if we don't
take care.
Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
the runqueue. This however will have removed the task from uclamp --
per uclamp_rq_dec() in dequeue_task(). So far so good.
However, if at that point the task gets migrated -- or nice adjusted
or any of a myriad of operations that does a dequeue-enqueue cycle --
we'll pass through dequeue_task()/enqueue_task() again. Without
modification this will lead to a double decrement for uclamp, which is
wrong.
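Concretely (a rough sketch of the failing sequence, not literal code):

  dequeue_task(DEQUEUE_SLEEP)
    uclamp_rq_dec()              // uclamp contribution removed
    ->dequeue_task() 'fails'     // task stays on the rq, ->sched_delayed set
  ... migration / nice adjustment ...
  dequeue_task()
    uclamp_rq_dec()              // second decrement for the same enqueue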
Reported-by: Luis Machado <luis.machado@arm.com>
Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
@@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
@@ -1979,8 +1985,12 @@ void enqueue_task(struct rq *rq, struct
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
- uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ /*
+ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
+ * ->sched_delayed.
+ */
+ uclamp_rq_inc(rq, p);
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
@@ -2002,6 +2012,10 @@ inline bool dequeue_task(struct rq *rq,
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ /*
+ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+ * and mark the task ->sched_delayed.
+ */
uclamp_rq_dec(rq, p);
return p->sched_class->dequeue_task(rq, p, flags);
}
Hi Peter,
Sorry for bombarding this thread in the last couple of days. I'm seeing
several issues in the latest tip/sched/core after these patches landed.
What I'm now seeing seems to be an unbalanced util_est. First, I applied
the following diff to warn against util_est != 0 when no tasks are on
the queue:
https://lore.kernel.org/all/752ae417c02b9277ca3ec18893747c54dd5f277f.1724245193.git.hongyan.xia2@arm.com/
Then, I'm reliably seeing warnings on my Juno board during boot in
latest tip/sched/core.
If I do the same thing to util_est just like what you did in this uclamp
patch, like this:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 574ef19df64b..58aac42c99e5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6946,7 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
if (flags & ENQUEUE_DELAYED) {
requeue_delayed_entity(se);
- return;
+ goto util_est;
}
/*
@@ -6955,7 +6955,6 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -7050,6 +7049,9 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
+util_est:
+ if (!p->se.sched_delayed)
+ util_est_enqueue(&rq->cfs, p);
}
static void set_next_buddy(struct sched_entity *se);
@@ -7173,7 +7175,8 @@ static int dequeue_entities(struct rq *rq, struct
sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p,
int flags)
{
- util_est_dequeue(&rq->cfs, p);
+ if (!p->se.sched_delayed)
+ util_est_dequeue(&rq->cfs, p);
if (dequeue_entities(rq, &p->se, flags) < 0) {
if (!rq->cfs.h_nr_running)
which basically enqueues util_est after enqueue_task_fair(), dequeues
util_est before dequeue_task_fair() and double-checks p->se.sched_delayed;
with that, the unbalanced issue seems to go away.
Hopefully this helps you in finding where the problem could be.
Hongyan
On 27/07/2024 11:27, Peter Zijlstra wrote:
> Delayed dequeue has tasks sit around on the runqueue that are not
> actually runnable -- specifically, they will be dequeued the moment
> they get picked.
>
> One side-effect is that such a task can get migrated, which leads to a
> 'nested' dequeue_task() scenario that messes up uclamp if we don't
> take care.
>
> Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
> the runqueue. This however will have removed the task from uclamp --
> per uclamp_rq_dec() in dequeue_task(). So far so good.
>
> However, if at that point the task gets migrated -- or nice adjusted
> or any of a myriad of operations that does a dequeue-enqueue cycle --
> we'll pass through dequeue_task()/enqueue_task() again. Without
> modification this will lead to a double decrement for uclamp, which is
> wrong.
>
> Reported-by: Luis Machado <luis.machado@arm.com>
> Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/core.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_inc_id(rq, p, clamp_id);
>
> @@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_dec_id(rq, p, clamp_id);
> }
> @@ -1979,8 +1985,12 @@ void enqueue_task(struct rq *rq, struct
> psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
> }
>
> - uclamp_rq_inc(rq, p);
> p->sched_class->enqueue_task(rq, p, flags);
> + /*
> + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> + * ->sched_delayed.
> + */
> + uclamp_rq_inc(rq, p);
>
> if (sched_core_enabled(rq))
> sched_core_enqueue(rq, p);
> @@ -2002,6 +2012,10 @@ inline bool dequeue_task(struct rq *rq,
> psi_dequeue(p, flags & DEQUEUE_SLEEP);
> }
>
> + /*
> + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
> + * and mark the task ->sched_delayed.
> + */
> uclamp_rq_dec(rq, p);
> return p->sched_class->dequeue_task(rq, p, flags);
> }
>
>
Hi,
On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com> wrote:
>
> Hi Peter,
>
> Sorry for bombarding this thread in the last couple of days. I'm seeing
> several issues in the latest tip/sched/core after these patches landed.
>
> What I'm now seeing seems to be an unbalanced call of util_est. First, I applied
I also see a remaining util_est for an idle rq because of unbalanced
calls of util_est_enqueue|dequeue
> the following diff to warn against util_est != 0 when no tasks are on
> the queue:
>
> https://lore.kernel.org/all/752ae417c02b9277ca3ec18893747c54dd5f277f.1724245193.git.hongyan.xia2@arm.com/
>
> Then, I'm reliably seeing warnings on my Juno board during boot in
> latest tip/sched/core.
>
> If I do the same thing to util_est just like what you did in this uclamp
> patch, like this:
I think the solution is simpler than your proposal: we just need to
always call util_est_enqueue() before requeue_delayed_entity()
@@ -6970,11 +6970,6 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
- if (flags & ENQUEUE_DELAYED) {
- requeue_delayed_entity(se);
- return;
- }
-
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6983,6 +6978,11 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
+
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 574ef19df64b..58aac42c99e5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6946,7 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
>
> if (flags & ENQUEUE_DELAYED) {
> requeue_delayed_entity(se);
> - return;
> + goto util_est;
> }
>
> /*
> @@ -6955,7 +6955,6 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> * Let's add the task's estimated utilization to the cfs_rq's
> * estimated utilization, before we update schedutil.
> */
> - util_est_enqueue(&rq->cfs, p);
>
> /*
> * If in_iowait is set, the code below may not trigger any cpufreq
> @@ -7050,6 +7049,9 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> assert_list_leaf_cfs_rq(rq);
>
> hrtick_update(rq);
> +util_est:
> + if (!p->se.sched_delayed)
> + util_est_enqueue(&rq->cfs, p);
> }
>
> static void set_next_buddy(struct sched_entity *se);
> @@ -7173,7 +7175,8 @@ static int dequeue_entities(struct rq *rq, struct
> sched_entity *se, int flags)
> */
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p,
> int flags)
> {
> - util_est_dequeue(&rq->cfs, p);
> + if (!p->se.sched_delayed)
> + util_est_dequeue(&rq->cfs, p);
>
> if (dequeue_entities(rq, &p->se, flags) < 0) {
> if (!rq->cfs.h_nr_running)
>
> which is basically enqueuing util_est after enqueue_task_fair(),
> dequeuing util_est before dequeue_task_fair() and double check
> p->se.delayed_dequeue, then the unbalanced issue seems to go away.
>
> Hopefully this helps you in finding where the problem could be.
>
> Hongyan
>
> On 27/07/2024 11:27, Peter Zijlstra wrote:
> > [...]
On 8/22/24 09:19, Vincent Guittot wrote:
> Hi,
>
> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com> wrote:
>>
>> Hi Peter,
>>
>> Sorry for bombarding this thread in the last couple of days. I'm seeing
>> several issues in the latest tip/sched/core after these patches landed.
>>
>> What I'm now seeing seems to be an unbalanced call of util_est. First, I applied
>
> I also see a remaining util_est for idle rq because of an unbalance
> call of util_est_enqueue|dequeue
>
I can confirm issues with the utilization values and frequencies being driven
seemingly incorrectly, in particular for little cores.
What I'm seeing with the stock series is high utilization values for some tasks
and little cores having their frequencies maxed out for extended periods of
time. Sometimes for 5+ or 10+ seconds, which is excessive as the cores are mostly
idle. But whenever certain tasks get scheduled there, they have a very high util
level and so the frequency is kept at max.
As a consequence this drives up power usage.
I gave Hongyan's draft fix a try and observed a much more reasonable behavior for
the util numbers and frequencies being used for the little cores. With his fix,
I can also see lower energy use for my specific benchmark.
>> the following diff to warn against util_est != 0 when no tasks are on
>> the queue:
>>
>> https://lore.kernel.org/all/752ae417c02b9277ca3ec18893747c54dd5f277f.1724245193.git.hongyan.xia2@arm.com/
>>
>> Then, I'm reliably seeing warnings on my Juno board during boot in
>> latest tip/sched/core.
>>
>> If I do the same thing to util_est just like what you did in this uclamp
>> patch, like this:
>
> I think that the solution is simpler than your proposal and we just
> need to always call util_est_enqueue() before the
> requeue_delayed_entity
>
> @@ -6970,11 +6970,6 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
>
> - if (flags & ENQUEUE_DELAYED) {
> - requeue_delayed_entity(se);
> - return;
> - }
> -
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> @@ -6983,6 +6978,11 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> + if (flags & ENQUEUE_DELAYED) {
> + requeue_delayed_entity(se);
> + return;
> + }
> +
> /*
> * If in_iowait is set, the code below may not trigger any cpufreq
> * utilization updates, so do it here explicitly with the IOWAIT flag
>
>
>>
>> [...]
>>
>> which is basically enqueuing util_est after enqueue_task_fair(),
>> dequeuing util_est before dequeue_task_fair() and double check
>> p->se.delayed_dequeue, then the unbalanced issue seems to go away.
>>
>> Hopefully this helps you in finding where the problem could be.
>>
>> Hongyan
>>
>> On 27/07/2024 11:27, Peter Zijlstra wrote:
>>> [...]
On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com> wrote:
>
> On 8/22/24 09:19, Vincent Guittot wrote:
> > Hi,
> >
> > On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com> wrote:
> >>
> >> Hi Peter,
> >>
> >> Sorry for bombarding this thread in the last couple of days. I'm seeing
> >> several issues in the latest tip/sched/core after these patches landed.
> >>
> >> What I'm now seeing seems to be an unbalanced call of util_est. First, I applied
> >
> > I also see a remaining util_est for idle rq because of an unbalance
> > call of util_est_enqueue|dequeue
> >
>
> I can confirm issues with the utilization values and frequencies being driven
> seemingly incorrectly, in particular for little cores.
>
> What I'm seeing with the stock series is high utilization values for some tasks
> and little cores having their frequencies maxed out for extended periods of
> time. Sometimes for 5+ or 10+ seconds, which is excessive as the cores are mostly
> idle. But whenever certain tasks get scheduled there, they have a very high util
> level and so the frequency is kept at max.
>
> As a consequence this drives up power usage.
>
> I gave Hongyan's draft fix a try and observed a much more reasonable behavior for
> the util numbers and frequencies being used for the little cores. With his fix,
> I can also see lower energy use for my specific benchmark.
The main problem is that the util_est of a delayed dequeued task
remains on the rq and keeps the rq utilization high and as a result
the frequency higher than needed.
The below seems to work for me and keeps the enqueue/dequeue of
util_est in sync with the enqueue/dequeue of the task, as if the dequeue
was not delayed
Another benefit is that we will not try to migrate a delayed-dequeue
sleeping task, which doesn't actually impact the current load of the cpu
and as a result would not help the load balance. I haven't yet fully
checked what would happen with hotplug
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea057b311f6..0970bcdc889a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6944,11 +6944,6 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
- if (flags & ENQUEUE_DELAYED) {
- requeue_delayed_entity(se);
- return;
- }
-
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6957,6 +6952,11 @@ enqueue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
+
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
@@ -9276,6 +9276,8 @@ int can_migrate_task(struct task_struct *p,
struct lb_env *env)
lockdep_assert_rq_held(env->src_rq);
+ if (p->se.sched_delayed)
+ return 0;
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
>
>
> >> the following diff to warn against util_est != 0 when no tasks are on
> >> the queue:
> >>
> >> https://lore.kernel.org/all/752ae417c02b9277ca3ec18893747c54dd5f277f.1724245193.git.hongyan.xia2@arm.com/
> >>
> >> Then, I'm reliably seeing warnings on my Juno board during boot in
> >> latest tip/sched/core.
> >>
> >> If I do the same thing to util_est just like what you did in this uclamp
> >> patch, like this:
> >
> > I think that the solution is simpler than your proposal and we just
> > need to always call util_est_enqueue() before the
> > requeue_delayed_entity
> >
> > @@ -6970,11 +6970,6 @@ enqueue_task_fair(struct rq *rq, struct
> > task_struct *p, int flags)
> > int rq_h_nr_running = rq->cfs.h_nr_running;
> > u64 slice = 0;
> >
> > - if (flags & ENQUEUE_DELAYED) {
> > - requeue_delayed_entity(se);
> > - return;
> > - }
> > -
> > /*
> > * The code below (indirectly) updates schedutil which looks at
> > * the cfs_rq utilization to select a frequency.
> > @@ -6983,6 +6978,11 @@ enqueue_task_fair(struct rq *rq, struct
> > task_struct *p, int flags)
> > */
> > util_est_enqueue(&rq->cfs, p);
> >
> > + if (flags & ENQUEUE_DELAYED) {
> > + requeue_delayed_entity(se);
> > + return;
> > + }
> > +
> > /*
> > * If in_iowait is set, the code below may not trigger any cpufreq
> > * utilization updates, so do it here explicitly with the IOWAIT flag
> >
> >
> >>
> >> [...]
> >>
> >> which is basically enqueuing util_est after enqueue_task_fair(),
> >> dequeuing util_est before dequeue_task_fair() and double check
> >> p->se.delayed_dequeue, then the unbalanced issue seems to go away.
> >>
> >> Hopefully this helps you in finding where the problem could be.
> >>
> >> Hongyan
> >>
> >> On 27/07/2024 11:27, Peter Zijlstra wrote:
> >>> [...]
>
On 8/22/24 10:53, Vincent Guittot wrote:
> [...]
>
> The main problem is that the util_est of a delayed dequeued task
> remains on the rq and keeps the rq utilization high and as a result
> the frequency higher than needed.
>
> The below seems to work for me and keeps the enqueue/dequeue of
> util_est in sync with the enqueue/dequeue of the task, as if the dequeue
> was not delayed
>
> Another benefit is that we will not try to migrate a delayed-dequeue
> sleeping task, which doesn't actually impact the current load of the cpu
> and as a result would not help the load balance. I haven't yet fully
> checked what would happen with hotplug

Thanks. Those are good points. Let me go and try your patch.
Vincent,

On 8/22/24 11:28, Luis Machado wrote:
> [...]
>
> Thanks. Those are good points. Let me go and try your patch.

I gave your fix a try, but it seems to make things worse. It is comparable
to the behavior we had before Peter added the uclamp imbalance fix, so I
believe there is something incorrect there.
On Thu, 22 Aug 2024 at 14:08, Luis Machado <luis.machado@arm.com> wrote:
> [...]
>
> I gave your fix a try, but it seems to make things worse. It is comparable
> to the behavior we had before Peter added the uclamp imbalance fix, so I
> believe there is something incorrect there.

We need to filter cases where tasks are enqueued/dequeued several
consecutive times. That's what I'm looking at now.
On Thu, 22 Aug 2024 at 14:10, Vincent Guittot <vincent.guittot@linaro.org> wrote:
> [...]
>
> We need to filter cases where tasks are enqueued/dequeued several
> consecutive times. That's what I'm looking at now.

I just realized that it's not only util_est: h_nr_running keeps the
delayed tasks as well, so all of the rq stats are biased: h_nr_running,
util_est, runnable_avg and load_avg.
On 22/08/2024 15:58, Vincent Guittot wrote:
> [...]
>
> I just realized that it's not only util_est: h_nr_running keeps the
> delayed tasks as well, so all of the rq stats are biased: h_nr_running,
> util_est, runnable_avg and load_avg.

After staring at the code even more, I think the situation is worse.

First thing is that uclamp might also want to be part of these stats
(h_nr_running, util_est, runnable_avg, load_avg) that do not follow
delayed dequeue, which needs to be specially handled in the same way.
The current way of handling uclamp in core.c misses the frequency
update, like I commented before.

Second, there is also an out-of-sync issue in update_load_avg(). We only
update the task-level se on delayed dequeue and requeue, but we return
early and the upper levels are completely skipped, as if the delayed
task were still on the rq. This de-sync is wrong.
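To illustrate the second point with the enqueue hunk quoted earlier in
this thread (a rough sketch, not the exact tree state):

  enqueue_task_fair()
          if (flags & ENQUEUE_DELAYED) {
                  requeue_delayed_entity(se);  /* only touches the task se */
                  return;  /* the for_each_sched_entity() walks below are
                            * skipped, so the parent cfs_rq levels are
                            * never updated */
          }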
On 29/08/2024 17:42, Hongyan Xia wrote:
> On 22/08/2024 15:58, Vincent Guittot wrote:
>> On Thu, 22 Aug 2024 at 14:10, Vincent Guittot
>> <vincent.guittot@linaro.org> wrote:
>>>
>>> On Thu, 22 Aug 2024 at 14:08, Luis Machado <luis.machado@arm.com> wrote:
>>>>
>>>> Vincent,
>>>>
>>>> On 8/22/24 11:28, Luis Machado wrote:
>>>>> On 8/22/24 10:53, Vincent Guittot wrote:
>>>>>> On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com>
>>>>>> wrote:
>>>>>>>
>>>>>>> On 8/22/24 09:19, Vincent Guittot wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com>
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>> Hi Peter,
>>>>>>>>>
>>>>>>>>> Sorry for bombarding this thread in the last couple of days.
>>>>>>>>> I'm seeing
>>>>>>>>> several issues in the latest tip/sched/core after these patches
>>>>>>>>> landed.
>>>>>>>>>
>>>>>>>>> What I'm now seeing seems to be an unbalanced call of util_est.
>>>>>>>>> First, I applied
>>>>>>>>
>>>>>>>> I also see a remaining util_est for idle rq because of an unbalance
>>>>>>>> call of util_est_enqueue|dequeue
>>>>>>>>
>>>>>>>
>>>>>>> I can confirm issues with the utilization values and frequencies
>>>>>>> being driven
>>>>>>> seemingly incorrectly, in particular for little cores.
>>>>>>>
>>>>>>> What I'm seeing with the stock series is high utilization values
>>>>>>> for some tasks
>>>>>>> and little cores having their frequencies maxed out for extended
>>>>>>> periods of
>>>>>>> time. Sometimes for 5+ or 10+ seconds, which is excessive as the
>>>>>>> cores are mostly
>>>>>>> idle. But whenever certain tasks get scheduled there, they have a
>>>>>>> very high util
>>>>>>> level and so the frequency is kept at max.
>>>>>>>
>>>>>>> As a consequence this drives up power usage.
>>>>>>>
>>>>>>> I gave Hongyan's draft fix a try and observed a much more
>>>>>>> reasonable behavior for
>>>>>>> the util numbers and frequencies being used for the little cores.
>>>>>>> With his fix,
>>>>>>> I can also see lower energy use for my specific benchmark.
>>>>>>
>>>>>> The main problem is that the util_est of a delayed dequeued task
>>>>>> remains on the rq and keeps the rq utilization high and as a result
>>>>>> the frequency higher than needed.
>>>>>>
>>>>>> The below seems to works for me and keep sync the enqueue/dequeue of
>>>>>> uti_test with the enqueue/dequeue of the task as if de dequeue was
>>>>>> not
>>>>>> delayed
>>>>>>
>>>>>> Another interest is that we will not try to migrate a delayed dequeue
>>>>>> sleeping task that doesn't actually impact the current load of the
>>>>>> cpu
>>>>>> and as a result will not help in the load balance. I haven't yet
>>>>>> fully
>>>>>> checked what would happen with hotplug
>>>>>
>>>>> Thanks. Those are good points. Let me go and try your patch.
>>>>
>>>> I gave your fix a try, but it seems to make things worse. It is
>>>> comparable
>>>> to the behavior we had before Peter added the uclamp imbalance fix,
>>>> so I
>>>> believe there is something incorrect there.
>>>
>>> we need to filter case where task are enqueued/dequeued several
>>> consecutive times. That's what I'm look now
>>
>> I just realize before that It's not only util_est but the h_nr_running
>> that keeps delayed tasks as well so all stats of the rq are biased:
>> h_nr_running, util_est, runnable avg and load_avg.
>
> After staring at the code even more, I think the situation is worse.
>
> First thing is that uclamp might also want to be part of these stats
> (h_nr_running, util_est, runnable_avg, load_avg) that do not follow
> delayed dequeue which needs to be specially handled in the same way. The
> current way of handling uclamp in core.c misses the frequency update,
> like I commented before.
>
> Second, there is also an out-of-sync issue in update_load_avg(). We only
> update the task-level se in delayed dequeue and requeue, but we return
> early and the upper levels are completely skipped, as if the delayed
> task is still on rq. This de-sync is wrong.
I had a look at the util_est issue.
This keeps rq->cfs.avg.util_avg sane for me with
SCHED_FEAT(DELAY_DEQUEUE, true):
-->8--
From 0d7e8d057f49a47e0f3f484ac7d41e047dccec38 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 5 Sep 2024 00:05:23 +0200
Subject: [PATCH] kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
Remove delayed tasks from util_est even though they are runnable.
Exclude delayed tasks which are (a) migrating between rq's or (b) in a
SAVE/RESTORE dequeue/enqueue.
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
kernel/sched/fair.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1e693ca8ebd6..5c32cc26d6c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6948,18 +6948,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
- if (flags & ENQUEUE_DELAYED) {
- requeue_delayed_entity(se);
- return;
- }
-
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -7177,7 +7178,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- util_est_dequeue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ util_est_dequeue(&rq->cfs, p);
if (dequeue_entities(rq, &p->se, flags) < 0) {
if (!rq->cfs.h_nr_running)
--
2.34.1
On Thu, Sep 05, 2024 at 03:02:44PM +0200, Dietmar Eggemann wrote:
> From 0d7e8d057f49a47e0f3f484ac7d41e047dccec38 Mon Sep 17 00:00:00 2001
> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Date: Thu, 5 Sep 2024 00:05:23 +0200
> Subject: [PATCH] kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
>
> Remove delayed tasks from util_est even they are runnable.
>
> Exclude delayed task which are (a) migrating between rq's or (b) in a
> SAVE/RESTORE dequeue/enqueue.
>
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> ---
> kernel/sched/fair.c | 16 +++++++++-------
> 1 file changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1e693ca8ebd6..5c32cc26d6c2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6948,18 +6948,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
>
> - if (flags & ENQUEUE_DELAYED) {
> - requeue_delayed_entity(se);
> - return;
> - }
> -
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> * Let's add the task's estimated utilization to the cfs_rq's
> * estimated utilization, before we update schedutil.
> */
> - util_est_enqueue(&rq->cfs, p);
> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> + util_est_enqueue(&rq->cfs, p);
> +
> + if (flags & ENQUEUE_DELAYED) {
> + requeue_delayed_entity(se);
> + return;
> + }
>
> /*
> * If in_iowait is set, the code below may not trigger any cpufreq
> @@ -7177,7 +7178,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> */
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> {
> - util_est_dequeue(&rq->cfs, p);
> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> + util_est_dequeue(&rq->cfs, p);
>
> if (dequeue_entities(rq, &p->se, flags) < 0) {
> if (!rq->cfs.h_nr_running)
Thanks!
On Thu, 5 Sept 2024 at 15:02, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 29/08/2024 17:42, Hongyan Xia wrote:
> > On 22/08/2024 15:58, Vincent Guittot wrote:
> >> [...]
> >> I just realize before that It's not only util_est but the h_nr_running
> >> that keeps delayed tasks as well so all stats of the rq are biased:
> >> h_nr_running, util_est, runnable avg and load_avg.
> >
> > After staring at the code even more, I think the situation is worse.
> >
> > First thing is that uclamp might also want to be part of these stats
> > (h_nr_running, util_est, runnable_avg, load_avg) that do not follow
> > delayed dequeue which needs to be specially handled in the same way. The
> > current way of handling uclamp in core.c misses the frequency update,
> > like I commented before.
> >
> > Second, there is also an out-of-sync issue in update_load_avg(). We only
> > update the task-level se in delayed dequeue and requeue, but we return
> > early and the upper levels are completely skipped, as if the delayed
> > task is still on rq. This de-sync is wrong.
>
> I had a look at the util_est issue.
>
> This keeps rq->cfs.avg.util_avg sane for me with
> SCHED_FEAT(DELAY_DEQUEUE, true):
>
> -->8--
>
> From 0d7e8d057f49a47e0f3f484ac7d41e047dccec38 Mon Sep 17 00:00:00 2001
> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Date: Thu, 5 Sep 2024 00:05:23 +0200
> Subject: [PATCH] kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
>
> Remove delayed tasks from util_est even they are runnable.
Unfortunately, this is not only about util_est.

cfs_rq's runnable_avg is also wrong, because we normally have:

    cfs_rq's runnable_avg == \Sum se's runnable_avg

but cfs_rq's runnable_avg uses cfs_rq's h_nr_running, and delayed
entities are still accounted in h_nr_running.

That also means that cfs_rq's h_nr_running is not accurate anymore,
because it includes delayed-dequeue tasks, and cfs_rq's load_avg is
kept artificially high, which biases load_balance and cgroup shares.
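For reference, this is roughly how the cfs_rq PELT update consumes
h_nr_running (sketch based on __update_load_avg_cfs_rq() in
kernel/sched/pelt.c; trimmed, may not match the exact tree state):

int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
	if (___update_load_sum(now, &cfs_rq->avg,
			scale_load_down(cfs_rq->load.weight),
			cfs_rq->h_nr_running,	/* still counts delayed tasks */
			cfs_rq->curr != NULL)) {
		___update_load_avg(&cfs_rq->avg, 1);
		return 1;
	}

	return 0;
}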
>
> Exclude delayed task which are (a) migrating between rq's or (b) in a
> SAVE/RESTORE dequeue/enqueue.
>
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> ---
> kernel/sched/fair.c | 16 +++++++++-------
> 1 file changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1e693ca8ebd6..5c32cc26d6c2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6948,18 +6948,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
>
> - if (flags & ENQUEUE_DELAYED) {
> - requeue_delayed_entity(se);
> - return;
> - }
> -
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> * Let's add the task's estimated utilization to the cfs_rq's
> * estimated utilization, before we update schedutil.
> */
> - util_est_enqueue(&rq->cfs, p);
> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> + util_est_enqueue(&rq->cfs, p);
> +
> + if (flags & ENQUEUE_DELAYED) {
> + requeue_delayed_entity(se);
> + return;
> + }
>
> /*
> * If in_iowait is set, the code below may not trigger any cpufreq
> @@ -7177,7 +7178,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> */
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> {
> - util_est_dequeue(&rq->cfs, p);
> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> + util_est_dequeue(&rq->cfs, p);
>
> if (dequeue_entities(rq, &p->se, flags) < 0) {
> if (!rq->cfs.h_nr_running)
> --
> 2.34.1
>
On 05/09/2024 15:33, Vincent Guittot wrote:
> On Thu, 5 Sept 2024 at 15:02, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>
>> On 29/08/2024 17:42, Hongyan Xia wrote:
>>> On 22/08/2024 15:58, Vincent Guittot wrote:
>>>> On Thu, 22 Aug 2024 at 14:10, Vincent Guittot
>>>> <vincent.guittot@linaro.org> wrote:
>>>>>
>>>>> On Thu, 22 Aug 2024 at 14:08, Luis Machado <luis.machado@arm.com> wrote:
>>>>>>
>>>>>> Vincent,
>>>>>>
>>>>>> On 8/22/24 11:28, Luis Machado wrote:
>>>>>>> On 8/22/24 10:53, Vincent Guittot wrote:
>>>>>>>> On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com>
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>> On 8/22/24 09:19, Vincent Guittot wrote:
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com>
[...]
>>> After staring at the code even more, I think the situation is worse.
>>>
>>> First thing is that uclamp might also want to be part of these stats
>>> (h_nr_running, util_est, runnable_avg, load_avg) that do not follow
>>> delayed dequeue which needs to be specially handled in the same way. The
>>> current way of handling uclamp in core.c misses the frequency update,
>>> like I commented before.
>>>
>>> Second, there is also an out-of-sync issue in update_load_avg(). We only
>>> update the task-level se in delayed dequeue and requeue, but we return
>>> early and the upper levels are completely skipped, as if the delayed
>>> task is still on rq. This de-sync is wrong.
>>
>> I had a look at the util_est issue.
>>
>> This keeps rq->cfs.avg.util_avg sane for me with
>> SCHED_FEAT(DELAY_DEQUEUE, true):
>>
>> -->8--
>>
>> From 0d7e8d057f49a47e0f3f484ac7d41e047dccec38 Mon Sep 17 00:00:00 2001
>> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
>> Date: Thu, 5 Sep 2024 00:05:23 +0200
>> Subject: [PATCH] kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
>>
>> Remove delayed tasks from util_est even if they are runnable.
>
> Unfortunately, this is not only about util_est
>
> cfs_rq's runnable_avg is also wrong because we normally have :
> cfs_rq's runnable_avg == /Sum se's runnable_avg
> but cfs_rq's runnable_avg uses cfs_rq's h_nr_running but delayed
> entities are still accounted in h_nr_running
Yes, I agree.
se's runnable_avg should be fine already since:

  se_runnable()

    if (se->sched_delayed)
      return false

But then, like you said, __update_load_avg_cfs_rq() needs correct
cfs_rq->h_nr_running.

And I guess we need something like:

  se_on_rq()

    if (se->sched_delayed)
      return false

for

  __update_load_avg_se()

  -	if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
  +	if (___update_load_sum(now, &se->avg, se_on_rq(se), se_runnable(se),
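
A minimal sketch of the helper being proposed here, assuming it just
mirrors the se_runnable() shape above (hypothetical, not existing code):

	static inline int se_on_rq(struct sched_entity *se)
	{
		if (se->sched_delayed)
			return 0;

		return !!se->on_rq;
	}

The idea being that a delayed entity would then stop contributing to
load_sum the same way se_runnable() already stops it contributing to
runnable_sum.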
My hope was we can fix util_est independently since it drives CPU
frequency. Whereas PELT load_avg and runnable_avg are "only" used for
load balancing. But I agree, it has to be fixed as well.
> That also means that cfs_rq's h_nr_running is not accurate anymore
> because it includes delayed dequeue
+1
> and cfs_rq load_avg is kept artificially high which biases
> load_balance and cgroup's shares
+1
>> Exclude delayed tasks which are (a) migrating between rq's or (b) in a
>> SAVE/RESTORE dequeue/enqueue.
I just realized that this fixes the uneven util_est_dequeue/enqueue
calls so we don't see the underflow depicted by Hongyan and no massive
rq->cfs util_est due to missing ue_dequeues.
But delayed tasks are part of rq->cfs util_est, not excluded. Let me fix
that.
>> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
>> ---
>> kernel/sched/fair.c | 16 +++++++++-------
>> 1 file changed, 9 insertions(+), 7 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 1e693ca8ebd6..5c32cc26d6c2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6948,18 +6948,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> int rq_h_nr_running = rq->cfs.h_nr_running;
>> u64 slice = 0;
>>
>> - if (flags & ENQUEUE_DELAYED) {
>> - requeue_delayed_entity(se);
>> - return;
>> - }
>> -
>> /*
>> * The code below (indirectly) updates schedutil which looks at
>> * the cfs_rq utilization to select a frequency.
>> * Let's add the task's estimated utilization to the cfs_rq's
>> * estimated utilization, before we update schedutil.
>> */
>> - util_est_enqueue(&rq->cfs, p);
>> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>> + util_est_enqueue(&rq->cfs, p);
>> +
>> + if (flags & ENQUEUE_DELAYED) {
>> + requeue_delayed_entity(se);
>> + return;
>> + }
>>
>> /*
>> * If in_iowait is set, the code below may not trigger any cpufreq
>> @@ -7177,7 +7178,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>> */
>> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> {
>> - util_est_dequeue(&rq->cfs, p);
>> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
>> + util_est_dequeue(&rq->cfs, p);
>>
>> if (dequeue_entities(rq, &p->se, flags) < 0) {
>> if (!rq->cfs.h_nr_running)
>> --
>> 2.34.1
On 05/09/2024 16:07, Dietmar Eggemann wrote:
> On 05/09/2024 15:33, Vincent Guittot wrote:
>> On Thu, 5 Sept 2024 at 15:02, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>>
>>> On 29/08/2024 17:42, Hongyan Xia wrote:
>>>> On 22/08/2024 15:58, Vincent Guittot wrote:
>>>>> On Thu, 22 Aug 2024 at 14:10, Vincent Guittot
>>>>> <vincent.guittot@linaro.org> wrote:
>>>>>>
>>>>>> On Thu, 22 Aug 2024 at 14:08, Luis Machado <luis.machado@arm.com> wrote:
>>>>>>>
>>>>>>> Vincent,
>>>>>>>
>>>>>>> On 8/22/24 11:28, Luis Machado wrote:
>>>>>>>> On 8/22/24 10:53, Vincent Guittot wrote:
>>>>>>>>> On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com>
>>>>>>>>> wrote:
>>>>>>>>>>
>>>>>>>>>> On 8/22/24 09:19, Vincent Guittot wrote:
>>>>>>>>>>> Hi,
>>>>>>>>>>>
>>>>>>>>>>> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com>
[...]
> I just realized that this fixes the uneven util_est_dequeue/enqueue
> calls so we don't see the underflow depicted by Hongyan and no massive
> rq->cfs util_est due to missing ue_dequeues.
> But delayed tasks are part of rq->cfs util_est, not excluded. Let me fix
> that.
Looks like I got confused ... After checking again, it seems to be OK:

  dequeue_task_fair()

    if !(p is delayed && (migrating || DEQUEUE_SAVE))
      util_est_dequeue()

    if !entity_eligible(&p->se)
      se->sched_delayed = 1   -> p not contributing to rq->cfs.avg.util_est

  enqueue_task_fair()

    if !(p is delayed && (migrating || ENQUEUE_RESTORE))
      util_est_enqueue()

    if ENQUEUE_DELAYED
      requeue_delayed_entity()
        se->sched_delayed = 0 -> p contributing to rq->cfs.avg.util_est
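
As a sketch only (helper name made up here), the gating condition used in
both hunks of the patch quoted above could be factored into one predicate,
which makes the dequeue/enqueue pairing easier to see:

	/*
	 * Hypothetical helper, not in the patch: skip util_est accounting
	 * for a delayed task that is merely migrating or going through a
	 * SAVE/RESTORE cycle, so util_est_dequeue() and util_est_enqueue()
	 * stay balanced.
	 */
	static inline bool util_est_skip_delayed(struct task_struct *p, int save_restore)
	{
		return p->se.sched_delayed &&
		       (task_on_rq_migrating(p) || save_restore);
	}

dequeue_task_fair() would pass (flags & DEQUEUE_SAVE) and
enqueue_task_fair() (flags & ENQUEUE_RESTORE), matching the two
open-coded conditions.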
Luis M. did test this for power/perf with jetnews on Pix6 mainline 6.8
and the regression went away.
There is still occasional slight CPU frequency spiking on little CPUs.
Could be the influence of delayed tasks on runnable_avg, but we're not
sure yet.
[...]
On Thu, Sep 05, 2024 at 04:07:01PM +0200, Dietmar Eggemann wrote:
> > Unfortunately, this is not only about util_est
> >
> > cfs_rq's runnable_avg is also wrong because we normally have :
> > cfs_rq's runnable_avg == /Sum se's runnable_avg
> > but cfs_rq's runnable_avg uses cfs_rq's h_nr_running but delayed
> > entities are still accounted in h_nr_running
>
> Yes, I agree.
>
> se's runnable_avg should be fine already since:
>
>   se_runnable()
>
>     if (se->sched_delayed)
>       return false
>
> But then, like you said, __update_load_avg_cfs_rq() needs correct
> cfs_rq->h_nr_running.

Uff. So yes __update_load_avg_cfs_rq() needs a different number, but
I'll contest that h_nr_running is in fact correct, albeit no longer
suitable for this purpose.

We can track h_nr_delayed I suppose, and subtract that.

> And I guess we need something like:
>
>   se_on_rq()
>
>     if (se->sched_delayed)
>       return false
>
> for
>
>   __update_load_avg_se()
>
>   -	if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
>   +	if (___update_load_sum(now, &se->avg, se_on_rq(se), se_runnable(se),
>
> My hope was we can fix util_est independently since it drives CPU
> frequency. Whereas PELT load_avg and runnable_avg are "only" used for
> load balancing. But I agree, it has to be fixed as well.
>
> > That also means that cfs_rq's h_nr_running is not accurate anymore
> > because it includes delayed dequeue
>
> +1
>
> > and cfs_rq load_avg is kept artificially high which biases
> > load_balance and cgroup's shares
>
> +1

Again, fundamentally the delayed tasks are delayed because they need to
remain part of the competition in order to 'earn' time. It really is
fully on_rq, and should be for the purpose of load and load-balancing.

It is only special in that it will never run again (until it gets
woken).

Consider (2 CPUs, 4 tasks):

	CPU1		CPU2

	A		D
	B (delayed)
	C

Then migrating any one of the tasks on CPU1 to CPU2 will make them all
earn time at 1/2 instead of 1/3 vs 1/1. More fair etc.

Yes, I realize this might seem weird, but we're going to be getting a
ton more of this weirdness once proxy execution lands, then we'll be
having the entire block chain still on the runqueue (and actually
consuming time).
On Thu, Sep 05, 2024 at 04:53:54PM +0200, Peter Zijlstra wrote:
> > But then, like you said, __update_load_avg_cfs_rq() needs correct
> > cfs_rq->h_nr_running.
>
> Uff. So yes __update_load_avg_cfs_rq() needs a different number, but
> I'll contest that h_nr_running is in fact correct, albeit no longer
> suitable for this purpose.
>
> We can track h_nr_delayed I suppose, and subtract that.
Something like so?
---
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched/pelt.c | 2 +-
kernel/sched/sched.h | 7 +++++--
4 files changed, 51 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 01ce9a76164c..3d3c5be78075 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -829,6 +829,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
cfs_rq->idle_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11e890486c1b..629b46308960 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5456,9 +5456,31 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_delayed++;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_delayed--;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
}
@@ -5488,7 +5510,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->next == se)
cfs_rq->next = NULL;
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 1;
+ set_delayed(se);
return false;
}
}
@@ -5907,7 +5929,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock);
@@ -5940,6 +5962,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;
@@ -5963,6 +5986,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_delayed -= delayed_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -5985,6 +6009,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_delayed -= delayed_delta;
}
/* At this point se is NULL and we are at root level*/
@@ -6010,7 +6035,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
+ long task_delta, idle_task_delta, delayed_delta;
long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6046,6 +6071,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -6060,6 +6086,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_delayed += delayed_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6077,6 +6104,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_delayed += delayed_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6930,7 +6958,7 @@ requeue_delayed_entity(struct sched_entity *se)
}
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 0;
+ clear_delayed(se);
}
/*
@@ -6944,6 +6972,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
+ int h_nr_delayed = 0;
int task_new = !(flags & ENQUEUE_WAKEUP);
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
@@ -6953,6 +6982,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
return;
}
+ if (task_new)
+ h_nr_delayed = !!se->sched_delayed;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6991,6 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_delayed += h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
@@ -7014,6 +7047,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_delayed += h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
@@ -7076,6 +7110,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
struct task_struct *p = NULL;
int idle_h_nr_running = 0;
int h_nr_running = 0;
+ int h_nr_delayed = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
@@ -7083,6 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
p = task_of(se);
h_nr_running = 1;
idle_h_nr_running = task_has_idle_policy(p);
+ if (!task_sleep && !task_delayed)
+ h_nr_delayed = !!se->sched_delayed;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -7100,6 +7137,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
@@ -7138,6 +7176,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index fa52906a4478..21e3ff5eb77a 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running,
+ cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3744f16a1293..d91360b0cca1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -603,6 +603,7 @@ struct cfs_rq {
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ unsigned int h_nr_delayed;
s64 avg_vruntime;
u64 avg_load;
@@ -813,8 +814,10 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se))
- se->runnable_weight = se->my_q->h_nr_running;
+ if (!entity_is_task(se)) {
+ struct cfs_rq *cfs_rq = se->my_q;
+ se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+ }
}
static inline long se_runnable(struct sched_entity *se)
Peter,
On 9/6/24 11:45, Peter Zijlstra wrote:
> On Thu, Sep 05, 2024 at 04:53:54PM +0200, Peter Zijlstra wrote:
>
>>> But then, like you said, __update_load_avg_cfs_rq() needs correct
>>> cfs_rq->h_nr_running.
>>
>> Uff. So yes __update_load_avg_cfs_rq() needs a different number, but
>> I'll contest that h_nr_running is in fact correct, albeit no longer
>> suitable for this purpose.
>>
>> We can track h_nr_delayed I suppose, and subtract that.
>
> Something like so?
>
> ---
> kernel/sched/debug.c | 1 +
> kernel/sched/fair.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/pelt.c | 2 +-
> kernel/sched/sched.h | 7 +++++--
> 4 files changed, 51 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 01ce9a76164c..3d3c5be78075 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -829,6 +829,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
> SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
> SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
> + SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
> SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
> cfs_rq->idle_nr_running);
> SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 11e890486c1b..629b46308960 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5456,9 +5456,31 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
>
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>
> -static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
> +static void set_delayed(struct sched_entity *se)
> +{
> + se->sched_delayed = 1;
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_delayed++;
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +}
> +
> +static void clear_delayed(struct sched_entity *se)
> {
> se->sched_delayed = 0;
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_delayed--;
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +}
> +
> +static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
> +{
> + clear_delayed(se);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
> }
> @@ -5488,7 +5510,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> update_load_avg(cfs_rq, se, 0);
> - se->sched_delayed = 1;
> + set_delayed(se);
> return false;
> }
> }
> @@ -5907,7 +5929,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> - long task_delta, idle_task_delta, dequeue = 1;
> + long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
> long rq_h_nr_running = rq->cfs.h_nr_running;
>
> raw_spin_lock(&cfs_b->lock);
> @@ -5940,6 +5962,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>
> task_delta = cfs_rq->h_nr_running;
> idle_task_delta = cfs_rq->idle_h_nr_running;
> + delayed_delta = cfs_rq->h_nr_delayed;
> for_each_sched_entity(se) {
> struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> int flags;
> @@ -5963,6 +5986,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running -= task_delta;
> qcfs_rq->idle_h_nr_running -= idle_task_delta;
> + qcfs_rq->h_nr_delayed -= delayed_delta;
>
> if (qcfs_rq->load.weight) {
> /* Avoid re-evaluating load for this entity: */
> @@ -5985,6 +6009,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running -= task_delta;
> qcfs_rq->idle_h_nr_running -= idle_task_delta;
> + qcfs_rq->h_nr_delayed -= delayed_delta;
> }
>
> /* At this point se is NULL and we are at root level*/
> @@ -6010,7 +6035,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> - long task_delta, idle_task_delta;
> + long task_delta, idle_task_delta, delayed_delta;
> long rq_h_nr_running = rq->cfs.h_nr_running;
>
> se = cfs_rq->tg->se[cpu_of(rq)];
> @@ -6046,6 +6071,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> task_delta = cfs_rq->h_nr_running;
> idle_task_delta = cfs_rq->idle_h_nr_running;
> + delayed_delta = cfs_rq->h_nr_delayed;
> for_each_sched_entity(se) {
> struct cfs_rq *qcfs_rq = cfs_rq_of(se);
>
> @@ -6060,6 +6086,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running += task_delta;
> qcfs_rq->idle_h_nr_running += idle_task_delta;
> + qcfs_rq->h_nr_delayed += delayed_delta;
>
> /* end evaluation on encountering a throttled cfs_rq */
> if (cfs_rq_throttled(qcfs_rq))
> @@ -6077,6 +6104,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running += task_delta;
> qcfs_rq->idle_h_nr_running += idle_task_delta;
> + qcfs_rq->h_nr_delayed += delayed_delta;
>
> /* end evaluation on encountering a throttled cfs_rq */
> if (cfs_rq_throttled(qcfs_rq))
> @@ -6930,7 +6958,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> update_load_avg(cfs_rq, se, 0);
> - se->sched_delayed = 0;
> + clear_delayed(se);
> }
>
> /*
> @@ -6944,6 +6972,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> struct cfs_rq *cfs_rq;
> struct sched_entity *se = &p->se;
> int idle_h_nr_running = task_has_idle_policy(p);
> + int h_nr_delayed = 0;
> int task_new = !(flags & ENQUEUE_WAKEUP);
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
> @@ -6953,6 +6982,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> return;
> }
>
> + if (task_new)
> + h_nr_delayed = !!se->sched_delayed;
> +
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> @@ -6991,6 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> cfs_rq->h_nr_running++;
> cfs_rq->idle_h_nr_running += idle_h_nr_running;
> + cfs_rq->h_nr_delayed += h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = 1;
> @@ -7014,6 +7047,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> cfs_rq->h_nr_running++;
> cfs_rq->idle_h_nr_running += idle_h_nr_running;
> + cfs_rq->h_nr_delayed += h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = 1;
> @@ -7076,6 +7110,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> struct task_struct *p = NULL;
> int idle_h_nr_running = 0;
> int h_nr_running = 0;
> + int h_nr_delayed = 0;
> struct cfs_rq *cfs_rq;
> u64 slice = 0;
>
> @@ -7083,6 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> p = task_of(se);
> h_nr_running = 1;
> idle_h_nr_running = task_has_idle_policy(p);
> + if (!task_sleep && !task_delayed)
> + h_nr_delayed = !!se->sched_delayed;
> } else {
> cfs_rq = group_cfs_rq(se);
> slice = cfs_rq_min_slice(cfs_rq);
> @@ -7100,6 +7137,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>
> cfs_rq->h_nr_running -= h_nr_running;
> cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> + cfs_rq->h_nr_delayed -= h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = h_nr_running;
> @@ -7138,6 +7176,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>
> cfs_rq->h_nr_running -= h_nr_running;
> cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> + cfs_rq->h_nr_delayed -= h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = h_nr_running;
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index fa52906a4478..21e3ff5eb77a 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
> {
> if (___update_load_sum(now, &cfs_rq->avg,
> scale_load_down(cfs_rq->load.weight),
> - cfs_rq->h_nr_running,
> + cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
> cfs_rq->curr != NULL)) {
>
> ___update_load_avg(&cfs_rq->avg, 1);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3744f16a1293..d91360b0cca1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -603,6 +603,7 @@ struct cfs_rq {
> unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
> unsigned int idle_nr_running; /* SCHED_IDLE */
> unsigned int idle_h_nr_running; /* SCHED_IDLE */
> + unsigned int h_nr_delayed;
>
> s64 avg_vruntime;
> u64 avg_load;
> @@ -813,8 +814,10 @@ struct dl_rq {
>
> static inline void se_update_runnable(struct sched_entity *se)
> {
> - if (!entity_is_task(se))
> - se->runnable_weight = se->my_q->h_nr_running;
> + if (!entity_is_task(se)) {
> + struct cfs_rq *cfs_rq = se->my_q;
> + se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
> + }
> }
>
> static inline long se_runnable(struct sched_entity *se)
I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.
First I'd like to confirm that Dietmar's fix that was pushed to tip:sched/core (Fix util_est
accounting for DELAY_DEQUEUE) helps bring the frequencies and power use down to more sensible levels.
As for the above changes, unfortunately I'm seeing high frequencies and high power usage again. The
pattern looks similar to what we observed with the uclamp inc/dec imbalance.
I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
something will ring bells.
On Tue, Sep 10, 2024 at 12:04:11PM +0100, Luis Machado wrote:
> I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.
>
> First I'd like to confirm that Dietmar's fix that was pushed to tip:sched/core (Fix util_est
> accounting for DELAY_DEQUEUE) helps bring the frequencies and power use down to more sensible levels.
>
> As for the above changes, unfortunately I'm seeing high frequencies and high power usage again. The
> pattern looks similar to what we observed with the uclamp inc/dec imbalance.

:-(

> I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
> something will ring bells.

So first thing to do is trace h_nr_delayed I suppose, in my own
(limited) testing that was mostly [0,1] correctly correlating to there
being a delayed task on the runqueue.

I'm assuming that removing the usage sites restores function?
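
A purely illustrative way to do that tracing, assuming the
set_delayed()/clear_delayed() helpers from the patch above: drop a
trace_printk() next to the counter update and watch the trace buffer
while reproducing.

	static void clear_delayed(struct sched_entity *se)
	{
		se->sched_delayed = 0;
		for_each_sched_entity(se) {
			struct cfs_rq *cfs_rq = cfs_rq_of(se);

			cfs_rq->h_nr_delayed--;
			/* print as signed so an underflow is immediately visible */
			trace_printk("clear_delayed: cpu=%d h_nr_delayed=%d\n",
				     cpu_of(rq_of(cfs_rq)), (int)cfs_rq->h_nr_delayed);
			if (cfs_rq_throttled(cfs_rq))
				break;
		}
	}

The same line in set_delayed() gives the matching increments.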
On 9/10/24 15:05, Peter Zijlstra wrote:
> On Tue, Sep 10, 2024 at 12:04:11PM +0100, Luis Machado wrote:
>> I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.
>>
>> First I'd like to confirm that Dietmar's fix that was pushed to tip:sched/core (Fix util_est
>> accounting for DELAY_DEQUEUE) helps bring the frequencies and power use down to more sensible levels.
>>
>> As for the above changes, unfortunately I'm seeing high frequencies and high power usage again. The
>> pattern looks similar to what we observed with the uclamp inc/dec imbalance.
>
> :-(
>
>> I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
>> something will ring bells.
>
> So first thing to do is trace h_nr_delayed I suppose, in my own
> (limited) testing that was mostly [0,1] correctly correlating to there
> being a delayed task on the runqueue.
>
> I'm assuming that removing the usage sites restores function?

It does restore function if we remove the usage.

From an initial look:

cat /sys/kernel/debug/sched/debug | grep -i delay
 .h_nr_delayed : -4
 .h_nr_delayed : -6
 .h_nr_delayed : -1
 .h_nr_delayed : -6
 .h_nr_delayed : -1
 .h_nr_delayed : -1
 .h_nr_delayed : -5
 .h_nr_delayed : -6

So probably an unexpected decrement or lack of an increment somewhere.
On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
> On 9/10/24 15:05, Peter Zijlstra wrote:
> > On Tue, Sep 10, 2024 at 12:04:11PM +0100, Luis Machado wrote:
> >> I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.
> >>
> >> First I'd like to confirm that Dietmar's fix that was pushed to tip:sched/core (Fix util_est
> >> accounting for DELAY_DEQUEUE) helps bring the frequencies and power use down to more sensible levels.
> >>
> >> As for the above changes, unfortunately I'm seeing high frequencies and high power usage again. The
> >> pattern looks similar to what we observed with the uclamp inc/dec imbalance.
> >
> > :-(
> >
> >> I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
> >> something will ring bells.
> >
> > So first thing to do is trace h_nr_delayed I suppose, in my own
> > (limited) testing that was mostly [0,1] correctly correlating to there
> > being a delayed task on the runqueue.
> >
> > I'm assuming that removing the usage sites restores function?
>
> It does restore function if we remove the usage.
>
> From an initial look:
>
> cat /sys/kernel/debug/sched/debug | grep -i delay
>  .h_nr_delayed : -4
>  .h_nr_delayed : -6
>  .h_nr_delayed : -1
>  .h_nr_delayed : -6
>  .h_nr_delayed : -1
>  .h_nr_delayed : -1
>  .h_nr_delayed : -5
>  .h_nr_delayed : -6
>
> So probably an unexpected decrement or lack of an increment somewhere.

Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
out. I'll see if I can reproduce that.
On 9/11/24 09:45, Peter Zijlstra wrote:
> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
>> On 9/10/24 15:05, Peter Zijlstra wrote:
>>> On Tue, Sep 10, 2024 at 12:04:11PM +0100, Luis Machado wrote:
>>>> I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.
>>>>
>>>> First I'd like to confirm that Dietmar's fix that was pushed to tip:sched/core (Fix util_est
>>>> accounting for DELAY_DEQUEUE) helps bring the frequencies and power use down to more sensible levels.
>>>>
>>>> As for the above changes, unfortunately I'm seeing high frequencies and high power usage again. The
>>>> pattern looks similar to what we observed with the uclamp inc/dec imbalance.
>>>
>>> :-(
>>>
>>>> I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
>>>> something will ring bells.
>>>
>>> So first thing to do is trace h_nr_delayed I suppose, in my own
>>> (limited) testing that was mostly [0,1] correctly correlating to there
>>> being a delayed task on the runqueue.
>>>
>>> I'm assuming that removing the usage sites restores function?
>>
>> It does restore function if we remove the usage.
>>
>> From an initial look:
>>
>> cat /sys/kernel/debug/sched/debug | grep -i delay
>>  .h_nr_delayed : -4
>>  .h_nr_delayed : -6
>>  .h_nr_delayed : -1
>>  .h_nr_delayed : -6
>>  .h_nr_delayed : -1
>>  .h_nr_delayed : -1
>>  .h_nr_delayed : -5
>>  .h_nr_delayed : -6
>>
>> So probably an unexpected decrement or lack of an increment somewhere.
>
> Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
> out. I'll see if I can reproduce that.

Before reverting it, let me run a few more checks first. Dietmar tells
me he sees sane values for h_nr_delayed on a juno system. I just want
to make sure I'm not hitting some oddness with the 6.8 kernel on the
pixel 6.
On Wed, 2024-09-11 at 10:45 +0200, Peter Zijlstra wrote:
> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
> > >
> > > I'm assuming that removing the usage sites restores function?
> >
> > It does restore function if we remove the usage.
> >
> > From an initial look:
> >
> > cat /sys/kernel/debug/sched/debug | grep -i delay
> >  .h_nr_delayed : -4
> >  .h_nr_delayed : -6
> >  .h_nr_delayed : -1
> >  .h_nr_delayed : -6
> >  .h_nr_delayed : -1
> >  .h_nr_delayed : -1
> >  .h_nr_delayed : -5
> >  .h_nr_delayed : -6
> >
> > So probably an unexpected decrement or lack of an increment somewhere.
>
> Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
> out. I'll see if I can reproduce that.

Hm, would be interesting to know how the heck he's triggering that.

My x86_64 box refuses to produce any such artifacts with anything I've
tossed at it, including full LTP with enterprise RT and !RT configs,
both in master and my local SLE15-SP7 branch. Hohum.

	-Mike
On 9/11/24 10:10, Mike Galbraith wrote:
> On Wed, 2024-09-11 at 10:45 +0200, Peter Zijlstra wrote:
>> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
>>>>
>>>> I'm assuming that removing the usage sites restores function?
>>>
>>> It does restore function if we remove the usage.
>>>
>>> From an initial look:
>>>
>>> cat /sys/kernel/debug/sched/debug | grep -i delay
>>> .h_nr_delayed : -4
>>> .h_nr_delayed : -6
>>> .h_nr_delayed : -1
>>> .h_nr_delayed : -6
>>> .h_nr_delayed : -1
>>> .h_nr_delayed : -1
>>> .h_nr_delayed : -5
>>> .h_nr_delayed : -6
>>>
>>> So probably an unexpected decrement or lack of an increment somewhere.
>>
>> Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
>> out. I'll see if I can reproduce that.
>
> Hm, would be interesting to know how the heck he's triggering that.
>
> My x86_64 box refuses to produce any such artifacts with anything I've
> tossed at it, including full LTP with enterprise RT and !RT configs,
> both in master and my local SLE15-SP7 branch. Hohum.
>
> -Mike
Ok, I seem to have narrowed this down to scheduler class switching. In particular
switched_from_fair.
Valentin's patch (75b6499024a6c1a4ef0288f280534a5c54269076
sched/fair: Properly deactivate sched_delayed task upon class change) introduced
finish_delayed_dequeue_entity, which takes care of cleaning up the state of delayed-dequeue
tasks during class change. Things work fine (minus delayed task accounting) up to this point.
When Peter introduced his patch to do h_nr_delayed accounting, we modified
finish_delayed_dequeue_entity to also call clear_delayed, instead of simply
zeroing se->sched_delayed.
The call to clear_delayed decrements the rq's h_nr_delayed, and it gets used elsewhere
to cleanup the state of delayed-dequeue tasks, in order to share some common code.
With that said, my testing on Android shows that when we hit switched_from_fair during
switching sched classes (due to some RT task), we're in a state where...
1 - We already called into dequeue_entities for this delayed task.
2 - We tested true for the !task_sleep && !task_delayed condition.
3 - se->sched_delayed is true, so h_nr_delayed == 1.
4 - We carry on executing the rest of dequeue_entities and decrement the rq's h_nr_running and h_nr_delayed by 1.
In switched_from_fair, after the above events, we call into finish_delayed_dequeue_entity -> clear_delayed
and do yet another decrement to the rq's h_nr_delayed, now potentially making it negative. As
a consequence, we probably misuse the negative value and adjust the frequencies incorrectly. I
think this is the issue I'm seeing.
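
Condensed, the suspected sequence looks like this (a sketch assembled from
the hunks quoted earlier in the thread and the backtrace later in the
thread, not literal code):

	__sched_setscheduler()                     /* e.g. fair -> RT via chrt */
	  dequeue_task_fair()
	    dequeue_entities()
	      if (!task_sleep && !task_delayed)
	        h_nr_delayed = !!se->sched_delayed; /* 1 for a delayed task */
	      ...
	      cfs_rq->h_nr_delayed -= h_nr_delayed; /* first decrement */
	  check_class_changed()
	    switched_from_fair()
	      finish_delayed_dequeue_entity()
	        clear_delayed()
	          cfs_rq->h_nr_delayed--;           /* second decrement -> negative */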
It is worth pointing out that even with the Android setup, things only go bad when there is enough
competition and switching of classes (lots of screen events etc).
My suggestion of a fix (below), still under testing, is to inline the delayed-dequeue and the lag zeroing
cleanup within switched_from_fair instead of calling finish_delayed_dequeue_entity. Or maybe
drop finish_delayed_dequeue_entity and inline its contents into its callers.
The rest of Peter's patch introducing h_nr_delayed seems OK as far as I could test.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f993ac282a83..f8df2f8d2e11 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13168,7 +13168,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* related to sched_delayed being true and that wasn't done
* due to the generic dequeue not using DEQUEUE_DELAYED.
*/
- finish_delayed_dequeue_entity(&p->se);
+ p->se.sched_delayed = 0;
+ if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
+ p->se.vlag = 0;
p->se.rel_deadline = 0;
__block_task(rq, p);
}
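
The same fix could also be written as a dedicated helper instead of
open-coding it in switched_from_fair() (sketch only, the name is invented
here): clear the flag and apply DELAY_ZERO without touching the
h_nr_delayed counters, which dequeue_entities() has already dropped on
this path.

	static inline void clear_delayed_nocount(struct sched_entity *se)
	{
		se->sched_delayed = 0;
		if (sched_feat(DELAY_ZERO) && se->vlag > 0)
			se->vlag = 0;
	}

switched_from_fair() would call this in place of
finish_delayed_dequeue_entity(), while the normal delayed-dequeue path
keeps the counting clear_delayed().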
On 12/09/2024 14:58, Luis Machado wrote:
> On 9/11/24 10:10, Mike Galbraith wrote:
>> On Wed, 2024-09-11 at 10:45 +0200, Peter Zijlstra wrote:
>>> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
[...]
> Ok, I seem to have narrowed this down to scheduler class switching. In particular
> switched_from_fair.
>
> Valentin's patch (75b6499024a6c1a4ef0288f280534a5c54269076
> sched/fair: Properly deactivate sched_delayed task upon class change) introduced
> finish_delayed_dequeue_entity, which takes care of cleaning up the state of delayed-dequeue
> tasks during class change. Things work fine (minus delayed task accounting) up to this point.
>
> When Peter introduced his patch to do h_nr_delayed accounting, we modified
> finish_delayed_dequeue_entity to also call clear_delayed, instead of simply
> zeroing se->sched_delayed.
>
> The call to clear_delayed decrements the rq's h_nr_delayed, and it gets used elsewhere
> to cleanup the state of delayed-dequeue tasks, in order to share some common code.
>
> With that said, my testing on Android shows that when we hit switched_from_fair during
> switching sched classes (due to some RT task), we're in a state where...
>
> 1 - We already called into dequeue_entities for this delayed task.
> 2 - We tested true for the !task_sleep && !task_delayed condition.
> 3 - se->sched_delayed is true, so h_nr_delayed == 1.
> 4 - We carry on executing the rest of dequeue_entities and decrement the rq's h_nr_running and h_nr_delayed by 1.
>
> In switched_from_fair, after the above events, we call into finish_delayed_dequeue_entity -> clear_delayed
> and do yet another decrement to the rq's h_nr_delayed, now potentially making it negative. As
> a consequence, we probably misuse the negative value and adjust the frequencies incorrectly. I
> think this is the issue I'm seeing.
>
> It is worth pointing out that even with the Android setup, things only go bad when there is enough
> competition and switching of classes (lots of screen events etc).
>
> My suggestion of a fix (below), still under testing, is to inline the delayed-dequeue and the lag zeroing
> cleanup within switched_from_fair instead of calling finish_delayed_dequeue_entity. Or maybe
> drop finish_delayed_dequeue_entity and inline its contents into its callers.
>
> The rest of Peter's patch introducing h_nr_delayed seems OK as far as I could test.
>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f993ac282a83..f8df2f8d2e11 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13168,7 +13168,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> * related to sched_delayed being true and that wasn't done
> * due to the generic dequeue not using DEQUEUE_DELAYED.
> */
> - finish_delayed_dequeue_entity(&p->se);
> + p->se.sched_delayed = 0;
> + if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
> + p->se.vlag = 0;
> p->se.rel_deadline = 0;
> __block_task(rq, p);
> }
I could recreate this on QEMU with:
@@ -5473,6 +5473,7 @@ static void clear_delayed(struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_delayed--;
+		BUG_ON((int)cfs_rq->h_nr_delayed < 0);
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 	}
running:
# while(true); do chrt -rr -p 50 $$; chrt -o -p 0 $$; done
in one shell and:
# hackbench
in another.
[ 318.490522] ------------[ cut here ]------------
[ 318.490969] kernel BUG at kernel/sched/fair.c:5476!
[ 318.491411] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[ 318.491964] CPU: 3 UID: 0 PID: 68053 Comm: chrt Not tainted 6.11.0-rc1-00066-g2e05f6c71d36-dirty #23
[ 318.492604] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.4
[ 318.494192] RIP: 0010:switched_from_fair+0x67/0xe0
[ 318.494899] Code: ff ff c6 85 d1 00 00 00 00 48 85 db 75 0e eb 1c 48 8b 9b 98 00 00 00 48 85 db 74 10 48 8b 0
[ 318.496491] RSP: 0018:ffffb1154bc63e20 EFLAGS: 00010097
[ 318.496991] RAX: 0000000000000001 RBX: ffff92668844e200 RCX: ffff9266fddadea8
[ 318.497681] RDX: ffff926684213e00 RSI: ffff9266fddadea8 RDI: ffff92668844e608
[ 318.498339] RBP: ffff92668844e180 R08: ffff92668106c44c R09: ffff92668119c0b8
[ 318.498940] R10: ffff926681f42000 R11: ffffffffb1e155d0 R12: ffff9266fddad640
[ 318.499525] R13: ffffb1154bc63ed8 R14: 0000000000000078 R15: ffff9266fddad640
[ 318.500234] FS: 00007f68f52bf740(0000) GS:ffff9266fdd80000(0000) knlGS:0000000000000000
[ 318.500837] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 318.501261] CR2: 00007f68f53afdd0 CR3: 0000000006a58002 CR4: 0000000000370ef0
[ 318.501798] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 318.502385] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 318.502919] Call Trace:
[ 318.503118] <TASK>
[ 318.503284] ? die+0x32/0x90
[ 318.503508] ? do_trap+0xd8/0x100
[ 318.503770] ? switched_from_fair+0x67/0xe0
[ 318.504085] ? do_error_trap+0x60/0x80
[ 318.504374] ? switched_from_fair+0x67/0xe0
[ 318.504652] ? exc_invalid_op+0x53/0x70
[ 318.504995] ? switched_from_fair+0x67/0xe0
[ 318.505270] ? asm_exc_invalid_op+0x1a/0x20
[ 318.505588] ? switched_from_fair+0x67/0xe0
[ 318.505929] check_class_changed+0x2a/0x80
[ 318.506236] __sched_setscheduler+0x1f3/0x920
[ 318.506526] do_sched_setscheduler+0xfd/0x1c0
[ 318.506867] ? do_sys_openat2+0x7c/0xc0
[ 318.507141] __x64_sys_sched_setscheduler+0x1a/0x30
[ 318.507462] do_syscall_64+0x9e/0x1a0
[ 318.507722] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 318.508127] RIP: 0033:0x7f68f53c3719
[ 318.508369] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 8
[ 318.509941] RSP: 002b:00007fff8d920578 EFLAGS: 00000246 ORIG_RAX: 0000000000000090
[ 318.511178] RAX: ffffffffffffffda RBX: 00007fff8d920610 RCX: 00007f68f53c3719
[ 318.512105] RDX: 00007fff8d92058c RSI: 0000000000000002 RDI: 0000000000000103
[ 318.512676] RBP: 0000000000000103 R08: 0000000000000000 R09: 0000000000000000
[ 318.513296] R10: 1999999999999999 R11: 0000000000000246 R12: 0000000000000002
[ 318.513917] R13: 0000000000000002 R14: 0000000000000032 R15: 0000000000000103
[ 318.514617] </TASK>
[ 318.514861] Modules linked in:
[ 318.515132] ---[ end trace 0000000000000000 ]---
[ 318.515466] RIP: 0010:switched_from_fair+0x67/0xe0
[ 318.515942] Code: ff ff c6 85 d1 00 00 00 00 48 85 db 75 0e eb 1c 48 8b 9b 98 00 00 00 48 85 db 74 10 48 8b 0
[ 318.517411] RSP: 0018:ffffb1154bc63e20 EFLAGS: 00010097
[ 318.517819] RAX: 0000000000000001 RBX: ffff92668844e200 RCX: ffff9266fddadea8
[ 318.518354] RDX: ffff926684213e00 RSI: ffff9266fddadea8 RDI: ffff92668844e608
[ 318.518946] RBP: ffff92668844e180 R08: ffff92668106c44c R09: ffff92668119c0b8
[ 318.519527] R10: ffff926681f42000 R11: ffffffffb1e155d0 R12: ffff9266fddad640
[ 318.520138] R13: ffffb1154bc63ed8 R14: 0000000000000078 R15: ffff9266fddad640
[ 318.520684] FS: 00007f68f52bf740(0000) GS:ffff9266fdd80000(0000) knlGS:0000000000000000
[ 318.521350] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 318.521745] CR2: 00007f68f53afdd0 CR3: 0000000006a58002 CR4: 0000000000370ef0
[ 318.522306] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 318.522896] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 318.523388] note: chrt[68053] exited with irqs disabled
With your proposed fix the issue goes away.
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
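
For longer soak runs the same check can be made non-fatal; a one-line
variation on the diff above (sketch), using the SCHED_WARN_ON() helper
seen elsewhere in this thread, so the box stays up and the first
underflow still leaves a backtrace:

-		BUG_ON((int)cfs_rq->h_nr_delayed < 0);
+		SCHED_WARN_ON((int)cfs_rq->h_nr_delayed < 0);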
On 9/11/24 10:10, Mike Galbraith wrote:
> On Wed, 2024-09-11 at 10:45 +0200, Peter Zijlstra wrote:
>> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
>>>>
>>>> I'm assuming that removing the usage sites restores function?
>>>
>>> It does restore function if we remove the usage.
>>>
>>> From an initial look:
>>>
>>> cat /sys/kernel/debug/sched/debug | grep -i delay
>>> .h_nr_delayed : -4
>>> .h_nr_delayed : -6
>>> .h_nr_delayed : -1
>>> .h_nr_delayed : -6
>>> .h_nr_delayed : -1
>>> .h_nr_delayed : -1
>>> .h_nr_delayed : -5
>>> .h_nr_delayed : -6
>>>
>>> So probably an unexpected decrement or lack of an increment somewhere.
>>
>> Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
>> out. I'll see if I can reproduce that.
>
> Hm, would be interesting to know how the heck he's triggering that.
>
> My x86_64 box refuses to produce any such artifacts with anything I've
> tossed at it, including full LTP with enterprise RT and !RT configs,
> both in master and my local SLE15-SP7 branch. Hohum.
>
> -Mike
From what I can tell, the decrement that makes h_nr_delayed go negative is in
the dequeue_entities path.
First:

	if (!task_sleep && !task_delayed)
		h_nr_delayed = !!se->sched_delayed;

h_nr_delayed is 1 here.

Then we decrement cfs_rq->h_nr_delayed below:

	cfs_rq->h_nr_running -= h_nr_running;
	cfs_rq->idle_h_nr_running -= idle_h_nr_running;
	cfs_rq->h_nr_delayed -= h_nr_delayed;
On Wed, Sep 11, 2024 at 11:10:26AM +0200, Mike Galbraith wrote:
> On Wed, 2024-09-11 at 10:45 +0200, Peter Zijlstra wrote:
> > On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
> > > >
> > > > I'm assuming that removing the usage sites restores function?
> > >
> > > It does restore function if we remove the usage.
> > >
> > > From an initial look:
> > >
> > > cat /sys/kernel/debug/sched/debug | grep -i delay
> > >  .h_nr_delayed : -4
> > >  .h_nr_delayed : -6
> > >  .h_nr_delayed : -1
> > >  .h_nr_delayed : -6
> > >  .h_nr_delayed : -1
> > >  .h_nr_delayed : -1
> > >  .h_nr_delayed : -5
> > >  .h_nr_delayed : -6
> > >
> > > So probably an unexpected decrement or lack of an increment somewhere.
> >
> > Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
> > out. I'll see if I can reproduce that.
>
> Hm, would be interesting to know how the heck he's triggering that.
>
> My x86_64 box refuses to produce any such artifacts with anything I've
> tossed at it, including full LTP with enterprise RT and !RT configs,
> both in master and my local SLE15-SP7 branch. Hohum.

Yeah, my hackbench runs also didn't show that. Perhaps something funny
with cgroups. I didn't test cgroup bandwidth for example.
On 11/09/2024 11:13, Peter Zijlstra wrote:
> On Wed, Sep 11, 2024 at 11:10:26AM +0200, Mike Galbraith wrote:
>> On Wed, 2024-09-11 at 10:45 +0200, Peter Zijlstra wrote:
>>> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:

[...]

>>>> So probably an unexpected decrement or lack of an increment somewhere.
>>>
>>> Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
>>> out. I'll see if I can reproduce that.
>>
>> Hm, would be interesting to know how the heck he's triggering that.
>>
>> My x86_64 box refuses to produce any such artifacts with anything I've
>> tossed at it, including full LTP with enterprise RT and !RT configs,
>> both in master and my local SLE15-SP7 branch. Hohum.
>
> Yeah, my hackbench runs also didn't show that. Perhaps something funny
> with cgroups. I didn't test cgroup bandwidth for example.

Don't see it either on my Arm64 Juno-r0 (6 CPUs) with:

  cgexec -g cpu:/A/B/C hackbench -l 1000

We are checking the Pixel6 now.
On Wed, 2024-09-11 at 11:13 +0200, Peter Zijlstra wrote:
> On Wed, Sep 11, 2024 at 11:10:26AM +0200, Mike Galbraith wrote:
> >
> > Hm, would be interesting to know how the heck he's triggering that.
> >
> > My x86_64 box refuses to produce any such artifacts with anything I've
> > tossed at it, including full LTP with enterprise RT and !RT configs,
> > both in master and my local SLE15-SP7 branch. Hohum.
>
> Yeah, my hackbench runs also didn't show that. Perhaps something funny
> with cgroups. I didn't test cgroup bandwidth for example.

That's all on in enterprise configs tested with LTP, so hypothetically
got some testing. I also turned on AUTOGROUP in !RT configs so cgroups
would get some exercise no matter what I'm mucking about with.

	-Mike
On Wed, 2024-09-11 at 11:27 +0200, Mike Galbraith wrote:
> On Wed, 2024-09-11 at 11:13 +0200, Peter Zijlstra wrote:
> > On Wed, Sep 11, 2024 at 11:10:26AM +0200, Mike Galbraith wrote:
> > >
> > > Hm, would be interesting to know how the heck he's triggering that.
> > >
> > > My x86_64 box refuses to produce any such artifacts with anything I've
> > > tossed at it, including full LTP with enterprise RT and !RT configs,
> > > both in master and my local SLE15-SP7 branch. Hohum.
> >
> > Yeah, my hackbench runs also didn't show that. Perhaps something funny
> > with cgroups. I didn't test cgroup bandwidth for example.
>
> That's all on in enterprise configs tested with LTP, so hypothetically
> got some testing. I also turned on AUTOGROUP in !RT configs so cgroups
> would get some exercise no matter what I'm mucking about with.
Oho, I just hit a pick_eevdf() returns NULL in pick_next_entity() and
we deref it bug in tip that I recall having seen someone else mention
them having hit. LTP was chugging away doing lord knows what when
evolution apparently decided to check accounts, which didn't go well.
state=TASK_WAKING(?), on_rq=0, on_cpu=1, cfs_rq.nr_running=0
crash> bt -sx
PID: 29024 TASK: ffff9118b7583300 CPU: 1 COMMAND: "pool-evolution"
#0 [ffffa939dfd0f930] machine_kexec+0x1a0 at ffffffffab886cc0
#1 [ffffa939dfd0f990] __crash_kexec+0x6a at ffffffffab99496a
#2 [ffffa939dfd0fa50] crash_kexec+0x23 at ffffffffab994e33
#3 [ffffa939dfd0fa60] oops_end+0xbe at ffffffffab844b4e
#4 [ffffa939dfd0fa80] page_fault_oops+0x151 at ffffffffab898fc1
#5 [ffffa939dfd0fb08] exc_page_fault+0x6b at ffffffffac3a410b
#6 [ffffa939dfd0fb30] asm_exc_page_fault+0x22 at ffffffffac400ac2
[exception RIP: pick_task_fair+113]
RIP: ffffffffab8fb471 RSP: ffffa939dfd0fbe0 RFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff91180735ee00 RCX: 000b709eab0437d5
RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff91180735ee00
RBP: ffff91180735f400 R8: 00000000000001d9 R9: 0000000000000000
R10: ffff911a8ecb9380 R11: 0000000000000000 R12: ffff911a8eab89c0
R13: ffff911a8eab8a40 R14: ffffffffacafc373 R15: ffff9118b7583300
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffffa939dfd0fc08] pick_next_task_fair+0x48 at ffffffffab9013b8
#8 [ffffa939dfd0fc48] __schedule+0x1d9 at ffffffffac3aab39
#9 [ffffa939dfd0fcf8] schedule+0x24 at ffffffffac3ac084
#10 [ffffa939dfd0fd10] futex_wait_queue+0x63 at ffffffffab98e353
#11 [ffffa939dfd0fd38] __futex_wait+0x139 at ffffffffab98e989
#12 [ffffa939dfd0fdf0] futex_wait+0x6a at ffffffffab98ea5a
#13 [ffffa939dfd0fe80] do_futex+0x88 at ffffffffab98a9f8
#14 [ffffa939dfd0fe90] __x64_sys_futex+0x5e at ffffffffab98ab0e
#15 [ffffa939dfd0ff00] do_syscall_64+0x74 at ffffffffac39ce44
#16 [ffffa939dfd0ff40] entry_SYSCALL_64_after_hwframe+0x4b at ffffffffac4000ac
RIP: 00007fd6b991a849 RSP: 00007fd6813ff6e8 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 0000000000000a6c RCX: 00007fd6b991a849
RDX: 0000000000000a6c RSI: 0000000000000080 RDI: 00005631abf620c0
RBP: 00005631abf620b8 R8: 00007fd6bad0a080 R9: 00000000000015fe
R10: 00007fd6813ff700 R11: 0000000000000246 R12: 00005631abf620b0
R13: 00005631abf620b0 R14: 00005631abf620b8 R15: 0000000000000000
ORIG_RAX: 00000000000000ca CS: 0033 SS: 002b
crash> dis pick_task_fair+113
0xffffffffab8fb471 <pick_task_fair+113>: cmpb $0x0,0x51(%rax)
crash> gdb list *pick_task_fair+113
0xffffffffab8fb471 is in pick_task_fair (kernel/sched/fair.c:5639).
5634 SCHED_WARN_ON(cfs_rq->next->sched_delayed);
5635 return cfs_rq->next;
5636 }
5637
5638 struct sched_entity *se = pick_eevdf(cfs_rq);
5639 if (se->sched_delayed) {
5640 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5641 SCHED_WARN_ON(se->sched_delayed);
5642 SCHED_WARN_ON(se->on_rq);
5643 return NULL;
crash> task_struct -x 0xffff9118b7583300 | grep "__state ="
__state = 0x200,
crash> task_struct -x 0xffff9118b7583300 | grep rq
on_rq = 0x0,
on_rq = 0x0,
cfs_rq = 0xffff9117e81a3e00,
on_rq = 0x0,
rq = 0x0,
crash> task_struct -xo | grep sched_entity
[0x80] struct sched_entity se
crash> sched_entity 0xffff9118b7583380
struct sched_entity {
load = {
weight = 1048576,
inv_weight = 4194304
},
run_node = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
deadline = 5788784166,
min_vruntime = 5785784166,
min_slice = 3000000,
group_node = {
next = 0xffff9118b75833c0,
prev = 0xffff9118b75833c0
},
on_rq = 0 '\000',
sched_delayed = 0 '\000',
rel_deadline = 0 '\000',
custom_slice = 0 '\000',
exec_start = 5630407844294,
sum_exec_runtime = 5031478,
prev_sum_exec_runtime = 5004139,
vruntime = 5785811505,
vlag = 0,
slice = 3000000,
nr_migrations = 0,
depth = 1,
parent = 0xffff9117e81a0600,
cfs_rq = 0xffff9117e81a3e00,
my_q = 0x0,
runnable_weight = 0,
avg = {
last_update_time = 5630386353152,
load_sum = 2555,
runnable_sum = 2617274,
util_sum = 83342,
period_contrib = 877,
load_avg = 39,
runnable_avg = 39,
util_avg = 1,
util_est = 2147483760
}
}
crash> cfs_rq 0xffff9117e81a3e00
struct cfs_rq {
load = {
weight = 0,
inv_weight = 0
},
nr_running = 0,
h_nr_running = 0,
idle_nr_running = 0,
idle_h_nr_running = 0,
h_nr_delayed = 0,
avg_vruntime = 0,
avg_load = 0,
min_vruntime = 5785811505,
forceidle_seq = 0,
min_vruntime_fi = 0,
tasks_timeline = {
rb_root = {
rb_node = 0x0
},
rb_leftmost = 0x0
},
curr = 0xffff9118b7583380,
next = 0x0,
avg = {
last_update_time = 5630386353152,
load_sum = 2617381,
runnable_sum = 2617379,
util_sum = 83417,
period_contrib = 877,
load_avg = 39,
runnable_avg = 39,
util_avg = 1,
util_est = 0
},
removed = {
lock = {
raw_lock = {
{
val = {
counter = 0
},
{
locked = 0 '\000',
pending = 0 '\000'
},
{
locked_pending = 0,
tail = 0
}
}
}
},
nr = 0,
load_avg = 0,
util_avg = 0,
runnable_avg = 0
},
last_update_tg_load_avg = 5630407057919,
tg_load_avg_contrib = 39,
propagate = 0,
prop_runnable_sum = 0,
h_load = 0,
last_h_load_update = 4296299815,
h_load_next = 0x0,
rq = 0xffff911a8eab89c0,
on_list = 1,
leaf_cfs_rq_list = {
next = 0xffff911794a2d348,
prev = 0xffff9119ebe62148
},
tg = 0xffff91178434a080,
idle = 0,
runtime_enabled = 0,
runtime_remaining = 0,
throttled_pelt_idle = 0,
throttled_clock = 0,
throttled_clock_pelt = 0,
throttled_clock_pelt_time = 0,
throttled_clock_self = 0,
throttled_clock_self_time = 0,
throttled = 0,
throttle_count = 0,
throttled_list = {
next = 0xffff9117e81a3fa8,
prev = 0xffff9117e81a3fa8
},
throttled_csd_list = {
next = 0xffff9117e81a3fb8,
prev = 0xffff9117e81a3fb8
}
}
crash>
On Thu, 2024-09-12 at 16:00 +0200, Mike Galbraith wrote:
>
> Oho, I just hit a pick_eevdf() returns NULL in pick_next_entity() and
> we deref it bug in tip that I recall having seen someone else mention
> them having hit. LTP was chugging away doing lord knows what when
> evolution apparently decided to check accounts, which didn't go well.

BTW, what LTP was up to was cfs_bandwidth01. I reproduced the crash
using it on master with the full patch set pretty quickly, but rebased
tip 4c293d0fa315 (sans sched/eevdf: More PELT vs DELAYED_DEQUEUE) seems
to be stable.

	-Mike
On Fri, 2024-09-13 at 18:39 +0200, Mike Galbraith wrote:
> tip 4c293d0fa315 (sans sched/eevdf: More PELT vs DELAYED_DEQUEUE) seems
> to be stable.

Belay that, it went boom immediately this morning while trying to
trigger a warning I met elsewhere.

	-Mike
On 9/14/24 04:40, Mike Galbraith wrote:
> On Fri, 2024-09-13 at 18:39 +0200, Mike Galbraith wrote:
>> tip 4c293d0fa315 (sans sched/eevdf: More PELT vs DELAYED_DEQUEUE) seems
>> to be stable.
>
> Belay that, it went boom immediately this morning while trying to
> trigger a warning I met elsewhere.
>
> -Mike

Are you still hitting this one?

I was trying to reproduce this one on the Pixel 6 (manifests as
a boot crash) but couldn't so far. Something might've changed a bit.
On Tue, 2024-09-24 at 16:16 +0100, Luis Machado wrote:
> On 9/14/24 04:40, Mike Galbraith wrote:
> > On Fri, 2024-09-13 at 18:39 +0200, Mike Galbraith wrote:
> > > tip 4c293d0fa315 (sans sched/eevdf: More PELT vs DELAYED_DEQUEUE)
> > > seems
> > > to be stable.
> >
> > Belay that, it went boom immediately this morning while trying to
> > trigger a warning I met elsewhere.
> >
> > -Mike
>
> Are you still hitting this one?
>
> I was trying to reproduce this one on the Pixel 6 (manifests as
> a boot crash) but couldn't so far. Something might've changed a bit.

I'm also having no luck triggering it.

-Mike
On Tue, 2024-09-24 at 19:35 +0200, Mike Galbraith wrote:
> On Tue, 2024-09-24 at 16:16 +0100, Luis Machado wrote:
> > On 9/14/24 04:40, Mike Galbraith wrote:
> > > On Fri, 2024-09-13 at 18:39 +0200, Mike Galbraith wrote:
> > > > tip 4c293d0fa315 (sans sched/eevdf: More PELT vs DELAYED_DEQUEUE)
> > > > seems
> > > > to be stable.
> > >
> > > Belay that, it went boom immediately this morning while trying to
> > > trigger a warning I met elsewhere.
> > >
> > > -Mike
> >
> > Are you still hitting this one?
> >
> > I was trying to reproduce this one on the Pixel 6 (manifests as
> > a boot crash) but couldn't so far. Something might've changed a bit.
>
> I'm also having no luck triggering it.

BTW, that includes hefty RT/!RT config beating sessions w/wo..

sched/eevdf: More PELT vs DELAYED_DEQUEUE

..and w/wo your mod to same, with dequeue on unthrottle patchlet to
keep sched_delayed encounter there from interrupting the hunt.

-Mike
On 9/11/24 09:45, Peter Zijlstra wrote:
> On Wed, Sep 11, 2024 at 09:35:16AM +0100, Luis Machado wrote:
>> On 9/10/24 15:05, Peter Zijlstra wrote:
>>> On Tue, Sep 10, 2024 at 12:04:11PM +0100, Luis Machado wrote:
>>>> I gave the above patch a try on our Android workload running on the Pixel 6 with a 6.8-based kernel.
>>>>
>>>> First I'd like to confirm that Dietmar's fix that was pushed to tip:sched/core (Fix util_est
>>>> accounting for DELAY_DEQUEUE) helps bring the frequencies and power use down to more sensible levels.
>>>>
>>>> As for the above changes, unfortunately I'm seeing high frequencies and high power usage again. The
>>>> pattern looks similar to what we observed with the uclamp inc/dec imbalance.
>>>
>>> :-(
>>>
>>>> I haven't investigated this in depth yet, but I'll go stare at some traces and the code, and hopefully
>>>> something will ring bells.
>>>
>>> So first thing to do is trace h_nr_delayed I suppose, in my own
>>> (limited) testing that was mostly [0,1] correctly correlating to there
>>> being a delayed task on the runqueue.
>>>
>>> I'm assuming that removing the usage sites restores function?
>>
>> It does restore function if we remove the usage.
>>
>> From an initial look:
>>
>> cat /sys/kernel/debug/sched/debug | grep -i delay
>> .h_nr_delayed : -4
>> .h_nr_delayed : -6
>> .h_nr_delayed : -1
>> .h_nr_delayed : -6
>> .h_nr_delayed : -1
>> .h_nr_delayed : -1
>> .h_nr_delayed : -5
>> .h_nr_delayed : -6
>>
>> So probably an unexpected decrement or lack of an increment somewhere.
>
> Yeah, that's buggered. Ok, I'll go rebase sched/core and take this patch
> out. I'll see if I can reproduce that.

I'll keep looking on my end as well. I'm trying to capture the first
time it goes bad. For some reason my SCHED_WARN_ON didn't trigger when
it should've.
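For reference, a minimal sketch of the kind of check being discussed here: warn at the decrement site in the clear_delayed() helper from the h_nr_delayed patch, so the first unbalanced clear fires immediately instead of surfacing later as a negative counter in debugfs. This is illustrative only, not one of the posted patches.

/* Hypothetical debug aid: catch an h_nr_delayed underflow where it happens. */
static void clear_delayed(struct sched_entity *se)
{
	se->sched_delayed = 0;
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		/* About to go negative: a clear without a matching set. */
		SCHED_WARN_ON(!cfs_rq->h_nr_delayed);
		cfs_rq->h_nr_delayed--;
		if (cfs_rq_throttled(cfs_rq))
			break;
	}
}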
On Fri, 2024-09-06 at 12:45 +0200, Peter Zijlstra wrote:
> On Thu, Sep 05, 2024 at 04:53:54PM +0200, Peter Zijlstra wrote:
>
> > > But then, like you said, __update_load_avg_cfs_rq() needs correct
> > > cfs_rq->h_nr_running.
> >
> > Uff. So yes __update_load_avg_cfs_rq() needs a different number,
> > but
> > I'll contest that h_nr_running is in fact correct, albeit no longer
> > suitable for this purpose.
> >
> > We can track h_nr_delayed I suppose, and subtract that.
>
> Something like so?
With these two added to the series plus your prototype below, watching
sched_debug as box builds kernels and whatnot.. is about as stimulating
as watching paint peel <thumbs up emoji>
sched-fair-Properly-deactivate-sched_delayed-task-upon-class-change.patch
sched-fair-Fix-util_est-accounting-for-DELAY_DEQUEUE.patch
>
> ---
> kernel/sched/debug.c | 1 +
> kernel/sched/fair.c | 49
> ++++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/pelt.c | 2 +-
> kernel/sched/sched.h | 7 +++++--
> 4 files changed, 51 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 01ce9a76164c..3d3c5be78075 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -829,6 +829,7 @@ void print_cfs_rq(struct seq_file *m, int cpu,
> struct cfs_rq *cfs_rq)
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
> SPLIT_NS(spread));
> SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq-
> >nr_running);
> SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq-
> >h_nr_running);
> + SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq-
> >h_nr_delayed);
> SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
> cfs_rq->idle_nr_running);
> SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 11e890486c1b..629b46308960 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5456,9 +5456,31 @@ static void clear_buddies(struct cfs_rq
> *cfs_rq, struct sched_entity *se)
>
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq
> *cfs_rq);
>
> -static inline void finish_delayed_dequeue_entity(struct sched_entity
> *se)
> +static void set_delayed(struct sched_entity *se)
> +{
> + se->sched_delayed = 1;
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_delayed++;
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +}
> +
> +static void clear_delayed(struct sched_entity *se)
> {
> se->sched_delayed = 0;
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_delayed--;
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +}
> +
> +static inline void finish_delayed_dequeue_entity(struct sched_entity
> *se)
> +{
> + clear_delayed(se);
> if (sched_feat(DELAY_ZERO) && se->vlag > 0)
> se->vlag = 0;
> }
> @@ -5488,7 +5510,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
> if (cfs_rq->next == se)
> cfs_rq->next = NULL;
> update_load_avg(cfs_rq, se, 0);
> - se->sched_delayed = 1;
> + set_delayed(se);
> return false;
> }
> }
> @@ -5907,7 +5929,7 @@ static bool throttle_cfs_rq(struct cfs_rq
> *cfs_rq)
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> - long task_delta, idle_task_delta, dequeue = 1;
> + long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
> long rq_h_nr_running = rq->cfs.h_nr_running;
>
> raw_spin_lock(&cfs_b->lock);
> @@ -5940,6 +5962,7 @@ static bool throttle_cfs_rq(struct cfs_rq
> *cfs_rq)
>
> task_delta = cfs_rq->h_nr_running;
> idle_task_delta = cfs_rq->idle_h_nr_running;
> + delayed_delta = cfs_rq->h_nr_delayed;
> for_each_sched_entity(se) {
> struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> int flags;
> @@ -5963,6 +5986,7 @@ static bool throttle_cfs_rq(struct cfs_rq
> *cfs_rq)
>
> qcfs_rq->h_nr_running -= task_delta;
> qcfs_rq->idle_h_nr_running -= idle_task_delta;
> + qcfs_rq->h_nr_delayed -= delayed_delta;
>
> if (qcfs_rq->load.weight) {
> /* Avoid re-evaluating load for this entity:
> */
> @@ -5985,6 +6009,7 @@ static bool throttle_cfs_rq(struct cfs_rq
> *cfs_rq)
>
> qcfs_rq->h_nr_running -= task_delta;
> qcfs_rq->idle_h_nr_running -= idle_task_delta;
> + qcfs_rq->h_nr_delayed -= delayed_delta;
> }
>
> /* At this point se is NULL and we are at root level*/
> @@ -6010,7 +6035,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> - long task_delta, idle_task_delta;
> + long task_delta, idle_task_delta, delayed_delta;
> long rq_h_nr_running = rq->cfs.h_nr_running;
>
> se = cfs_rq->tg->se[cpu_of(rq)];
> @@ -6046,6 +6071,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> task_delta = cfs_rq->h_nr_running;
> idle_task_delta = cfs_rq->idle_h_nr_running;
> + delayed_delta = cfs_rq->h_nr_delayed;
> for_each_sched_entity(se) {
> struct cfs_rq *qcfs_rq = cfs_rq_of(se);
>
> @@ -6060,6 +6086,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running += task_delta;
> qcfs_rq->idle_h_nr_running += idle_task_delta;
> + qcfs_rq->h_nr_delayed += delayed_delta;
>
> /* end evaluation on encountering a throttled cfs_rq
> */
> if (cfs_rq_throttled(qcfs_rq))
> @@ -6077,6 +6104,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> qcfs_rq->h_nr_running += task_delta;
> qcfs_rq->idle_h_nr_running += idle_task_delta;
> + qcfs_rq->h_nr_delayed += delayed_delta;
>
> /* end evaluation on encountering a throttled cfs_rq
> */
> if (cfs_rq_throttled(qcfs_rq))
> @@ -6930,7 +6958,7 @@ requeue_delayed_entity(struct sched_entity *se)
> }
>
> update_load_avg(cfs_rq, se, 0);
> - se->sched_delayed = 0;
> + clear_delayed(se);
> }
>
> /*
> @@ -6944,6 +6972,7 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> struct cfs_rq *cfs_rq;
> struct sched_entity *se = &p->se;
> int idle_h_nr_running = task_has_idle_policy(p);
> + int h_nr_delayed = 0;
> int task_new = !(flags & ENQUEUE_WAKEUP);
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
> @@ -6953,6 +6982,9 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> return;
> }
>
> + if (task_new)
> + h_nr_delayed = !!se->sched_delayed;
> +
> /*
> * The code below (indirectly) updates schedutil which looks
> at
> * the cfs_rq utilization to select a frequency.
> @@ -6991,6 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
>
> cfs_rq->h_nr_running++;
> cfs_rq->idle_h_nr_running += idle_h_nr_running;
> + cfs_rq->h_nr_delayed += h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = 1;
> @@ -7014,6 +7047,7 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
>
> cfs_rq->h_nr_running++;
> cfs_rq->idle_h_nr_running += idle_h_nr_running;
> + cfs_rq->h_nr_delayed += h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = 1;
> @@ -7076,6 +7110,7 @@ static int dequeue_entities(struct rq *rq,
> struct sched_entity *se, int flags)
> struct task_struct *p = NULL;
> int idle_h_nr_running = 0;
> int h_nr_running = 0;
> + int h_nr_delayed = 0;
> struct cfs_rq *cfs_rq;
> u64 slice = 0;
>
> @@ -7083,6 +7118,8 @@ static int dequeue_entities(struct rq *rq,
> struct sched_entity *se, int flags)
> p = task_of(se);
> h_nr_running = 1;
> idle_h_nr_running = task_has_idle_policy(p);
> + if (!task_sleep && !task_delayed)
> + h_nr_delayed = !!se->sched_delayed;
> } else {
> cfs_rq = group_cfs_rq(se);
> slice = cfs_rq_min_slice(cfs_rq);
> @@ -7100,6 +7137,7 @@ static int dequeue_entities(struct rq *rq,
> struct sched_entity *se, int flags)
>
> cfs_rq->h_nr_running -= h_nr_running;
> cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> + cfs_rq->h_nr_delayed -= h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = h_nr_running;
> @@ -7138,6 +7176,7 @@ static int dequeue_entities(struct rq *rq,
> struct sched_entity *se, int flags)
>
> cfs_rq->h_nr_running -= h_nr_running;
> cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> + cfs_rq->h_nr_delayed -= h_nr_delayed;
>
> if (cfs_rq_is_idle(cfs_rq))
> idle_h_nr_running = h_nr_running;
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index fa52906a4478..21e3ff5eb77a 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct
> cfs_rq *cfs_rq)
> {
> if (___update_load_sum(now, &cfs_rq->avg,
> scale_load_down(cfs_rq->load.weight),
> - cfs_rq->h_nr_running,
> + cfs_rq->h_nr_running - cfs_rq-
> >h_nr_delayed,
> cfs_rq->curr != NULL)) {
>
> ___update_load_avg(&cfs_rq->avg, 1);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3744f16a1293..d91360b0cca1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -603,6 +603,7 @@ struct cfs_rq {
> unsigned int h_nr_running; /*
> SCHED_{NORMAL,BATCH,IDLE} */
> unsigned int idle_nr_running; /* SCHED_IDLE */
> unsigned int idle_h_nr_running; /* SCHED_IDLE */
> + unsigned int h_nr_delayed;
>
> s64 avg_vruntime;
> u64 avg_load;
> @@ -813,8 +814,10 @@ struct dl_rq {
>
> static inline void se_update_runnable(struct sched_entity *se)
> {
> - if (!entity_is_task(se))
> - se->runnable_weight = se->my_q->h_nr_running;
> + if (!entity_is_task(se)) {
> + struct cfs_rq *cfs_rq = se->my_q;
> + se->runnable_weight = cfs_rq->h_nr_running - cfs_rq-
> >h_nr_delayed;
> + }
> }
>
> static inline long se_runnable(struct sched_entity *se)
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00
sched/eevdf: More PELT vs DELAYED_DEQUEUE
Vincent and Dietmar noted that while commit fc1892becd56 fixes the
entity runnable stats, it does not adjust the cfs_rq runnable stats,
which are based off of h_nr_running.
Track h_nr_delayed such that we can discount those and adjust the
signal.
Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net
---
kernel/sched/debug.c | 1 +-
kernel/sched/fair.c | 49 ++++++++++++++++++++++++++++++++++++++-----
kernel/sched/pelt.c | 2 +-
kernel/sched/sched.h | 7 ++++--
4 files changed, 51 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de1dc52..35974ac 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -844,6 +844,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
cfs_rq->idle_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 922d690..0bc5e62 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5456,9 +5456,31 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_delayed++;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_delayed--;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
}
@@ -5488,7 +5510,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->next == se)
cfs_rq->next = NULL;
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 1;
+ set_delayed(se);
return false;
}
}
@@ -5907,7 +5929,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock);
@@ -5940,6 +5962,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;
@@ -5963,6 +5986,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_delayed -= delayed_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -5985,6 +6009,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_delayed -= delayed_delta;
}
/* At this point se is NULL and we are at root level*/
@@ -6010,7 +6035,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
+ long task_delta, idle_task_delta, delayed_delta;
long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6046,6 +6071,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -6060,6 +6086,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_delayed += delayed_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6077,6 +6104,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_delayed += delayed_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6930,7 +6958,7 @@ requeue_delayed_entity(struct sched_entity *se)
}
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 0;
+ clear_delayed(se);
}
/*
@@ -6944,6 +6972,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
+ int h_nr_delayed = 0;
int task_new = !(flags & ENQUEUE_WAKEUP);
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
@@ -6970,6 +6999,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+ if (task_new)
+ h_nr_delayed = !!se->sched_delayed;
+
for_each_sched_entity(se) {
if (se->on_rq) {
if (se->sched_delayed)
@@ -6992,6 +7024,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_delayed += h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
@@ -7015,6 +7048,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_delayed += h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
@@ -7077,6 +7111,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
struct task_struct *p = NULL;
int idle_h_nr_running = 0;
int h_nr_running = 0;
+ int h_nr_delayed = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
@@ -7084,6 +7119,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
p = task_of(se);
h_nr_running = 1;
idle_h_nr_running = task_has_idle_policy(p);
+ if (!task_sleep && !task_delayed)
+ h_nr_delayed = !!se->sched_delayed;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -7101,6 +7138,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
@@ -7139,6 +7177,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index fa52906..21e3ff5 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running,
+ cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3744f16..d91360b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -603,6 +603,7 @@ struct cfs_rq {
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ unsigned int h_nr_delayed;
s64 avg_vruntime;
u64 avg_load;
@@ -813,8 +814,10 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se))
- se->runnable_weight = se->my_q->h_nr_running;
+ if (!entity_is_task(se)) {
+ struct cfs_rq *cfs_rq = se->my_q;
+ se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+ }
}
static inline long se_runnable(struct sched_entity *se)
Hello Peter,
On 9/10/2024 1:39 PM, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
> Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
> Author: Peter Zijlstra <peterz@infradead.org>
> AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00
>
> sched/eevdf: More PELT vs DELAYED_DEQUEUE
>
> Vincent and Dietmar noted that while commit fc1892becd56 fixes the
> entity runnable stats, it does not adjust the cfs_rq runnable stats,
> which are based off of h_nr_running.
>
> Track h_nr_delayed such that we can discount those and adjust the
> signal.
>
> Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
> Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net
I've been testing this fix for a while now to see if it helps the
regressions reported around EEVDF complete. The issue with negative
"h_nr_delayed" reported by Luis previously seem to have been fixed as a
result of commit 75b6499024a6 ("sched/fair: Properly deactivate
sched_delayed task upon class change")
I've been running stress-ng for a while and haven't seen any cases of
negative "h_nr_delayed". I'd also added the following WARN_ON() to see
if there are any delayed tasks on the cfs_rq before switching to idle in
some of my previous experiments and I did not see any splat during my
benchmark runs.
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584..c19a31fa46c9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -457,6 +457,9 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
+ /* All delayed tasks must be picked off before switching to idle */
+ SCHED_WARN_ON(rq->cfs.h_nr_delayed);
+
update_idle_core(rq);
scx_update_idle(rq, true);
schedstat_inc(rq->sched_goidle);
--
If you are including this back, feel free to add:
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> [..snip..]
--
Thanks and Regards,
Prateek
Hi,
On 11/27/24 04:17, K Prateek Nayak wrote:
> Hello Peter,
>
> On 9/10/2024 1:39 PM, tip-bot2 for Peter Zijlstra wrote:
>> The following commit has been merged into the sched/core branch of tip:
>>
>> Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
>> Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
>> Author: Peter Zijlstra <peterz@infradead.org>
>> AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00
>> Committer: Peter Zijlstra <peterz@infradead.org>
>> CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00
>>
>> sched/eevdf: More PELT vs DELAYED_DEQUEUE
>>
>> Vincent and Dietmar noted that while commit fc1892becd56 fixes the
>> entity runnable stats, it does not adjust the cfs_rq runnable stats,
>> which are based off of h_nr_running.
>>
>> Track h_nr_delayed such that we can discount those and adjust the
>> signal.
>>
>> Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
>> Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
>> Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net
>
> I've been testing this fix for a while now to see if it helps the
> regressions reported around EEVDF complete. The issue with negative
> "h_nr_delayed" reported by Luis previously seem to have been fixed as a
> result of commit 75b6499024a6 ("sched/fair: Properly deactivate
> sched_delayed task upon class change")
I recall having 75b6499024a6 in my testing tree and somehow still noticing
unbalanced accounting for h_nr_delayed, where it would be decremented
twice eventually, leading to negative numbers.
I might have to give it another go if we're considering including the change
as-is, just to make sure. Since our setups are slightly different, we could
be exercising some slightly different paths.
Did this patch help with the regressions you noticed though?
>
> I've been running stress-ng for a while and haven't seen any cases of
> negative "h_nr_delayed". I'd also added the following WARN_ON() to see
> if there are any delayed tasks on the cfs_rq before switching to idle in
> some of my previous experiments and I did not see any splat during my
> benchmark runs.
>
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 621696269584..c19a31fa46c9 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -457,6 +457,9 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t
>
> static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
> {
> + /* All delayed tasks must be picked off before switching to idle */
> + SCHED_WARN_ON(rq->cfs.h_nr_delayed);
> +
> update_idle_core(rq);
> scx_update_idle(rq, true);
> schedstat_inc(rq->sched_goidle);
> --
>
> If you are including this back, feel free to add:
>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>
>> [..snip..]
>
(+ Saravana, Samuel)
Hello Luis,
On 11/27/2024 3:04 PM, Luis Machado wrote:
> Hi,
>
> On 11/27/24 04:17, K Prateek Nayak wrote:
>> Hello Peter,
>>
>> On 9/10/2024 1:39 PM, tip-bot2 for Peter Zijlstra wrote:
>>> The following commit has been merged into the sched/core branch of tip:
>>>
>>> Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
>>> Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
>>> Author: Peter Zijlstra <peterz@infradead.org>
>>> AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00
>>> Committer: Peter Zijlstra <peterz@infradead.org>
>>> CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00
>>>
>>> sched/eevdf: More PELT vs DELAYED_DEQUEUE
>>>
>>> Vincent and Dietmar noted that while commit fc1892becd56 fixes the
>>> entity runnable stats, it does not adjust the cfs_rq runnable stats,
>>> which are based off of h_nr_running.
>>>
>>> Track h_nr_delayed such that we can discount those and adjust the
>>> signal.
>>>
>>> Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
>>> Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
>>> Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>>> Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net
>>
>> I've been testing this fix for a while now to see if it helps the
>> regressions reported around EEVDF complete. The issue with negative
>> "h_nr_delayed" reported by Luis previously seem to have been fixed as a
>> result of commit 75b6499024a6 ("sched/fair: Properly deactivate
>> sched_delayed task upon class change")
>
> I recall having 75b6499024a6 in my testing tree and somehow still noticing
> unbalanced accounting for h_nr_delayed, where it would be decremented
> twice eventually, leading to negative numbers.
>
> I might have to give it another go if we're considering including the change
> as-is, just to make sure. Since our setups are slightly different, we could
> be exercising some slightly different paths.
That would be great! Thank you :)
Now that I see, you did have Valentine's patches in your tree during
testing
https://lore.kernel.org/lkml/6df12fde-1e0d-445f-8f8a-736d11f9ee41@arm.com/
Perhaps it could be the fixup commit 98442f0ccd82 ("sched: Fix
delayed_dequeue vs switched_from_fair()") or the fact that my benchmark
didn't stress this path enough to break you as you mentioned. I would
have still expected it to hit that SCHED_WARN_ON() I had added in
set_next_task_idle() if something went sideways.
>
> Did this patch help with the regressions you noticed though?
I believe it was Saravana who was seeing anomalies in PELT ramp-up with
DELAY_DEQUEUE. My test setup is currently not equipped to catch it but
Saravana was interested in these fixes being backported to v6.12 LTS in
https://lore.kernel.org/lkml/CAGETcx_1pZCtWiBbDmUcxEw3abF5dr=XdFCkH9zXWK75g7457w@mail.gmail.com/
I believe tracking h_nr_delayed and disregarding delayed tasks in
certain places is a necessary fix. None of the benchmarks in my test
setup seem to mind running without it but I'm also doing most of my
testing with the performance governor, and the PELT anomalies seem to matter
more from a PM perspective than from a load balancing perspective.
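For context on that PM angle: schedutil turns rq utilization into a frequency request with roughly a 25% headroom (see map_util_freq()/get_next_freq()), so any stale contribution a delayed task leaves in the signals raises the requested OPP directly, independent of load balancing. A small stand-alone sketch of that arithmetic, with made-up capacity and frequency numbers:

#include <stdio.h>

/* User-space model of schedutil's mapping: freq ~= 1.25 * f_max * util / cap.
 * The values below are illustrative, not from any particular SoC. */
static unsigned long next_freq(unsigned long util, unsigned long max_freq,
			       unsigned long cap)
{
	return (max_freq + (max_freq >> 2)) * util / cap;
}

int main(void)
{
	unsigned long cap = 1024, fmax = 2000000;	/* kHz */

	printf("util 100 -> %lu kHz\n", next_freq(100, fmax, cap));
	/* a stale delayed-task contribution inflating the request: */
	printf("util 400 -> %lu kHz\n", next_freq(400, fmax, cap));
	return 0;
}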
>
>>
>> I've been running stress-ng for a while and haven't seen any cases of
>> negative "h_nr_delayed". I'd also added the following WARN_ON() to see
>> if there are any delayed tasks on the cfs_rq before switching to idle in
>> some of my previous experiments and I did not see any splat during my
>> benchmark runs.
>>
>> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
>> index 621696269584..c19a31fa46c9 100644
>> --- a/kernel/sched/idle.c
>> +++ b/kernel/sched/idle.c
>> @@ -457,6 +457,9 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t
>>
>> static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
>> {
>> + /* All delayed tasks must be picked off before switching to idle */
>> + SCHED_WARN_ON(rq->cfs.h_nr_delayed);
>> +
>> update_idle_core(rq);
>> scx_update_idle(rq, true);
>> schedstat_inc(rq->sched_goidle);
>> --
>>
>> If you are including this back, feel free to add:
>>
>> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>>
>>> [..snip..]
>>
>
--
Thanks and Regards,
Prateek
On Thu, 5 Sept 2024 at 16:54, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Sep 05, 2024 at 04:07:01PM +0200, Dietmar Eggemann wrote:
>
> > > Unfortunately, this is not only about util_est
> > >
> > > cfs_rq's runnable_avg is also wrong because we normally have :
> > > cfs_rq's runnable_avg == /Sum se's runnable_avg
> > > but cfs_rq's runnable_avg uses cfs_rq's h_nr_running but delayed
> > > entities are still accounted in h_nr_running
> >
> > Yes, I agree.
> >
> > se's runnable_avg should be fine already since:
> >
> > se_runnable()
> >
> > if (se->sched_delayed)
> > return false
> >
> > But then, like you said, __update_load_avg_cfs_rq() needs correct
> > cfs_rq->h_nr_running.
>
> Uff. So yes __update_load_avg_cfs_rq() needs a different number, but
> I'll contest that h_nr_running is in fact correct, albeit no longer
> suitable for this purpose.

AFAICT, delayed dequeue tasks are there only to consume their negative
lag but don't want to run in any case. So I keep thinking that they
should not be counted in h_nr_running nor in runnable or load. They
only want to be kept in the rb tree of the cfs to consume this negative
lag, and they want to keep their weight in the cfs_rq->avg_load, which
has nothing to do with the PELT load, to keep a fair slope for the
vruntime.

>
> We can track h_nr_delayed I suppose, and subtract that.
>
> > And I guess we need something like:
> >
> > se_on_rq()
> >
> > if (se->sched_delayed)
> > return false
> >
> > for
> >
> > __update_load_avg_se()
> >
> > - if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
> > + if (___update_load_sum(now, &se->avg, se_on_rq(se), se_runnable(se),
> >
> >
> > My hope was we can fix util_est independently since it drives CPU
> > frequency. Whereas PELT load_avg and runnable_avg are "only" used for
> > load balancing. But I agree, it has to be fixed as well.
> >
> > > That also means that cfs_rq's h_nr_running is not accurate anymore
> > > because it includes delayed dequeue
> >
> > +1
> >
> > > and cfs_rq load_avg is kept artificially high which biases
> > > load_balance and cgroup's shares
> >
> > +1
>
> Again, fundamentally the delayed tasks are delayed because they need to
> remain part of the competition in order to 'earn' time. It really is
> fully on_rq, and should be for the purpose of load and load-balancing.

They don't compete with others; they wait for their lag to become
positive, which is completely different and biases the whole system.

>
> It is only special in that it will never run again (until it gets
> woken).
>
> Consider (2 CPUs, 4 tasks):
>
> CPU1 CPU2
> A D
> B (delayed)
> C
>
> Then migrating any one of the tasks on CPU1 to CPU2 will make them all
> earn time at 1/2 instead of 1/3 vs 1/1. More fair etc.

But the one that is "enqueued" with the delayed task will have twice as
much time, and balancing the delayed task doesn't help to balance the
system because it doesn't run. Also, the delayed task can make a cpu
look overloaded where it is not. All this is unfair.

>
> Yes, I realize this might seem weird, but we're going to be getting a
> ton more of this weirdness once proxy execution lands, then we'll be
> having the entire block chain still on the runqueue (and actually
> consuming time).
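The "negative lag" being consumed here is, roughly, the gap between the entity's vruntime and the queue's average vruntime (cf. update_entity_lag() and entity_eligible(); the kernel version load-weights the average). A minimal model of the DELAY_DEQUEUE decision as described in the thread, purely illustrative:

#include <stdbool.h>

struct entity {
	long long vruntime;
	bool sched_delayed;
};

/* On a sleep-dequeue, an entity that ran ahead of its fair share
 * (negative lag) is kept enqueued so it can pay that time back; it is
 * only dequeued for real once it is eligible again. */
static bool delay_dequeue(struct entity *se, long long avg_vruntime)
{
	long long lag = avg_vruntime - se->vruntime;

	if (lag < 0) {
		se->sched_delayed = true;
		return true;	/* dequeue 'fails', entity stays on the queue */
	}
	return false;		/* eligible: dequeue for real */
}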
On Thu, 5 Sept 2024 at 16:07, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 05/09/2024 15:33, Vincent Guittot wrote:
> > On Thu, 5 Sept 2024 at 15:02, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> >>
> >> On 29/08/2024 17:42, Hongyan Xia wrote:
> >>> On 22/08/2024 15:58, Vincent Guittot wrote:
> >>>> On Thu, 22 Aug 2024 at 14:10, Vincent Guittot
> >>>> <vincent.guittot@linaro.org> wrote:
> >>>>>
> >>>>> On Thu, 22 Aug 2024 at 14:08, Luis Machado <luis.machado@arm.com> wrote:
> >>>>>>
> >>>>>> Vincent,
> >>>>>>
> >>>>>> On 8/22/24 11:28, Luis Machado wrote:
> >>>>>>> On 8/22/24 10:53, Vincent Guittot wrote:
> >>>>>>>> On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com>
> >>>>>>>> wrote:
> >>>>>>>>>
> >>>>>>>>> On 8/22/24 09:19, Vincent Guittot wrote:
> >>>>>>>>>> Hi,
> >>>>>>>>>>
> >>>>>>>>>> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com>
>
> [...]
>
> >>> After staring at the code even more, I think the situation is worse.
> >>>
> >>> First thing is that uclamp might also want to be part of these stats
> >>> (h_nr_running, util_est, runnable_avg, load_avg) that do not follow
> >>> delayed dequeue which needs to be specially handled in the same way. The
> >>> current way of handling uclamp in core.c misses the frequency update,
> >>> like I commented before.
> >>>
> >>> Second, there is also an out-of-sync issue in update_load_avg(). We only
> >>> update the task-level se in delayed dequeue and requeue, but we return
> >>> early and the upper levels are completely skipped, as if the delayed
> >>> task is still on rq. This de-sync is wrong.
> >>
> >> I had a look at the util_est issue.
> >>
> >> This keeps rq->cfs.avg.util_avg sane for me with
> >> SCHED_FEAT(DELAY_DEQUEUE, true):
> >>
> >> -->8--
> >>
> >> From 0d7e8d057f49a47e0f3f484ac7d41e047dccec38 Mon Sep 17 00:00:00 2001
> >> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
> >> Date: Thu, 5 Sep 2024 00:05:23 +0200
> >> Subject: [PATCH] kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
> >>
> >> Remove delayed tasks from util_est even they are runnable.
> >
> > Unfortunately, this is not only about util_est
> >
> > cfs_rq's runnable_avg is also wrong because we normally have :
> > cfs_rq's runnable_avg == /Sum se's runnable_avg
> > but cfs_rq's runnable_avg uses cfs_rq's h_nr_running but delayed
> > entities are still accounted in h_nr_running
>
> Yes, I agree.
>
> se's runnable_avg should be fine already since:
>
> se_runnable()
>
> if (se->sched_delayed)
> return false
>
> But then, like you said, __update_load_avg_cfs_rq() needs correct
> cfs_rq->h_nr_running.
>
> And I guess we need something like:
>
> se_on_rq()
>
> if (se->sched_delayed)
> return false
>
> for
>
> __update_load_avg_se()
>
> - if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
> + if (___update_load_sum(now, &se->avg, se_on_rq(se), se_runnable(se),
>
>
> My hope was we can fix util_est independently since it drives CPU
> frequency. Whereas PELT load_avg and runnable_avg are "only" used for
> load balancing. But I agree, it has to be fixed as well.
runnable_avg is also used for frequency selection
>
> > That also means that cfs_rq's h_nr_running is not accurate anymore
> > because it includes delayed dequeue
>
> +1
>
> > and cfs_rq load_avg is kept artificially high which biases
> > load_balance and cgroup's shares
>
> +1
>
> >> Exclude delayed task which are (a) migrating between rq's or (b) in a
> >> SAVE/RESTORE dequeue/enqueue.
>
> I just realized that this fixes the uneven util_est_dequeue/enqueue
> calls so we don't see the underflow depicted by Hongyan and no massive
> rq->cfs util_est due to missing ue_dequeues.
> But delayed tasks are part of rq->cfs util_est, not excluded. Let me fix
> that.
>
> >> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> >> ---
> >> kernel/sched/fair.c | 16 +++++++++-------
> >> 1 file changed, 9 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 1e693ca8ebd6..5c32cc26d6c2 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -6948,18 +6948,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >> int rq_h_nr_running = rq->cfs.h_nr_running;
> >> u64 slice = 0;
> >>
> >> - if (flags & ENQUEUE_DELAYED) {
> >> - requeue_delayed_entity(se);
> >> - return;
> >> - }
> >> -
> >> /*
> >> * The code below (indirectly) updates schedutil which looks at
> >> * the cfs_rq utilization to select a frequency.
> >> * Let's add the task's estimated utilization to the cfs_rq's
> >> * estimated utilization, before we update schedutil.
> >> */
> >> - util_est_enqueue(&rq->cfs, p);
> >> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> >> + util_est_enqueue(&rq->cfs, p);
> >> +
> >> + if (flags & ENQUEUE_DELAYED) {
> >> + requeue_delayed_entity(se);
> >> + return;
> >> + }
> >>
> >> /*
> >> * If in_iowait is set, the code below may not trigger any cpufreq
> >> @@ -7177,7 +7178,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >> */
> >> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >> {
> >> - util_est_dequeue(&rq->cfs, p);
> >> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> >> + util_est_dequeue(&rq->cfs, p);
> >>
> >> if (dequeue_entities(rq, &p->se, flags) < 0) {
> >> if (!rq->cfs.h_nr_running)
> >> --
> >> 2.34.1
>
On 05/09/2024 16:29, Vincent Guittot wrote:
> On Thu, 5 Sept 2024 at 16:07, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>
>> On 05/09/2024 15:33, Vincent Guittot wrote:
>>> On Thu, 5 Sept 2024 at 15:02, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>>>
>>>> On 29/08/2024 17:42, Hongyan Xia wrote:
>>>>> On 22/08/2024 15:58, Vincent Guittot wrote:
>>>>>> On Thu, 22 Aug 2024 at 14:10, Vincent Guittot
>>>>>> <vincent.guittot@linaro.org> wrote:
>>>>>>>
>>>>>>> On Thu, 22 Aug 2024 at 14:08, Luis Machado <luis.machado@arm.com> wrote:
>>>>>>>>
>>>>>>>> Vincent,
>>>>>>>>
>>>>>>>> On 8/22/24 11:28, Luis Machado wrote:
>>>>>>>>> On 8/22/24 10:53, Vincent Guittot wrote:
>>>>>>>>>> On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com>
>>>>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 8/22/24 09:19, Vincent Guittot wrote:
>>>>>>>>>>>> Hi,
>>>>>>>>>>>>
>>>>>>>>>>>> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com>

[...]

>> My hope was we can fix util_est independently since it drives CPU
>> frequency. Whereas PELT load_avg and runnable_avg are "only" used for
>> load balancing. But I agree, it has to be fixed as well.
>
> runnable_avg is also used for frequency selection

Ah, yes. So we would need proper cfs_rq->h_nr_running accounting as well.
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 729288bc68560b4d5b094cb7a6f794c752ef22a2
Gitweb: https://git.kernel.org/tip/729288bc68560b4d5b094cb7a6f794c752ef22a2
Author: Dietmar Eggemann <dietmar.eggemann@arm.com>
AuthorDate: Thu, 05 Sep 2024 00:05:23 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00
kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
Remove delayed tasks from util_est even if they are runnable.
Exclude delayed tasks which are (a) migrating between rq's or (b) in a
SAVE/RESTORE dequeue/enqueue.
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/c49ef5fe-a909-43f1-b02f-a765ab9cedbf@arm.com
---
kernel/sched/fair.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e946ca0..922d690 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6948,18 +6948,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
- if (flags & ENQUEUE_DELAYED) {
- requeue_delayed_entity(se);
- return;
- }
-
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -7177,7 +7178,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- util_est_dequeue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ util_est_dequeue(&rq->cfs, p);
if (dequeue_entities(rq, &p->se, flags) < 0) {
util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
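The condition gating util_est at both call sites above, written out as a stand-alone helper purely for readability (illustrative; the patch open-codes it, and util_est_skip() is not a real kernel function):

/* Skip the util_est contribution only for a delayed task that is (a)
 * migrating between runqueues or (b) in a SAVE/RESTORE dequeue/enqueue
 * cycle -- the two cases named in the changelog. */
static inline bool util_est_skip(struct task_struct *p, int flags, bool enqueue)
{
	if (!p->se.sched_delayed)
		return false;

	return task_on_rq_migrating(p) ||
	       (flags & (enqueue ? ENQUEUE_RESTORE : DEQUEUE_SAVE));
}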
On Thu, 22 Aug 2024 at 11:53, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> On Thu, 22 Aug 2024 at 11:22, Luis Machado <luis.machado@arm.com> wrote:
> >
> > On 8/22/24 09:19, Vincent Guittot wrote:
> > > Hi,
> > >
> > > On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com> wrote:
> > >>
> > >> Hi Peter,
> > >>
> > >> Sorry for bombarding this thread in the last couple of days. I'm seeing
> > >> several issues in the latest tip/sched/core after these patches landed.
> > >>
> > >> What I'm now seeing seems to be an unbalanced call of util_est. First, I applied
> > >
> > > I also see a remaining util_est for idle rq because of an unbalance
> > > call of util_est_enqueue|dequeue
> > >
> >
> > I can confirm issues with the utilization values and frequencies being driven
> > seemingly incorrectly, in particular for little cores.
> >
> > What I'm seeing with the stock series is high utilization values for some tasks
> > and little cores having their frequencies maxed out for extended periods of
> > time. Sometimes for 5+ or 10+ seconds, which is excessive as the cores are mostly
> > idle. But whenever certain tasks get scheduled there, they have a very high util
> > level and so the frequency is kept at max.
> >
> > As a consequence this drives up power usage.
> >
> > I gave Hongyan's draft fix a try and observed a much more reasonable behavior for
> > the util numbers and frequencies being used for the little cores. With his fix,
> > I can also see lower energy use for my specific benchmark.
>
> The main problem is that the util_est of a delayed dequeued task
> remains on the rq and keeps the rq utilization high and as a result
> the frequency higher than needed.
>
> The below seems to works for me and keep sync the enqueue/dequeue of
> uti_test with the enqueue/dequeue of the task as if de dequeue was not
> delayed
>
> Another interest is that we will not try to migrate a delayed dequeue
> sleeping task that doesn't actually impact the current load of the cpu
> and as a result will not help in the load balance. I haven't yet fully
> checked what would happen with hotplug
And there is the case of a delayed dequeue task that gets its affinity changed
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index fea057b311f6..0970bcdc889a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6944,11 +6944,6 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
>
> - if (flags & ENQUEUE_DELAYED) {
> - requeue_delayed_entity(se);
> - return;
> - }
> -
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> @@ -6957,6 +6952,11 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> + if (flags & ENQUEUE_DELAYED) {
> + requeue_delayed_entity(se);
> + return;
> + }
> +
> /*
> * If in_iowait is set, the code below may not trigger any cpufreq
> * utilization updates, so do it here explicitly with the IOWAIT flag
> @@ -9276,6 +9276,8 @@ int can_migrate_task(struct task_struct *p,
> struct lb_env *env)
>
> lockdep_assert_rq_held(env->src_rq);
>
> + if (p->se.sched_delayed)
> + return 0;
> /*
> * We do not migrate tasks that are:
> * 1) throttled_lb_pair, or
>
> >
> >
> > >> the following diff to warn against util_est != 0 when no tasks are on
> > >> the queue:
> > >>
> > >> https://lore.kernel.org/all/752ae417c02b9277ca3ec18893747c54dd5f277f.1724245193.git.hongyan.xia2@arm.com/
> > >>
> > >> Then, I'm reliably seeing warnings on my Juno board during boot in
> > >> latest tip/sched/core.
> > >>
> > >> If I do the same thing to util_est just like what you did in this uclamp
> > >> patch, like this:
> > >
> > > I think that the solution is simpler than your proposal and we just
> > > need to always call util_est_enqueue() before the
> > > requeue_delayed_entity
> > >
> > > @@ -6970,11 +6970,6 @@ enqueue_task_fair(struct rq *rq, struct
> > > task_struct *p, int flags)
> > > int rq_h_nr_running = rq->cfs.h_nr_running;
> > > u64 slice = 0;
> > >
> > > - if (flags & ENQUEUE_DELAYED) {
> > > - requeue_delayed_entity(se);
> > > - return;
> > > - }
> > > -
> > > /*
> > > * The code below (indirectly) updates schedutil which looks at
> > > * the cfs_rq utilization to select a frequency.
> > > @@ -6983,6 +6978,11 @@ enqueue_task_fair(struct rq *rq, struct
> > > task_struct *p, int flags)
> > > */
> > > util_est_enqueue(&rq->cfs, p);
> > >
> > > + if (flags & ENQUEUE_DELAYED) {
> > > + requeue_delayed_entity(se);
> > > + return;
> > > + }
> > > +
> > > /*
> > > * If in_iowait is set, the code below may not trigger any cpufreq
> > > * utilization updates, so do it here explicitly with the IOWAIT flag
> > >
> > >
> > >>
> > >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > >> index 574ef19df64b..58aac42c99e5 100644
> > >> --- a/kernel/sched/fair.c
> > >> +++ b/kernel/sched/fair.c
> > >> @@ -6946,7 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct
> > >> task_struct *p, int flags)
> > >>
> > >> if (flags & ENQUEUE_DELAYED) {
> > >> requeue_delayed_entity(se);
> > >> - return;
> > >> + goto util_est;
> > >> }
> > >>
> > >> /*
> > >> @@ -6955,7 +6955,6 @@ enqueue_task_fair(struct rq *rq, struct
> > >> task_struct *p, int flags)
> > >> * Let's add the task's estimated utilization to the cfs_rq's
> > >> * estimated utilization, before we update schedutil.
> > >> */
> > >> - util_est_enqueue(&rq->cfs, p);
> > >>
> > >> /*
> > >> * If in_iowait is set, the code below may not trigger any cpufreq
> > >> @@ -7050,6 +7049,9 @@ enqueue_task_fair(struct rq *rq, struct
> > >> task_struct *p, int flags)
> > >> assert_list_leaf_cfs_rq(rq);
> > >>
> > >> hrtick_update(rq);
> > >> +util_est:
> > >> + if (!p->se.sched_delayed)
> > >> + util_est_enqueue(&rq->cfs, p);
> > >> }
> > >>
> > >> static void set_next_buddy(struct sched_entity *se);
> > >> @@ -7173,7 +7175,8 @@ static int dequeue_entities(struct rq *rq, struct
> > >> sched_entity *se, int flags)
> > >> */
> > >> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p,
> > >> int flags)
> > >> {
> > >> - util_est_dequeue(&rq->cfs, p);
> > >> + if (!p->se.sched_delayed)
> > >> + util_est_dequeue(&rq->cfs, p);
> > >>
> > >> if (dequeue_entities(rq, &p->se, flags) < 0) {
> > >> if (!rq->cfs.h_nr_running)
> > >>
> > >> which is basically enqueuing util_est after enqueue_task_fair(),
> > >> dequeuing util_est before dequeue_task_fair() and double check
> > >> p->se.delayed_dequeue, then the unbalanced issue seems to go away.
> > >>
> > >> Hopefully this helps you in finding where the problem could be.
> > >>
> > >> Hongyan
> > >>
> > >> On 27/07/2024 11:27, Peter Zijlstra wrote:
> > >>> Delayed dequeue has tasks sit around on the runqueue that are not
> > >>> actually runnable -- specifically, they will be dequeued the moment
> > >>> they get picked.
> > >>>
> > >>> One side-effect is that such a task can get migrated, which leads to a
> > >>> 'nested' dequeue_task() scenario that messes up uclamp if we don't
> > >>> take care.
> > >>>
> > >>> Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
> > >>> the runqueue. This however will have removed the task from uclamp --
> > >>> per uclamp_rq_dec() in dequeue_task(). So far so good.
> > >>>
> > >>> However, if at that point the task gets migrated -- or nice adjusted
> > >>> or any of a myriad of operations that does a dequeue-enqueue cycle --
> > >>> we'll pass through dequeue_task()/enqueue_task() again. Without
> > >>> modification this will lead to a double decrement for uclamp, which is
> > >>> wrong.
> > >>>
> > >>> Reported-by: Luis Machado <luis.machado@arm.com>
> > >>> Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
> > >>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > >>> ---
> > >>> kernel/sched/core.c | 16 +++++++++++++++-
> > >>> 1 file changed, 15 insertions(+), 1 deletion(-)
> > >>>
> > >>> --- a/kernel/sched/core.c
> > >>> +++ b/kernel/sched/core.c
> > >>> @@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct
> > >>> if (unlikely(!p->sched_class->uclamp_enabled))
> > >>> return;
> > >>>
> > >>> + if (p->se.sched_delayed)
> > >>> + return;
> > >>> +
> > >>> for_each_clamp_id(clamp_id)
> > >>> uclamp_rq_inc_id(rq, p, clamp_id);
> > >>>
> > >>> @@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct
> > >>> if (unlikely(!p->sched_class->uclamp_enabled))
> > >>> return;
> > >>>
> > >>> + if (p->se.sched_delayed)
> > >>> + return;
> > >>> +
> > >>> for_each_clamp_id(clamp_id)
> > >>> uclamp_rq_dec_id(rq, p, clamp_id);
> > >>> }
> > >>> @@ -1979,8 +1985,12 @@ void enqueue_task(struct rq *rq, struct
> > >>> psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
> > >>> }
> > >>>
> > >>> - uclamp_rq_inc(rq, p);
> > >>> p->sched_class->enqueue_task(rq, p, flags);
> > >>> + /*
> > >>> + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> > >>> + * ->sched_delayed.
> > >>> + */
> > >>> + uclamp_rq_inc(rq, p);
> > >>>
> > >>> if (sched_core_enabled(rq))
> > >>> sched_core_enqueue(rq, p);
> > >>> @@ -2002,6 +2012,10 @@ inline bool dequeue_task(struct rq *rq,
> > >>> psi_dequeue(p, flags & DEQUEUE_SLEEP);
> > >>> }
> > >>>
> > >>> + /*
> > >>> + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
> > >>> + * and mark the task ->sched_delayed.
> > >>> + */
> > >>> uclamp_rq_dec(rq, p);
> > >>> return p->sched_class->dequeue_task(rq, p, flags);
> > >>> }
> > >>>
> > >>>
> >
On Thu, 22 Aug 2024 at 10:19, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> Hi,
>
> On Wed, 21 Aug 2024 at 15:34, Hongyan Xia <hongyan.xia2@arm.com> wrote:
> >
> > Hi Peter,
> >
> > Sorry for bombarding this thread in the last couple of days. I'm seeing
> > several issues in the latest tip/sched/core after these patches landed.
> >
> > What I'm now seeing seems to be an unbalanced call of util_est. First, I applied
>
> I also see a remaining util_est for idle rq because of an unbalance
> call of util_est_enqueue|dequeue
>
> > the following diff to warn against util_est != 0 when no tasks are on
> > the queue:
> >
> > https://lore.kernel.org/all/752ae417c02b9277ca3ec18893747c54dd5f277f.1724245193.git.hongyan.xia2@arm.com/
> >
> > Then, I'm reliably seeing warnings on my Juno board during boot in
> > latest tip/sched/core.
> >
> > If I do the same thing to util_est just like what you did in this uclamp
> > patch, like this:
>
> I think that the solution is simpler than your proposal and we just
> need to always call util_est_enqueue() before the
> requeue_delayed_entity
I have been too quick and the below doesn't fix the problem
>
> @@ -6970,11 +6970,6 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> int rq_h_nr_running = rq->cfs.h_nr_running;
> u64 slice = 0;
>
> - if (flags & ENQUEUE_DELAYED) {
> - requeue_delayed_entity(se);
> - return;
> - }
> -
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> @@ -6983,6 +6978,11 @@ enqueue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> + if (flags & ENQUEUE_DELAYED) {
> + requeue_delayed_entity(se);
> + return;
> + }
> +
> /*
> * If in_iowait is set, the code below may not trigger any cpufreq
> * utilization updates, so do it here explicitly with the IOWAIT flag
>
>
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 574ef19df64b..58aac42c99e5 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6946,7 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct
> > task_struct *p, int flags)
> >
> > if (flags & ENQUEUE_DELAYED) {
> > requeue_delayed_entity(se);
> > - return;
> > + goto util_est;
> > }
> >
> > /*
> > @@ -6955,7 +6955,6 @@ enqueue_task_fair(struct rq *rq, struct
> > task_struct *p, int flags)
> > * Let's add the task's estimated utilization to the cfs_rq's
> > * estimated utilization, before we update schedutil.
> > */
> > - util_est_enqueue(&rq->cfs, p);
> >
> > /*
> > * If in_iowait is set, the code below may not trigger any cpufreq
> > @@ -7050,6 +7049,9 @@ enqueue_task_fair(struct rq *rq, struct
> > task_struct *p, int flags)
> > assert_list_leaf_cfs_rq(rq);
> >
> > hrtick_update(rq);
> > +util_est:
> > + if (!p->se.sched_delayed)
> > + util_est_enqueue(&rq->cfs, p);
> > }
> >
> > static void set_next_buddy(struct sched_entity *se);
> > @@ -7173,7 +7175,8 @@ static int dequeue_entities(struct rq *rq, struct
> > sched_entity *se, int flags)
> > */
> > static bool dequeue_task_fair(struct rq *rq, struct task_struct *p,
> > int flags)
> > {
> > - util_est_dequeue(&rq->cfs, p);
> > + if (!p->se.sched_delayed)
> > + util_est_dequeue(&rq->cfs, p);
> >
> > if (dequeue_entities(rq, &p->se, flags) < 0) {
> > if (!rq->cfs.h_nr_running)
> >
> > which is basically enqueuing util_est after enqueue_task_fair(),
> > dequeuing util_est before dequeue_task_fair() and double check
> > p->se.delayed_dequeue, then the unbalanced issue seems to go away.
> >
> > Hopefully this helps you in finding where the problem could be.
> >
> > Hongyan
> >
> > On 27/07/2024 11:27, Peter Zijlstra wrote:
> > > Delayed dequeue has tasks sit around on the runqueue that are not
> > > actually runnable -- specifically, they will be dequeued the moment
> > > they get picked.
> > >
> > > One side-effect is that such a task can get migrated, which leads to a
> > > 'nested' dequeue_task() scenario that messes up uclamp if we don't
> > > take care.
> > >
> > > Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
> > > the runqueue. This however will have removed the task from uclamp --
> > > per uclamp_rq_dec() in dequeue_task(). So far so good.
> > >
> > > However, if at that point the task gets migrated -- or nice adjusted
> > > or any of a myriad of operations that does a dequeue-enqueue cycle --
> > > we'll pass through dequeue_task()/enqueue_task() again. Without
> > > modification this will lead to a double decrement for uclamp, which is
> > > wrong.
> > >
> > > Reported-by: Luis Machado <luis.machado@arm.com>
> > > Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > > kernel/sched/core.c | 16 +++++++++++++++-
> > > 1 file changed, 15 insertions(+), 1 deletion(-)
> > >
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct
> > > if (unlikely(!p->sched_class->uclamp_enabled))
> > > return;
> > >
> > > + if (p->se.sched_delayed)
> > > + return;
> > > +
> > > for_each_clamp_id(clamp_id)
> > > uclamp_rq_inc_id(rq, p, clamp_id);
> > >
> > > @@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct
> > > if (unlikely(!p->sched_class->uclamp_enabled))
> > > return;
> > >
> > > + if (p->se.sched_delayed)
> > > + return;
> > > +
> > > for_each_clamp_id(clamp_id)
> > > uclamp_rq_dec_id(rq, p, clamp_id);
> > > }
> > > @@ -1979,8 +1985,12 @@ void enqueue_task(struct rq *rq, struct
> > > psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
> > > }
> > >
> > > - uclamp_rq_inc(rq, p);
> > > p->sched_class->enqueue_task(rq, p, flags);
> > > + /*
> > > + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> > > + * ->sched_delayed.
> > > + */
> > > + uclamp_rq_inc(rq, p);
> > >
> > > if (sched_core_enabled(rq))
> > > sched_core_enqueue(rq, p);
> > > @@ -2002,6 +2012,10 @@ inline bool dequeue_task(struct rq *rq,
> > > psi_dequeue(p, flags & DEQUEUE_SLEEP);
> > > }
> > >
> > > + /*
> > > + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
> > > + * and mark the task ->sched_delayed.
> > > + */
> > > uclamp_rq_dec(rq, p);
> > > return p->sched_class->dequeue_task(rq, p, flags);
> > > }
> > >
> > >
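To make the ordering in the util_est diff quoted above concrete, here is a
minimal toy model. It is not kernel code and may differ from the eventual
upstream fix: struct toy_rq, struct toy_task and the two helpers are invented
for illustration (the real hooks operate on rq->cfs and p->se.sched_delayed).
It only shows why placing util_est_enqueue() after the class enqueue,
util_est_dequeue() before the class dequeue, and skipping both for a delayed
task keeps the contribution balanced.

#include <assert.h>
#include <stdbool.h>

struct toy_rq   { unsigned int util_est; };
struct toy_task { unsigned int util_est; bool sched_delayed; };

static void util_est_enqueue(struct toy_rq *rq, const struct toy_task *p)
{
	if (p->sched_delayed)		/* contribution is already absent */
		return;
	rq->util_est += p->util_est;
}

static void util_est_dequeue(struct toy_rq *rq, const struct toy_task *p)
{
	if (p->sched_delayed)		/* don't remove it a second time */
		return;
	rq->util_est -= p->util_est;
}

int main(void)
{
	struct toy_rq rq = { 0 };
	struct toy_task p = { .util_est = 100, .sched_delayed = false };

	util_est_enqueue(&rq, &p);		/* normal wakeup */
	assert(rq.util_est == 100);

	/* Sleep dequeue gets delayed: the hook runs first, then the task is
	 * marked delayed and left on the runqueue. */
	util_est_dequeue(&rq, &p);
	p.sched_delayed = true;
	assert(rq.util_est == 0);

	/* A nice adjustment does an extra dequeue/enqueue cycle while the
	 * task is still delayed; both hooks are skipped. */
	util_est_dequeue(&rq, &p);
	util_est_enqueue(&rq, &p);
	assert(rq.util_est == 0);		/* no double decrement */

	/* Real wakeup: ENQUEUE_DELAYED clears the flag inside the class
	 * enqueue, and the hook placed after it adds the contribution back. */
	p.sched_delayed = false;
	util_est_enqueue(&rq, &p);
	assert(rq.util_est == 100);

	return 0;
}

With this placement the idle-rq warning from the first diff in the thread
would have nothing left over to warn about.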
On 27/07/2024 11:27, Peter Zijlstra wrote:
> Delayed dequeue has tasks sit around on the runqueue that are not
> actually runnable -- specifically, they will be dequeued the moment
> they get picked.
>
> One side-effect is that such a task can get migrated, which leads to a
> 'nested' dequeue_task() scenario that messes up uclamp if we don't
> take care.
>
> Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
> the runqueue. This however will have removed the task from uclamp --
> per uclamp_rq_dec() in dequeue_task(). So far so good.
>
> However, if at that point the task gets migrated -- or nice adjusted
> or any of a myriad of operations that does a dequeue-enqueue cycle --
> we'll pass through dequeue_task()/enqueue_task() again. Without
> modification this will lead to a double decrement for uclamp, which is
> wrong.
>
> Reported-by: Luis Machado <luis.machado@arm.com>
> Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/core.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1676,6 +1676,9 @@ static inline void uclamp_rq_inc(struct
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_inc_id(rq, p, clamp_id);
>
> @@ -1700,6 +1703,9 @@ static inline void uclamp_rq_dec(struct
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_dec_id(rq, p, clamp_id);
> }
> @@ -1979,8 +1985,12 @@ void enqueue_task(struct rq *rq, struct
> psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
> }
>
> - uclamp_rq_inc(rq, p);
> p->sched_class->enqueue_task(rq, p, flags);
> + /*
> + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> + * ->sched_delayed.
> + */
> + uclamp_rq_inc(rq, p);

Apart from the typo in the title, this is a notable functional change. Both
classes that support uclamp update the CPU frequency in enqueue_task().
Before, a task that has uclamp_min will immediately drive up the frequency
the moment it is enqueued. Now, driving up the frequency is delayed until
the next util update.

I do not yet have evidence suggesting this is quantitatively bad, like first
frame drops, but we might want to keep an eye on this and switch back to the
old way if possible.

>
> if (sched_core_enabled(rq))
> sched_core_enqueue(rq, p);
> @@ -2002,6 +2012,10 @@ inline bool dequeue_task(struct rq *rq,
> psi_dequeue(p, flags & DEQUEUE_SLEEP);
> }
>
> + /*
> + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
> + * and mark the task ->sched_delayed.
> + */
> uclamp_rq_dec(rq, p);
> return p->sched_class->dequeue_task(rq, p, flags);
> }
>
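To visualise the ordering concern raised in the reply above, here is a toy
model, not kernel code: clamped_freq(), class_enqueue() and the struct fields
are invented, and the real path goes through the class enqueue and the cpufreq
callbacks. With the old order the frequency request made while enqueueing
already sees the task's uclamp_min; with the new order it does not, and the
boost only becomes visible at the next utilization update.

#include <stdio.h>

struct toy_rq   { unsigned int util; unsigned int uclamp_min; };
struct toy_task { unsigned int util; unsigned int uclamp_min; };

static unsigned int clamped_freq(const struct toy_rq *rq)
{
	/* frequency follows utilization, but never drops below the rq clamp */
	return rq->util > rq->uclamp_min ? rq->util : rq->uclamp_min;
}

static void uclamp_rq_inc(struct toy_rq *rq, const struct toy_task *p)
{
	if (p->uclamp_min > rq->uclamp_min)
		rq->uclamp_min = p->uclamp_min;
}

/* stands in for p->sched_class->enqueue_task(), which may request a freq */
static void class_enqueue(struct toy_rq *rq, const struct toy_task *p)
{
	rq->util += p->util;
	printf("freq request at enqueue time: %u\n", clamped_freq(rq));
}

int main(void)
{
	struct toy_task p = { .util = 10, .uclamp_min = 512 };
	struct toy_rq old_order = { 0 };
	struct toy_rq new_order = { 0 };

	/* old order: uclamp_rq_inc() first, so the request sees the boost */
	uclamp_rq_inc(&old_order, &p);
	class_enqueue(&old_order, &p);		/* prints 512 */

	/* new order: the request only sees util = 10; the boost to 512 is
	 * visible from the next utilization update onwards */
	class_enqueue(&new_order, &p);
	uclamp_rq_inc(&new_order, &p);

	return 0;
}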
The following commit has been merged into the sched/core branch of tip:
Commit-ID: dfa0a574cbc47bfd5f8985f74c8ea003a37fa078
Gitweb: https://git.kernel.org/tip/dfa0a574cbc47bfd5f8985f74c8ea003a37fa078
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 05 Jun 2024 12:09:11 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:42 +02:00
sched/uclamg: Handle delayed dequeue
Delayed dequeue has tasks sit around on the runqueue that are not
actually runnable -- specifically, they will be dequeued the moment
they get picked.
One side-effect is that such a task can get migrated, which leads to a
'nested' dequeue_task() scenario that messes up uclamp if we don't
take care.
Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
the runqueue. This however will have removed the task from uclamp --
per uclamp_rq_dec() in dequeue_task(). So far so good.
However, if at that point the task gets migrated -- or nice adjusted
or any of a myriad of operations that does a dequeue-enqueue cycle --
we'll pass through dequeue_task()/enqueue_task() again. Without
modification this will lead to a double decrement for uclamp, which is
wrong.
Reported-by: Luis Machado <luis.machado@arm.com>
Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105029.315205425@infradead.org
---
kernel/sched/core.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7356464..80e639e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1691,6 +1691,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
@@ -1715,6 +1718,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
@@ -1994,8 +2000,12 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
- uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ /*
+ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
+ * ->sched_delayed.
+ */
+ uclamp_rq_inc(rq, p);
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
@@ -2017,6 +2027,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ /*
+ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+ * and mark the task ->sched_delayed.
+ */
uclamp_rq_dec(rq, p);
return p->sched_class->dequeue_task(rq, p, flags);
}
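For readers following along, the changelog scenario can be condensed into a
small toy model. The integer counters and helpers below are invented stand-ins
for the per-rq uclamp bucket accounting; this is not the kernel code.

#include <stdbool.h>
#include <stdio.h>

struct toy_task { bool sched_delayed; };

static void uclamp_inc(int *rq, const struct toy_task *p, bool guarded)
{
	if (guarded && p->sched_delayed)
		return;
	(*rq)++;
}

static void uclamp_dec(int *rq, const struct toy_task *p, bool guarded)
{
	if (guarded && p->sched_delayed)
		return;
	(*rq)--;
}

static void run(bool guarded)
{
	int old_rq = 0, new_rq = 0;
	struct toy_task p = { .sched_delayed = false };

	uclamp_inc(&old_rq, &p, guarded);	/* task enqueued on old_rq */

	/* dequeue_task(DEQUEUE_SLEEP) 'fails': the dec already ran, the
	 * task stays on the runqueue marked delayed */
	uclamp_dec(&old_rq, &p, guarded);
	p.sched_delayed = true;

	/* migration while delayed: dequeue from old_rq, enqueue on new_rq */
	uclamp_dec(&old_rq, &p, guarded);	/* double decrement if unguarded */
	uclamp_inc(&new_rq, &p, guarded);	/* counts a sleeping task if unguarded */

	/* real wakeup: ENQUEUE_DELAYED clears the flag before the inc hook */
	p.sched_delayed = false;
	uclamp_inc(&new_rq, &p, guarded);

	printf("%s guard: old_rq = %d, new_rq = %d\n",
	       guarded ? "with   " : "without", old_rq, new_rq);
}

int main(void)
{
	run(false);	/* without guard: old_rq = -1, new_rq = 2 */
	run(true);	/* with guard:    old_rq =  0, new_rq = 1 */
	return 0;
}

Without the guard, the old runqueue ends up with a negative count (the double
decrement from the changelog) and the new runqueue counts the task twice once
it really wakes up; with the guard and the hook ordering above, both counters
come out right.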
On 8/18/24 07:23, tip-bot2 for Peter Zijlstra wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: dfa0a574cbc47bfd5f8985f74c8ea003a37fa078
> Gitweb: https://git.kernel.org/tip/dfa0a574cbc47bfd5f8985f74c8ea003a37fa078
> Author: Peter Zijlstra <peterz@infradead.org>
> AuthorDate: Wed, 05 Jun 2024 12:09:11 +02:00
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Sat, 17 Aug 2024 11:06:42 +02:00
>
> sched/uclamg: Handle delayed dequeue

Nit, but I hadn't noticed the typo until now.

>
> Delayed dequeue has tasks sit around on the runqueue that are not
> actually runnable -- specifically, they will be dequeued the moment
> they get picked.
>
> One side-effect is that such a task can get migrated, which leads to a
> 'nested' dequeue_task() scenario that messes up uclamp if we don't
> take care.
>
> Notably, dequeue_task(DEQUEUE_SLEEP) can 'fail' and keep the task on
> the runqueue. This however will have removed the task from uclamp --
> per uclamp_rq_dec() in dequeue_task(). So far so good.
>
> However, if at that point the task gets migrated -- or nice adjusted
> or any of a myriad of operations that does a dequeue-enqueue cycle --
> we'll pass through dequeue_task()/enqueue_task() again. Without
> modification this will lead to a double decrement for uclamp, which is
> wrong.
>
> Reported-by: Luis Machado <luis.machado@arm.com>
> Reported-by: Hongyan Xia <hongyan.xia2@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reviewed-by: Valentin Schneider <vschneid@redhat.com>
> Tested-by: Valentin Schneider <vschneid@redhat.com>
> Link: https://lkml.kernel.org/r/20240727105029.315205425@infradead.org
> ---
> kernel/sched/core.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7356464..80e639e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1691,6 +1691,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_inc_id(rq, p, clamp_id);
>
> @@ -1715,6 +1718,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
> if (unlikely(!p->sched_class->uclamp_enabled))
> return;
>
> + if (p->se.sched_delayed)
> + return;
> +
> for_each_clamp_id(clamp_id)
> uclamp_rq_dec_id(rq, p, clamp_id);
> }
> @@ -1994,8 +2000,12 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
> psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
> }
>
> - uclamp_rq_inc(rq, p);
> p->sched_class->enqueue_task(rq, p, flags);
> + /*
> + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> + * ->sched_delayed.
> + */
> + uclamp_rq_inc(rq, p);
>
> if (sched_core_enabled(rq))
> sched_core_enqueue(rq, p);
> @@ -2017,6 +2027,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
> psi_dequeue(p, flags & DEQUEUE_SLEEP);
> }
>
> + /*
> + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
> + * and mark the task ->sched_delayed.
> + */
> uclamp_rq_dec(rq, p);
> return p->sched_class->dequeue_task(rq, p, flags);
> }
>