[RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger

Posted by Vincent Guittot 2 months, 1 week ago
EAS is based on wakeup events to efficiently place tasks on the system, but
there are cases where a task doesn't have wakeup events anymore, or at far
too low a pace. For such cases, we check if it's worth pushing the task to
another CPU instead of putting it back in the enqueued list.

Wakeup events remain the main way to migrate tasks, but we now detect
situations where a task is stuck on a CPU by checking that its utilization
is larger than the max available compute capacity (max CPU capacity or
uclamp max setting).

When the system becomes overutilized and some CPUs are idle, we try to
push tasks instead of waiting for the periodic load balance.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/topology.c |  3 ++
 2 files changed, 68 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9af8d0a61856..e9e1d0c05805 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
+
 /*
  * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
  * failing half-way through and resume the dequeue later.
@@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
 	return static_branch_unlikely(&sched_push_task);
 }
 
+static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
+{
+	unsigned long max_capa, util;
+
+	max_capa = min(get_actual_cpu_capacity(cpu),
+		       uclamp_eff_value(p, UCLAMP_MAX));
+	util = max(task_util_est(p), task_runnable(p));
+
+	/*
+	 * Return true only if the task might not sleep/wakeup because of a low
+	 * compute capacity. Tasks, which wake up regularly, will be handled by
+	 * feec().
+	 */
+	return (util > max_capa);
+}
+
+static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
+{
+	if (!sched_energy_enabled())
+		return false;
+
+	if (is_rd_overutilized(rq->rd))
+		return false;
+
+	if (task_stuck_on_cpu(p, cpu_of(rq)))
+		return true;
+
+	if (!task_fits_cpu(p, cpu_of(rq)))
+		return true;
+
+	return false;
+}
+
+static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
+{
+	if (rq->nr_running == 1)
+		return false;
+
+	if (!is_rd_overutilized(rq->rd))
+		return false;
+
+	/* If there are idle cpus in the llc then try to push the task on it */
+	if (test_idle_cores(cpu_of(rq)))
+		return true;
+
+	return false;
+}
+
+
 static bool fair_push_task(struct rq *rq, struct task_struct *p)
 {
+	if (!task_on_rq_queued(p))
+		return false;
+
+	if (p->se.sched_delayed)
+		return false;
+
+	if (p->nr_cpus_allowed == 1)
+		return false;
+
+	if (sched_energy_push_task(p, rq))
+		return true;
+
+	if (sched_idle_push_task(p, rq))
+		return true;
+
 	return false;
 }
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..5edf7b117ed9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -391,10 +391,13 @@ static void sched_energy_set(bool has_eas)
 		if (sched_debug())
 			pr_info("%s: stopping EAS\n", __func__);
 		static_branch_disable_cpuslocked(&sched_energy_present);
+		static_branch_dec_cpuslocked(&sched_push_task);
+	} else if (has_eas && !sched_energy_enabled()) {
 	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
 		if (sched_debug())
 			pr_info("%s: starting EAS\n", __func__);
 		static_branch_enable_cpuslocked(&sched_energy_present);
+		static_branch_inc_cpuslocked(&sched_push_task);
 	}
 }
 
-- 
2.43.0
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Hillf Danton 2 months, 1 week ago
On Mon,  1 Dec 2025 10:13:08 +0100 Vincent Guittot wrote:
> EAS is based on wakeup events to efficiently place tasks on the system, but
> there are cases where a task doesn't have wakeup events anymore or at a far
> too low pace. For such cases, we check if it's worht pushing hte task on
> another CPUs instead of putting it back in the enqueued list.
> 
> Wake up events remain the main way to migrate tasks but we now detect
> situation where a task is stuck on a CPU by checking that its utilization
> is larger than the max available compute capacity (max cpu capacity or
> uclamp max setting)
> 
> When the system becomes overutilized and some CPUs are idle, we try to
> push tasks instead of waiting periodic load balance.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/topology.c |  3 ++
>  2 files changed, 68 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9af8d0a61856..e9e1d0c05805 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  }
>  
>  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> +
>  /*
>   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
>   * failing half-way through and resume the dequeue later.
> @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
>  	return static_branch_unlikely(&sched_push_task);
>  }
>  
> +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> +{
> +	unsigned long max_capa, util;
> +
> +	max_capa = min(get_actual_cpu_capacity(cpu),
> +		       uclamp_eff_value(p, UCLAMP_MAX));
> +	util = max(task_util_est(p), task_runnable(p));
> +
> +	/*
> +	 * Return true only if the task might not sleep/wakeup because of a low
> +	 * compute capacity. Tasks, which wake up regularly, will be handled by
> +	 * feec().
> +	 */
> +	return (util > max_capa);
> +}
> +
> +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> +{
> +	if (!sched_energy_enabled())
> +		return false;
> +
> +	if (is_rd_overutilized(rq->rd))
> +		return false;
> +
> +	if (task_stuck_on_cpu(p, cpu_of(rq)))
> +		return true;
> +
> +	if (!task_fits_cpu(p, cpu_of(rq)))
> +		return true;
> +
> +	return false;
> +}
> +
> +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
> +{
> +	if (rq->nr_running == 1)
> +		return false;
> +
> +	if (!is_rd_overutilized(rq->rd))
> +		return false;
> +
> +	/* If there are idle cpus in the llc then try to push the task on it */
> +	if (test_idle_cores(cpu_of(rq)))
> +		return true;
> +
> +	return false;
> +}
> +
> +
>  static bool fair_push_task(struct rq *rq, struct task_struct *p)
>  {
> +	if (!task_on_rq_queued(p))
> +		return false;

Task is queued on rq.
> +
> +	if (p->se.sched_delayed)
> +		return false;
> +
> +	if (p->nr_cpus_allowed == 1)
> +		return false;
> +
> +	if (sched_energy_push_task(p, rq))
> +		return true;

If task is stuck on CPU, it could not be on rq. Weird.
> +
> +	if (sched_idle_push_task(p, rq))
> +		return true;
> +
>  	return false;
>  }
>  
More, in the tick path,

task_tick_fair
  check_pushable_task
    fair_push_task
      task_on_rq_queued // this check makes no sense
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Vincent Guittot 2 months, 1 week ago
On Tue, 2 Dec 2025 at 10:45, Hillf Danton <hdanton@sina.com> wrote:
>
> On Mon,  1 Dec 2025 10:13:08 +0100 Vincent Guittot wrote:
> > EAS is based on wakeup events to efficiently place tasks on the system, but
> > there are cases where a task doesn't have wakeup events anymore or at a far
> > too low pace. For such cases, we check if it's worht pushing hte task on
> > another CPUs instead of putting it back in the enqueued list.
> >
> > Wake up events remain the main way to migrate tasks but we now detect
> > situation where a task is stuck on a CPU by checking that its utilization
> > is larger than the max available compute capacity (max cpu capacity or
> > uclamp max setting)
> >
> > When the system becomes overutilized and some CPUs are idle, we try to
> > push tasks instead of waiting periodic load balance.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> >  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
> >  kernel/sched/topology.c |  3 ++
> >  2 files changed, 68 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9af8d0a61856..e9e1d0c05805 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >  }
> >
> >  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> > +
> >  /*
> >   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> >   * failing half-way through and resume the dequeue later.
> > @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
> >       return static_branch_unlikely(&sched_push_task);
> >  }
> >
> > +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> > +{
> > +     unsigned long max_capa, util;
> > +
> > +     max_capa = min(get_actual_cpu_capacity(cpu),
> > +                    uclamp_eff_value(p, UCLAMP_MAX));
> > +     util = max(task_util_est(p), task_runnable(p));
> > +
> > +     /*
> > +      * Return true only if the task might not sleep/wakeup because of a low
> > +      * compute capacity. Tasks, which wake up regularly, will be handled by
> > +      * feec().
> > +      */
> > +     return (util > max_capa);
> > +}
> > +
> > +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> > +{
> > +     if (!sched_energy_enabled())
> > +             return false;
> > +
> > +     if (is_rd_overutilized(rq->rd))
> > +             return false;
> > +
> > +     if (task_stuck_on_cpu(p, cpu_of(rq)))
> > +             return true;
> > +
> > +     if (!task_fits_cpu(p, cpu_of(rq)))
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
> > +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
> > +{
> > +     if (rq->nr_running == 1)
> > +             return false;
> > +
> > +     if (!is_rd_overutilized(rq->rd))
> > +             return false;
> > +
> > +     /* If there are idle cpus in the llc then try to push the task on it */
> > +     if (test_idle_cores(cpu_of(rq)))
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
> > +
> >  static bool fair_push_task(struct rq *rq, struct task_struct *p)
> >  {
> > +     if (!task_on_rq_queued(p))
> > +             return false;
>
> Task is queued on rq.
> > +
> > +     if (p->se.sched_delayed)
> > +             return false;
> > +
> > +     if (p->nr_cpus_allowed == 1)
> > +             return false;
> > +
> > +     if (sched_energy_push_task(p, rq))
> > +             return true;
>
> If task is stuck on CPU, it could not be on rq. Weird.

Maybe it comes from my description and I should use task_stuck_on_rq.
By stuck, I mean that the task doesn't have any opportunity to migrate
to another cpu/rq and stays "forever" (at least until its next sleep) on
this cpu/rq, because load balancing is disabled/bypassed w/ EAS.
Here, stuck does not mean blocked/sleeping.
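
To make the criterion concrete, here is a tiny standalone illustration of
the arithmetic in task_stuck_on_cpu(); the numbers are invented, and 1024
is the usual SCHED_CAPACITY_SCALE / default uclamp max:

#include <stdio.h>

/* Toy stand-in for task_stuck_on_cpu(): all values are in the kernel's
 * 0..1024 capacity scale. */
static int stuck(unsigned long cpu_capa, unsigned long uclamp_max,
		 unsigned long util_est, unsigned long runnable)
{
	unsigned long max_capa = cpu_capa < uclamp_max ? cpu_capa : uclamp_max;
	unsigned long util = util_est > runnable ? util_est : runnable;

	return util > max_capa;
}

int main(void)
{
	/* A util ~600 task on a little core of capacity 512: it never gets
	 * enough bandwidth to sleep, so no wakeup, so no feec() placement. */
	printf("little: %d\n", stuck(512, 1024, 600, 650));	/* 1 -> push */
	/* The same task on a big core of capacity 1024 can finish and sleep. */
	printf("big:    %d\n", stuck(1024, 1024, 600, 650));	/* 0 */
	return 0;
}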

> > +
> > +     if (sched_idle_push_task(p, rq))
> > +             return true;
> > +
> >       return false;
> >  }
> >
> More, in the tick path,
>
> task_tick_fair
>   check_pushable_task
>     fair_push_task
>       task_on_rq_queued // this check makes no sense

I want to use a single entry point (fair_push_task) for deciding to
push a task, so I agree that testing task_on_rq_queued() at tick time is
useless, but it is needed for the other cases, when the task is put back
in the rb tree.
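
A sketch of the two kinds of call sites being unified behind that single
entry point; the tick-path names come from Hillf's mail above, while the
requeue-side hook name is a hypothetical stand-in, not part of this patch:

/*
 * Tick path: fair_push_task() runs on rq->curr, which is by definition
 * still queued, so the task_on_rq_queued() test can never fail here:
 *
 *   task_tick_fair()
 *     check_pushable_task()
 *       fair_push_task()
 *
 * Requeue path (hypothetical shape): the task being put back in the
 * rb tree may race with a dequeue, so the common entry point
 * re-validates task_on_rq_queued() instead of trusting every caller:
 *
 *   requeue/enqueue of p
 *     fair_add_pushable_task(rq, p)
 *       fair_push_task(rq, p)
 */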
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Hillf Danton 2 months, 1 week ago
On Tue, 2 Dec 2025 14:01:39 +0100 Vincent Guittot wrote:
>On Tue, 2 Dec 2025 at 10:45, Hillf Danton <hdanton@sina.com> wrote:
>> On Mon,  1 Dec 2025 10:13:08 +0100 Vincent Guittot wrote:
>> > EAS is based on wakeup events to efficiently place tasks on the system, but
>> > there are cases where a task doesn't have wakeup events anymore or at a far
>> > too low pace. For such cases, we check if it's worht pushing hte task on
>> > another CPUs instead of putting it back in the enqueued list.
>> >
>> > Wake up events remain the main way to migrate tasks but we now detect
>> > situation where a task is stuck on a CPU by checking that its utilization
>> > is larger than the max available compute capacity (max cpu capacity or
>> > uclamp max setting)
>> >
>> > When the system becomes overutilized and some CPUs are idle, we try to
>> > push tasks instead of waiting periodic load balance.
>> >
>> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> > ---
>> >  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
>> >  kernel/sched/topology.c |  3 ++
>> >  2 files changed, 68 insertions(+)
>> >
>> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> > index 9af8d0a61856..e9e1d0c05805 100644
>> > --- a/kernel/sched/fair.c
>> > +++ b/kernel/sched/fair.c
>> > @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> >  }
>> >
>> >  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
>> > +
>> >  /*
>> >   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
>> >   * failing half-way through and resume the dequeue later.
>> > @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
>> >       return static_branch_unlikely(&sched_push_task);
>> >  }
>> >
>> > +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
>> > +{
>> > +     unsigned long max_capa, util;
>> > +
>> > +     max_capa = min(get_actual_cpu_capacity(cpu),
>> > +                    uclamp_eff_value(p, UCLAMP_MAX));
>> > +     util = max(task_util_est(p), task_runnable(p));
>> > +
>> > +     /*
>> > +      * Return true only if the task might not sleep/wakeup because of a low
>> > +      * compute capacity. Tasks, which wake up regularly, will be handled by
>> > +      * feec().
>> > +      */
>> > +     return (util > max_capa);
>> > +}
>> > +
>> > +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
>> > +{
>> > +     if (!sched_energy_enabled())
>> > +             return false;
>> > +
>> > +     if (is_rd_overutilized(rq->rd))
>> > +             return false;
>> > +
>> > +     if (task_stuck_on_cpu(p, cpu_of(rq)))
>> > +             return true;
>> > +
>> > +     if (!task_fits_cpu(p, cpu_of(rq)))
>> > +             return true;
>> > +
>> > +     return false;
>> > +}
>> > +
>> > +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
>> > +{
>> > +     if (rq->nr_running == 1)
>> > +             return false;
>> > +
>> > +     if (!is_rd_overutilized(rq->rd))
>> > +             return false;
>> > +
>> > +     /* If there are idle cpus in the llc then try to push the task on it */
>> > +     if (test_idle_cores(cpu_of(rq)))
>> > +             return true;
>> > +
>> > +     return false;
>> > +}
>> > +
>> > +
>> >  static bool fair_push_task(struct rq *rq, struct task_struct *p)
>> >  {
>> > +     if (!task_on_rq_queued(p))
>> > +             return false;
>>
>> Task is queued on rq.
>> > +
>> > +     if (p->se.sched_delayed)
>> > +             return false;
>> > +
>> > +     if (p->nr_cpus_allowed == 1)
>> > +             return false;
>> > +
>> > +     if (sched_energy_push_task(p, rq))
>> > +             return true;
>>
>> If task is stuck on CPU, it could not be on rq. Weird.
>
> May be it comes from my description and I should use task_stuck_on_rq
> By stuck, I mean that the task doesn't have any opportunity to migrate
> on another cpu/rq and stay "forever"  (at least until next sleep) on
> this cpu/rq because load balancing is disabled/bypassed w/ EAS
> Here Stuck does not mean blocked/sleeping
>
Given the task is queued on the rq, I find the correct phrase, stacked, in the cover
letter instead of stuck, and long-standing stacked tasks mean the load
balancer fails to cure that stacking. 1/7 fixes that failure, no?
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Vincent Guittot 2 months, 1 week ago
On Wed, 3 Dec 2025 at 10:00, Hillf Danton <hdanton@sina.com> wrote:
>
> On Tue, 2 Dec 2025 14:01:39 +0100 Vincent Guittot wrote:
> >On Tue, 2 Dec 2025 at 10:45, Hillf Danton <hdanton@sina.com> wrote:
> >> On Mon,  1 Dec 2025 10:13:08 +0100 Vincent Guittot wrote:
> >> > EAS is based on wakeup events to efficiently place tasks on the system, but
> >> > there are cases where a task doesn't have wakeup events anymore or at a far
> >> > too low pace. For such cases, we check if it's worht pushing hte task on
> >> > another CPUs instead of putting it back in the enqueued list.
> >> >
> >> > Wake up events remain the main way to migrate tasks but we now detect
> >> > situation where a task is stuck on a CPU by checking that its utilization
> >> > is larger than the max available compute capacity (max cpu capacity or
> >> > uclamp max setting)
> >> >
> >> > When the system becomes overutilized and some CPUs are idle, we try to
> >> > push tasks instead of waiting periodic load balance.
> >> >
> >> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> >> > ---
> >> >  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
> >> >  kernel/sched/topology.c |  3 ++
> >> >  2 files changed, 68 insertions(+)
> >> >
> >> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> > index 9af8d0a61856..e9e1d0c05805 100644
> >> > --- a/kernel/sched/fair.c
> >> > +++ b/kernel/sched/fair.c
> >> > @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >> >  }
> >> >
> >> >  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> >> > +
> >> >  /*
> >> >   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> >> >   * failing half-way through and resume the dequeue later.
> >> > @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
> >> >       return static_branch_unlikely(&sched_push_task);
> >> >  }
> >> >
> >> > +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> >> > +{
> >> > +     unsigned long max_capa, util;
> >> > +
> >> > +     max_capa = min(get_actual_cpu_capacity(cpu),
> >> > +                    uclamp_eff_value(p, UCLAMP_MAX));
> >> > +     util = max(task_util_est(p), task_runnable(p));
> >> > +
> >> > +     /*
> >> > +      * Return true only if the task might not sleep/wakeup because of a low
> >> > +      * compute capacity. Tasks, which wake up regularly, will be handled by
> >> > +      * feec().
> >> > +      */
> >> > +     return (util > max_capa);
> >> > +}
> >> > +
> >> > +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> >> > +{
> >> > +     if (!sched_energy_enabled())
> >> > +             return false;
> >> > +
> >> > +     if (is_rd_overutilized(rq->rd))
> >> > +             return false;
> >> > +
> >> > +     if (task_stuck_on_cpu(p, cpu_of(rq)))
> >> > +             return true;
> >> > +
> >> > +     if (!task_fits_cpu(p, cpu_of(rq)))
> >> > +             return true;
> >> > +
> >> > +     return false;
> >> > +}
> >> > +
> >> > +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
> >> > +{
> >> > +     if (rq->nr_running == 1)
> >> > +             return false;
> >> > +
> >> > +     if (!is_rd_overutilized(rq->rd))
> >> > +             return false;
> >> > +
> >> > +     /* If there are idle cpus in the llc then try to push the task on it */
> >> > +     if (test_idle_cores(cpu_of(rq)))
> >> > +             return true;
> >> > +
> >> > +     return false;
> >> > +}
> >> > +
> >> > +
> >> >  static bool fair_push_task(struct rq *rq, struct task_struct *p)
> >> >  {
> >> > +     if (!task_on_rq_queued(p))
> >> > +             return false;
> >>
> >> Task is queued on rq.
> >> > +
> >> > +     if (p->se.sched_delayed)
> >> > +             return false;
> >> > +
> >> > +     if (p->nr_cpus_allowed == 1)
> >> > +             return false;
> >> > +
> >> > +     if (sched_energy_push_task(p, rq))
> >> > +             return true;
> >>
> >> If task is stuck on CPU, it could not be on rq. Weird.
> >
> > May be it comes from my description and I should use task_stuck_on_rq
> > By stuck, I mean that the task doesn't have any opportunity to migrate
> > on another cpu/rq and stay "forever"  (at least until next sleep) on
> > this cpu/rq because load balancing is disabled/bypassed w/ EAS
> > Here Stuck does not mean blocked/sleeping
> >
> Given task queued on rq, I find the correct phrase, stack, in the cover
> letter instead of stuck, and the long-standing stacking tasks mean load
> balancer fails to cure that stack. 1/7 fixes that failure, no?

It's not just stacked, because we sometimes/often want to stack tasks
on the same CPU. EAS is based on the assumption that tasks will sleep
and wake up regularly, and EAS will select a new CPU at each wakeup, but
that's not always true. We can have situations where task A has been put
on CPU0 when waking up, sharing the CPU with other tasks. But after
some time, task A would now be better placed on CPU1, not because it
doesn't fit on CPU0 anymore but just because the system state has changed
since its wakeup. Because task A shares CPU0 with other tasks, it
can take dozens/hundreds of ms to finish its work and sleep, and we
don't want to wait those hundreds of ms whereas CPU1 might be a better
choice now.
Patch 1 fixes a case where a CPU was wrongly classified as overloaded
whereas it's not the case (because of uclamp max, for example).
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Hillf Danton 2 months, 1 week ago
On Wed, 3 Dec 2025 14:32:06 +0100 Vincent Guittot wrote:
> On Wed, 3 Dec 2025 at 10:00, Hillf Danton <hdanton@sina.com> wrote:
> > Given task queued on rq, I find the correct phrase, stack, in the cover
> > letter instead of stuck, and the long-standing stacking tasks mean load
> > balancer fails to cure that stack. 1/7 fixes that failure, no?
> 
> It's not just stacked because we sometimes/often want to stack tasks
> on the same CPU. EAS is based on the assumption that tasks will sleep
> and wake up regularly and EAS will select a new CPU at each wakeup but
> it's not always true. We can have situations where task A has been put
> on CPU0when waking up, sharing the CPU with others tasks. But after
> some time, task A should be better on CPUB now not because of not
> fitting anymore on CPU0 but just because the system state has changed
> since its wakeup. Because task A shares the CPU0 with other tasks, it
> can takes dozen/hundreds of ms to finish its works and to sleep and we
> don't wait those hundreds of ms whereas a CPU1 might be a better
> choice now.
> 
Even if a task is pushed from an ARM little core to a big one, the net
result could be zero, either because the number of stacked tasks on the
dst CPU increases or, more importantly, the dst CPU cycles are shared at
the pace of the tick. In general, if stacking is not mitigated but merely
migrated from one CPU to another, pushing cannot make much difference.
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Vincent Guittot 2 months ago
On Thu, 4 Dec 2025 at 07:59, Hillf Danton <hdanton@sina.com> wrote:
>
> On Wed, 3 Dec 2025 14:32:06 +0100 Vincent Guittot wrote:
> > On Wed, 3 Dec 2025 at 10:00, Hillf Danton <hdanton@sina.com> wrote:
> > > Given task queued on rq, I find the correct phrase, stack, in the cover
> > > letter instead of stuck, and the long-standing stacking tasks mean load
> > > balancer fails to cure that stack. 1/7 fixes that failure, no?
> >
> > It's not just stacked because we sometimes/often want to stack tasks
> > on the same CPU. EAS is based on the assumption that tasks will sleep
> > and wake up regularly and EAS will select a new CPU at each wakeup but
> > it's not always true. We can have situations where task A has been put
> > on CPU0when waking up, sharing the CPU with others tasks. But after
> > some time, task A should be better on CPUB now not because of not
> > fitting anymore on CPU0 but just because the system state has changed
> > since its wakeup. Because task A shares the CPU0 with other tasks, it
> > can takes dozen/hundreds of ms to finish its works and to sleep and we
> > don't wait those hundreds of ms whereas a CPU1 might be a better
> > choice now.
> >
> Even if task is pushed from an ARM little core to a big one, the net
> result could be zero, either because the number of stacking tasks on the
> dst CPU increases or more important the dst CPU cycles are shared at the
> pace of tick. In general if stacking is not mitigated but migrated from
> one CPU to another, pushing could not make much difference.

If select_task_rq/feec returns a new CPU, it means that it will make a
difference in the consumed energy or the available capacity for the
task. And when overutilized, it looks for an idle CPU.
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Hillf Danton 2 months ago
On Fri, 5 Dec 2025 16:02:27 +0100 Vincent Guittot wrote:
> On Thu, 4 Dec 2025 at 07:59, Hillf Danton <hdanton@sina.com> wrote:
> > On Wed, 3 Dec 2025 14:32:06 +0100 Vincent Guittot wrote:
> > > On Wed, 3 Dec 2025 at 10:00, Hillf Danton <hdanton@sina.com> wrote:
> > > > Given task queued on rq, I find the correct phrase, stack, in the cover
> > > > letter instead of stuck, and the long-standing stacking tasks mean load
> > > > balancer fails to cure that stack. 1/7 fixes that failure, no?
> > >
> > > It's not just stacked because we sometimes/often want to stack tasks
> > > on the same CPU. EAS is based on the assumption that tasks will sleep
> > > and wake up regularly and EAS will select a new CPU at each wakeup but
> > > it's not always true. We can have situations where task A has been put
> > > on CPU0when waking up, sharing the CPU with others tasks. But after
> > > some time, task A should be better on CPUB now not because of not
> > > fitting anymore on CPU0 but just because the system state has changed
> > > since its wakeup. Because task A shares the CPU0 with other tasks, it
> > > can takes dozen/hundreds of ms to finish its works and to sleep and we
> > > don't wait those hundreds of ms whereas a CPU1 might be a better
> > > choice now.
> > >
> > Even if task is pushed from an ARM little core to a big one, the net
> > result could be zero, either because the number of stacking tasks on the
> > dst CPU increases or more important the dst CPU cycles are shared at the
> > pace of tick. In general if stacking is not mitigated but migrated from
> > one CPU to another, pushing could not make much difference.
> 
> if select_task_rq/feec returns a new CPU, it means that it will make a
> difference in the consumed energy or the available capacity for the
> task. And when overutilized, it looks for an idle CPUs
> 
Yeah, given the correct CPU from select_task_rq/feec, in case of stacked
tasks what push does is blindly search for the idlest CPU.
On the opposite side, when a task sleeps, what pull does is correctly
search for the busiest CPU. By correctly, I mean it is the right time to
migrate the task.
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Christian Loehle 2 months, 1 week ago
Some nits below for now

On 12/1/25 09:13, Vincent Guittot wrote:
> EAS is based on wakeup events to efficiently place tasks on the system, but
> there are cases where a task doesn't have wakeup events anymore or at a far
> too low pace. For such cases, we check if it's worht pushing hte task on

worth
the

> another CPUs instead of putting it back in the enqueued list.
> 
> Wake up events remain the main way to migrate tasks but we now detect
> situation where a task is stuck on a CPU by checking that its utilization
> is larger than the max available compute capacity (max cpu capacity or
> uclamp max setting)
> 
> When the system becomes overutilized and some CPUs are idle, we try to
> push tasks instead of waiting periodic load balance.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/topology.c |  3 ++
>  2 files changed, 68 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9af8d0a61856..e9e1d0c05805 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  }
>  
>  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> +

This doesn't belong here

>  /*
>   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
>   * failing half-way through and resume the dequeue later.
> @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
>  	return static_branch_unlikely(&sched_push_task);
>  }
>  
> +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> +{
> +	unsigned long max_capa, util;
> +
> +	max_capa = min(get_actual_cpu_capacity(cpu),
> +		       uclamp_eff_value(p, UCLAMP_MAX));
> +	util = max(task_util_est(p), task_runnable(p));
> +
> +	/*
> +	 * Return true only if the task might not sleep/wakeup because of a low
> +	 * compute capacity. Tasks, which wake up regularly, will be handled by
> +	 * feec().
> +	 */
> +	return (util > max_capa);
> +}
> +
> +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> +{
> +	if (!sched_energy_enabled())
> +		return false;
> +
> +	if (is_rd_overutilized(rq->rd))
> +		return false;
> +
> +	if (task_stuck_on_cpu(p, cpu_of(rq)))
> +		return true;
> +
> +	if (!task_fits_cpu(p, cpu_of(rq)))
> +		return true;
> +
> +	return false;
> +}
> +
> +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
> +{
> +	if (rq->nr_running == 1)
> +		return false;
> +
> +	if (!is_rd_overutilized(rq->rd))
> +		return false;
> +
> +	/* If there are idle cpus in the llc then try to push the task on it */
> +	if (test_idle_cores(cpu_of(rq)))
> +		return true;
> +
> +	return false;
> +}
> +
> +
>  static bool fair_push_task(struct rq *rq, struct task_struct *p)
>  {
> +	if (!task_on_rq_queued(p))
> +		return false;
> +
> +	if (p->se.sched_delayed)
> +		return false;
> +
> +	if (p->nr_cpus_allowed == 1)
> +		return false;
> +
> +	if (sched_energy_push_task(p, rq))
> +		return true;
> +
> +	if (sched_idle_push_task(p, rq))
> +		return true;
> +
>  	return false;
>  }
>  
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..5edf7b117ed9 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -391,10 +391,13 @@ static void sched_energy_set(bool has_eas)
>  		if (sched_debug())
>  			pr_info("%s: stopping EAS\n", __func__);
>  		static_branch_disable_cpuslocked(&sched_energy_present);
> +		static_branch_dec_cpuslocked(&sched_push_task);
> +	} else if (has_eas && !sched_energy_enabled()) {
>  	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {

This could just be (has_eas && sched_energy_enabled() && !static_branch_unlikely(&sched_energy_present))
to avoid the awkward else if above

>  		if (sched_debug())
>  			pr_info("%s: starting EAS\n", __func__);
>  		static_branch_enable_cpuslocked(&sched_energy_present);
> +		static_branch_inc_cpuslocked(&sched_push_task);
>  	}
>  }
>
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Vincent Guittot 2 months, 1 week ago
On Mon, 1 Dec 2025 at 14:53, Christian Loehle <christian.loehle@arm.com> wrote:
>
> Some nits below for now
>
> On 12/1/25 09:13, Vincent Guittot wrote:
> > EAS is based on wakeup events to efficiently place tasks on the system, but
> > there are cases where a task doesn't have wakeup events anymore or at a far
> > too low pace. For such cases, we check if it's worht pushing hte task on
>
> worth
> the

+1

>
> > another CPUs instead of putting it back in the enqueued list.
> >
> > Wake up events remain the main way to migrate tasks but we now detect
> > situation where a task is stuck on a CPU by checking that its utilization
> > is larger than the max available compute capacity (max cpu capacity or
> > uclamp max setting)
> >
> > When the system becomes overutilized and some CPUs are idle, we try to
> > push tasks instead of waiting periodic load balance.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> >  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
> >  kernel/sched/topology.c |  3 ++
> >  2 files changed, 68 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9af8d0a61856..e9e1d0c05805 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >  }
> >
> >  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> > +
>
> This doesn't belong here

yes, I don't know what I messed up with my patches

>
> >  /*
> >   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> >   * failing half-way through and resume the dequeue later.
> > @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
> >       return static_branch_unlikely(&sched_push_task);
> >  }
> >
> > +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> > +{
> > +     unsigned long max_capa, util;
> > +
> > +     max_capa = min(get_actual_cpu_capacity(cpu),
> > +                    uclamp_eff_value(p, UCLAMP_MAX));
> > +     util = max(task_util_est(p), task_runnable(p));
> > +
> > +     /*
> > +      * Return true only if the task might not sleep/wakeup because of a low
> > +      * compute capacity. Tasks, which wake up regularly, will be handled by
> > +      * feec().
> > +      */
> > +     return (util > max_capa);
> > +}
> > +
> > +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> > +{
> > +     if (!sched_energy_enabled())
> > +             return false;
> > +
> > +     if (is_rd_overutilized(rq->rd))
> > +             return false;
> > +
> > +     if (task_stuck_on_cpu(p, cpu_of(rq)))
> > +             return true;
> > +
> > +     if (!task_fits_cpu(p, cpu_of(rq)))
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
> > +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
> > +{
> > +     if (rq->nr_running == 1)
> > +             return false;
> > +
> > +     if (!is_rd_overutilized(rq->rd))
> > +             return false;
> > +
> > +     /* If there are idle cpus in the llc then try to push the task on it */
> > +     if (test_idle_cores(cpu_of(rq)))
> > +             return true;
> > +
> > +     return false;
> > +}
> > +
> > +
> >  static bool fair_push_task(struct rq *rq, struct task_struct *p)
> >  {
> > +     if (!task_on_rq_queued(p))
> > +             return false;
> > +
> > +     if (p->se.sched_delayed)
> > +             return false;
> > +
> > +     if (p->nr_cpus_allowed == 1)
> > +             return false;
> > +
> > +     if (sched_energy_push_task(p, rq))
> > +             return true;
> > +
> > +     if (sched_idle_push_task(p, rq))
> > +             return true;
> > +
> >       return false;
> >  }
> >
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index cf643a5ddedd..5edf7b117ed9 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -391,10 +391,13 @@ static void sched_energy_set(bool has_eas)
> >               if (sched_debug())
> >                       pr_info("%s: stopping EAS\n", __func__);
> >               static_branch_disable_cpuslocked(&sched_energy_present);
> > +             static_branch_dec_cpuslocked(&sched_push_task);
> > +     } else if (has_eas && !sched_energy_enabled()) {
> >       } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
>
> This could just be (has_eas && && sched_energy_enabled() && !static_branch_unlikely(&sched_energy_present))
> to avoid the awkward else if above

Argh, I messed up something with this patchset and another pending
cleanup patch when I rebased it.
It should be:

                static_branch_disable_cpuslocked(&sched_energy_present);
+                static_branch_dec_cpuslocked(&sched_push_task);
        } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
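
For clarity, a sketch of how sched_energy_set() would then read with the
stray else-if dropped, reconstructed from the hunk above (the opening
condition is assumed from the mainline function, not shown in the diff):

static void sched_energy_set(bool has_eas)
{
	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: stopping EAS\n", __func__);
		static_branch_disable_cpuslocked(&sched_energy_present);
		static_branch_dec_cpuslocked(&sched_push_task);
	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: starting EAS\n", __func__);
		static_branch_enable_cpuslocked(&sched_energy_present);
		static_branch_inc_cpuslocked(&sched_push_task);
	}
}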

I need to rerun the bench to check that the results of the cover
letter are still correct.

 That's what happens when you want to send a patchset too quickly ...


>
> >               if (sched_debug())
> >                       pr_info("%s: starting EAS\n", __func__);
> >               static_branch_enable_cpuslocked(&sched_energy_present);
> > +             static_branch_inc_cpuslocked(&sched_push_task);
> >       }
> >  }
> >
>
Re: [RFC PATCH 6/6 v7] sched/fair: Add EAS and idle cpu push trigger
Posted by Vincent Guittot 2 months, 1 week ago
On Mon, 1 Dec 2025 at 18:49, Vincent Guittot <vincent.guittot@linaro.org> wrote:
>
> On Mon, 1 Dec 2025 at 14:53, Christian Loehle <christian.loehle@arm.com> wrote:
> >
> > Some nits below for now
> >
> > On 12/1/25 09:13, Vincent Guittot wrote:
> > > EAS is based on wakeup events to efficiently place tasks on the system, but
> > > there are cases where a task doesn't have wakeup events anymore or at a far
> > > too low pace. For such cases, we check if it's worht pushing hte task on
> >
> > worth
> > the
>
> +1
>
> >
> > > another CPUs instead of putting it back in the enqueued list.
> > >
> > > Wake up events remain the main way to migrate tasks but we now detect
> > > situation where a task is stuck on a CPU by checking that its utilization
> > > is larger than the max available compute capacity (max cpu capacity or
> > > uclamp max setting)
> > >
> > > When the system becomes overutilized and some CPUs are idle, we try to
> > > push tasks instead of waiting periodic load balance.
> > >
> > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > ---
> > >  kernel/sched/fair.c     | 65 +++++++++++++++++++++++++++++++++++++++++
> > >  kernel/sched/topology.c |  3 ++
> > >  2 files changed, 68 insertions(+)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 9af8d0a61856..e9e1d0c05805 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -6990,6 +6990,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > >  }
> > >
> > >  static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
> > > +
> >
> > This doesn't belong here
>
> yes, don't know what I mess up with my patches
>
> >
> > >  /*
> > >   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> > >   * failing half-way through and resume the dequeue later.
> > > @@ -8499,8 +8500,72 @@ static inline bool sched_push_task_enabled(void)
> > >       return static_branch_unlikely(&sched_push_task);
> > >  }
> > >
> > > +static inline bool task_stuck_on_cpu(struct task_struct *p, int cpu)
> > > +{
> > > +     unsigned long max_capa, util;
> > > +
> > > +     max_capa = min(get_actual_cpu_capacity(cpu),
> > > +                    uclamp_eff_value(p, UCLAMP_MAX));
> > > +     util = max(task_util_est(p), task_runnable(p));
> > > +
> > > +     /*
> > > +      * Return true only if the task might not sleep/wakeup because of a low
> > > +      * compute capacity. Tasks, which wake up regularly, will be handled by
> > > +      * feec().
> > > +      */
> > > +     return (util > max_capa);
> > > +}
> > > +
> > > +static inline bool sched_energy_push_task(struct task_struct *p, struct rq *rq)
> > > +{
> > > +     if (!sched_energy_enabled())
> > > +             return false;
> > > +
> > > +     if (is_rd_overutilized(rq->rd))
> > > +             return false;
> > > +
> > > +     if (task_stuck_on_cpu(p, cpu_of(rq)))
> > > +             return true;
> > > +
> > > +     if (!task_fits_cpu(p, cpu_of(rq)))
> > > +             return true;
> > > +
> > > +     return false;
> > > +}
> > > +
> > > +static inline bool sched_idle_push_task(struct task_struct *p, struct rq *rq)
> > > +{
> > > +     if (rq->nr_running == 1)
> > > +             return false;
> > > +
> > > +     if (!is_rd_overutilized(rq->rd))
> > > +             return false;
> > > +
> > > +     /* If there are idle cpus in the llc then try to push the task on it */
> > > +     if (test_idle_cores(cpu_of(rq)))
> > > +             return true;
> > > +
> > > +     return false;
> > > +}
> > > +
> > > +
> > >  static bool fair_push_task(struct rq *rq, struct task_struct *p)
> > >  {
> > > +     if (!task_on_rq_queued(p))
> > > +             return false;
> > > +
> > > +     if (p->se.sched_delayed)
> > > +             return false;
> > > +
> > > +     if (p->nr_cpus_allowed == 1)
> > > +             return false;
> > > +
> > > +     if (sched_energy_push_task(p, rq))
> > > +             return true;
> > > +
> > > +     if (sched_idle_push_task(p, rq))
> > > +             return true;
> > > +
> > >       return false;
> > >  }
> > >
> > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > > index cf643a5ddedd..5edf7b117ed9 100644
> > > --- a/kernel/sched/topology.c
> > > +++ b/kernel/sched/topology.c
> > > @@ -391,10 +391,13 @@ static void sched_energy_set(bool has_eas)
> > >               if (sched_debug())
> > >                       pr_info("%s: stopping EAS\n", __func__);
> > >               static_branch_disable_cpuslocked(&sched_energy_present);
> > > +             static_branch_dec_cpuslocked(&sched_push_task);
> > > +     } else if (has_eas && !sched_energy_enabled()) {
> > >       } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
> >
> > This could just be (has_eas && && sched_energy_enabled() && !static_branch_unlikely(&sched_energy_present))
> > to avoid the awkward else if above
>
> Argh, I messed up something with this patchset and another pending
> cleanup patch when I rebased it.
> It should be :
>
>                 static_branch_disable_cpuslocked(&sched_energy_present);
> +                static_branch_dec_cpuslocked(&sched_push_task);
>         } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
>
> I need to rerun the bench to check that the results of the cover
> letter are still correct.

And the results are now the same

Sorry for the noise, I'm going to fix this in a v8

>
>  That's what happens when you want to send a patchset too quickly ...
>
>
> >
> > >               if (sched_debug())
> > >                       pr_info("%s: starting EAS\n", __func__);
> > >               static_branch_enable_cpuslocked(&sched_energy_present);
> > > +             static_branch_inc_cpuslocked(&sched_push_task);
> > >       }
> > >  }
> > >
> >