[PATCH 1/7 v5] sched/fair: Filter false overloaded_group case for EAS

Posted by Vincent Guittot 11 months, 2 weeks ago
With EAS, a group should be flagged overloaded only if at least one CPU
in the group is overutilized. However, it can happen that a CPU is
fully utilized by its tasks simply because uclamp clamps the compute
capacity of the CPU. In such a case, the CPU is not overutilized and,
as a result, its group should not be flagged overloaded either.
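
As a standalone sketch of that scenario (the helper below and its name
are hypothetical; the kernel's actual check lives in cpu_overutilized()
and util_fits_cpu() in kernel/sched/fair.c, which also applies headroom
margins this sketch ignores):

	/* Minimal illustration, not kernel code. */
	#include <stdbool.h>

	static unsigned long umin(unsigned long a, unsigned long b)
	{
		return a < b ? a : b;
	}

	/*
	 * With a uclamp_max clamp in place, the tasks can consume all
	 * the capacity the clamp lets them use (the CPU looks fully
	 * utilized) while the clamped utilization still fits the CPU's
	 * capacity, so the CPU is not overutilized.
	 */
	static bool fully_utilized_but_not_overutilized(unsigned long util,
							unsigned long capacity,
							unsigned long uclamp_max)
	{
		bool fully_utilized = util >= umin(capacity, uclamp_max);
		bool overutilized = umin(util, uclamp_max) > capacity;

		return fully_utilized && !overutilized;
	}

	/* e.g. util = 512, capacity = 1024, uclamp_max = 512 -> true */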

Since group_overloaded has a higher priority than group_misfit, such a
group can be selected as the busiest group instead of a group with a
misfit task, which prevents load_balance() from selecting the CPU with
the misfit task and pulling the latter onto a fitting CPU.
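
For context on that priority: the group types in kernel/sched/fair.c
are declared in rising priority order, so update_sd_pick_busiest()
prefers group_overloaded over group_misfit_task. Abbreviated (comments
shortened) from the kernel around this series' base:

	enum group_type {
		group_has_spare = 0,	/* spare capacity left for more tasks */
		group_fully_busy,	/* fully used, tasks don't compete for cycles */
		group_misfit_task,	/* a task needs a more capable CPU */
		group_smt_balance,	/* busy SMT group while an idle core exists */
		group_asym_packing,	/* a preferred CPU is available (SD_ASYM_PACKING) */
		group_imbalanced,	/* affinity prevented earlier load balancing */
		group_overloaded	/* can't give all tasks their expected cycles */
	};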

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
---
 kernel/sched/fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 857808da23d8..d3d1a2ba6b1a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9931,6 +9931,7 @@ struct sg_lb_stats {
 	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
 	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
+	unsigned int group_overutilized;	/* At least one CPU is overutilized in the group */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -10163,6 +10164,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
+	/*
+	 * With EAS and uclamp, 1 CPU in the group must be overutilized to
+	 * consider the group overloaded.
+	 */
+	if (sched_energy_enabled() && !sgs->group_overutilized)
+		return false;
+
 	if (sgs->sum_nr_running <= sgs->group_weight)
 		return false;
 
@@ -10374,8 +10382,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
-		if (cpu_overutilized(i))
+		if (cpu_overutilized(i)) {
 			*sg_overutilized = 1;
+			sgs->group_overutilized = 1;
+		}
 
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
-- 
2.43.0
Re: [PATCH 1/7 v5] sched/fair: Filter false overloaded_group case for EAS
Posted by K Prateek Nayak 11 months, 1 week ago
Hello Vincent,

On 3/3/2025 2:35 AM, Vincent Guittot wrote:
> With EAS, a group should be flagged overloaded only if at least one CPU
> in the group is overutilized. However, it can happen that a CPU is
> fully utilized by its tasks simply because uclamp clamps the compute
> capacity of the CPU. In such a case, the CPU is not overutilized and,
> as a result, its group should not be flagged overloaded either.
> 
> Since group_overloaded has a higher priority than group_misfit, such a
> group can be selected as the busiest group instead of a group with a
> misfit task, which prevents load_balance() from selecting the CPU with
> the misfit task and pulling the latter onto a fitting CPU.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> Tested-by: Pierre Gondois <pierre.gondois@arm.com>
> ---
>   kernel/sched/fair.c | 12 +++++++++++-
>   1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 857808da23d8..d3d1a2ba6b1a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9931,6 +9931,7 @@ struct sg_lb_stats {
>   	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
>   	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
>   	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
> +	unsigned int group_overutilized;	/* At least one CPU is overutilized in the group */
>   #ifdef CONFIG_NUMA_BALANCING
>   	unsigned int nr_numa_running;
>   	unsigned int nr_preferred_running;
> @@ -10163,6 +10164,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>   static inline bool
>   group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>   {
> +	/*
> +	 * With EAS and uclamp, 1 CPU in the group must be overutilized to
> +	 * consider the group overloaded.
> +	 */
> +	if (sched_energy_enabled() && !sgs->group_overutilized)
> +		return false;
> +
>   	if (sgs->sum_nr_running <= sgs->group_weight)
>   		return false;
>   
> @@ -10374,8 +10382,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>   		nr_running = rq->nr_running;
>   		sgs->sum_nr_running += nr_running;
>   
> -		if (cpu_overutilized(i))
> +		if (cpu_overutilized(i)) {
>   			*sg_overutilized = 1;

Since sgs->group_overutilized is tracking the overutilized status, can
we avoid passing the "sg_overutilized" pointer to update_sg_lb_stats()
and just use sgs->group_overutilized in update_sd_lb_stats()?

Something like below:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 857808da23d8..de4a7e19d383 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10346,14 +10346,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
   * @group: sched_group whose statistics are to be updated.
   * @sgs: variable to hold the statistics for this group.
   * @sg_overloaded: sched_group is overloaded
- * @sg_overutilized: sched_group is overutilized
   */
  static inline void update_sg_lb_stats(struct lb_env *env,
  				      struct sd_lb_stats *sds,
  				      struct sched_group *group,
  				      struct sg_lb_stats *sgs,
-				      bool *sg_overloaded,
-				      bool *sg_overutilized)
+				      bool *sg_overloaded)
  {
  	int i, nr_running, local_group, sd_flags = env->sd->flags;
  	bool balancing_at_rd = !env->sd->parent;
@@ -10375,7 +10373,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
  		sgs->sum_nr_running += nr_running;
  
  		if (cpu_overutilized(i))
-			*sg_overutilized = 1;
+			sgs->group_overutilized = 1;
  
  		/*
  		 * No need to call idle_cpu() if nr_running is not 0
@@ -11046,7 +11044,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
  				update_group_capacity(env->sd, env->dst_cpu);
  		}
  
-		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
+		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
  
  		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
  			sds->busiest = sg;
@@ -11056,6 +11054,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
  		/* Now, start updating sd_lb_stats */
  		sds->total_load += sgs->group_load;
  		sds->total_capacity += sgs->group_capacity;
+		sg_overutilized |= sgs->group_overutilized;
  
  		sum_util += sgs->group_util;
  		sg = sg->next;
-- 
Thanks and Regards,
Prateek

> +			sgs->group_overutilized = 1;
> +		}
>   
>   		/*
>   		 * No need to call idle_cpu() if nr_running is not 0
Re: [PATCH 1/7 v5] sched/fair: Filter false overloaded_group case for EAS
Posted by Vincent Guittot 11 months, 1 week ago
Hi Prateek,

On Tue, 4 Mar 2025 at 05:38, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>
> Hello Vincent,
>
> On 3/3/2025 2:35 AM, Vincent Guittot wrote:
> > With EAS, a group should be flagged overloaded only if at least one CPU
> > in the group is overutilized. However, it can happen that a CPU is
> > fully utilized by its tasks simply because uclamp clamps the compute
> > capacity of the CPU. In such a case, the CPU is not overutilized and,
> > as a result, its group should not be flagged overloaded either.
> >
> > Since group_overloaded has a higher priority than group_misfit, such a
> > group can be selected as the busiest group instead of a group with a
> > misfit task, which prevents load_balance() from selecting the CPU with
> > the misfit task and pulling the latter onto a fitting CPU.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > Tested-by: Pierre Gondois <pierre.gondois@arm.com>
> > ---
> >   kernel/sched/fair.c | 12 +++++++++++-
> >   1 file changed, 11 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 857808da23d8..d3d1a2ba6b1a 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9931,6 +9931,7 @@ struct sg_lb_stats {
> >       unsigned int group_asym_packing;        /* Tasks should be moved to preferred CPU */
> >       unsigned int group_smt_balance;         /* Task on busy SMT be moved */
> >       unsigned long group_misfit_task_load;   /* A CPU has a task too big for its capacity */
> > +     unsigned int group_overutilized;        /* At least one CPU is overutilized in the group */
> >   #ifdef CONFIG_NUMA_BALANCING
> >       unsigned int nr_numa_running;
> >       unsigned int nr_preferred_running;
> > @@ -10163,6 +10164,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> >   static inline bool
> >   group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> >   {
> > +     /*
> > +      * With EAS and uclamp, 1 CPU in the group must be overutilized to
> > +      * consider the group overloaded.
> > +      */
> > +     if (sched_energy_enabled() && !sgs->group_overutilized)
> > +             return false;
> > +
> >       if (sgs->sum_nr_running <= sgs->group_weight)
> >               return false;
> >
> > @@ -10374,8 +10382,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> >               nr_running = rq->nr_running;
> >               sgs->sum_nr_running += nr_running;
> >
> > -             if (cpu_overutilized(i))
> > +             if (cpu_overutilized(i)) {
> >                       *sg_overutilized = 1;
>
> Since sgs->group_overutilized is tracking the overutilized status, can
> we avoid passing the "sg_overutilized" pointer to update_sg_lb_stats()
> and just use sgs->group_overutilized in update_sd_lb_stats()?

yes, makes sense

>
> Something like below:
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 857808da23d8..de4a7e19d383 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10346,14 +10346,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>    * @group: sched_group whose statistics are to be updated.
>    * @sgs: variable to hold the statistics for this group.
>    * @sg_overloaded: sched_group is overloaded
> - * @sg_overutilized: sched_group is overutilized
>    */
>   static inline void update_sg_lb_stats(struct lb_env *env,
>                                       struct sd_lb_stats *sds,
>                                       struct sched_group *group,
>                                       struct sg_lb_stats *sgs,
> -                                     bool *sg_overloaded,
> -                                     bool *sg_overutilized)
> +                                     bool *sg_overloaded)
>   {
>         int i, nr_running, local_group, sd_flags = env->sd->flags;
>         bool balancing_at_rd = !env->sd->parent;
> @@ -10375,7 +10373,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>                 sgs->sum_nr_running += nr_running;
>
>                 if (cpu_overutilized(i))
> -                       *sg_overutilized = 1;
> +                       sgs->group_overutilized = 1;
>
>                 /*
>                  * No need to call idle_cpu() if nr_running is not 0
> @@ -11046,7 +11044,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>                                 update_group_capacity(env->sd, env->dst_cpu);
>                 }
>
> -               update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
> +               update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
>
>                 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
>                         sds->busiest = sg;
> @@ -11056,6 +11054,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>                 /* Now, start updating sd_lb_stats */
>                 sds->total_load += sgs->group_load;
>                 sds->total_capacity += sgs->group_capacity;
> +               sg_overutilized |= sgs->group_overutilized;
>
>                 sum_util += sgs->group_util;
>                 sg = sg->next;
> --
> Thanks and Regards,
> Prateek
>
> > +                     sgs->group_overutilized = 1;
> > +             }
> >
> >               /*
> >                * No need to call idle_cpu() if nr_running is not 0
>