[PATCH 3/4] sched/fair: Enable EAS with SMT on SD_ASYM_CPUCAPACITY systems

Posted by Andrea Righi 1 week ago
Drop the sched_is_eas_possible() guard that rejects EAS whenever SMT is
active. This allows EAS to be enabled and perf-domain setup to succeed on
SD_ASYM_CPUCAPACITY topologies with SMT enabled.

Moreover, apply the same SMT-aware preference as the non-EAS wakeup path
to find_energy_efficient_cpu(): when SMT is active and there is a
fully-idle core in the relevant domain, prefer max-spare-capacity
candidates on fully-idle cores. Otherwise, fall back to the prior
behavior, which also considers partially-idle SMT siblings.
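
For illustration, the intended selection order can be sketched in plain C
(a self-contained sketch with made-up fields standing in for is_core_idle()
and the spare-capacity accounting; this is not the kernel code in the diff
below):

#include <stddef.h>

/*
 * One entry per candidate CPU. In the real code these values come from
 * the scheduler's idle-core tracking and the spare capacity derived from
 * capacity_of() and cpu_util(); here they are plain fields so the sketch
 * stands on its own.
 */
struct candidate {
	int cpu;
	int core_fully_idle;
	long spare_cap;
};

/*
 * Two-pass preference: when SMT is active and a fully-idle core exists,
 * only fully-idle cores compete in the primary pass; partially-idle
 * siblings are tracked separately and used only if no fully-idle core
 * is usable.
 */
static int pick_cpu(const struct candidate *c, size_t nr, int prefer_idle_cores)
{
	long best = -1, best_fb = -1;
	int best_cpu = -1, best_cpu_fb = -1;

	for (size_t i = 0; i < nr; i++) {
		if (prefer_idle_cores && !c[i].core_fully_idle) {
			/* Partially-idle SMT sibling: fallback only. */
			if (c[i].spare_cap > best_fb) {
				best_fb = c[i].spare_cap;
				best_cpu_fb = c[i].cpu;
			}
			continue;
		}
		if (c[i].spare_cap > best) {
			best = c[i].spare_cap;
			best_cpu = c[i].cpu;
		}
	}
	return best_cpu >= 0 ? best_cpu : best_cpu_fb;
}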

Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Christian Loehle <christian.loehle@arm.com>
Cc: Koba Ko <kobak@nvidia.com>
Reported-by: Felix Abecassis <fabecassis@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/fair.c     | 50 +++++++++++++++++++++++++++++++++++++++--
 kernel/sched/topology.c |  9 --------
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f8deaaa5bfc85..593a89f688679 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8658,13 +8658,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	eenv_task_busy_time(&eenv, p, prev_cpu);
 
 	for (; pd; pd = pd->next) {
-		unsigned long util_min = p_util_min, util_max = p_util_max;
 		unsigned long cpu_cap, cpu_actual_cap, util;
 		long prev_spare_cap = -1, max_spare_cap = -1;
+		long max_spare_cap_fallback = -1;
 		unsigned long rq_util_min, rq_util_max;
 		unsigned long cur_delta, base_energy;
-		int max_spare_cap_cpu = -1;
+		int max_spare_cap_cpu = -1, max_spare_cap_cpu_fallback = -1;
 		int fits, max_fits = -1;
+		int max_fits_fallback = -1;
+		bool prefer_idle_cores;
 
 		if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask))
 			continue;
@@ -8676,6 +8678,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		eenv.cpu_cap = cpu_actual_cap;
 		eenv.pd_cap = 0;
 
+		prefer_idle_cores = sched_smt_active() && test_idle_cores(prev_cpu);
+
 		for_each_cpu(cpu, cpus) {
 			struct rq *rq = cpu_rq(cpu);
 
@@ -8687,6 +8691,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 				continue;
 
+			if (prefer_idle_cores && cpu != prev_cpu && !is_core_idle(cpu))
+				goto fallback;
+
+			unsigned long util_min = p_util_min, util_max = p_util_max;
+
 			util = cpu_util(cpu, p, cpu, 0);
 			cpu_cap = capacity_of(cpu);
 
@@ -8733,6 +8742,43 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				max_spare_cap_cpu = cpu;
 				max_fits = fits;
 			}
+
+fallback:
+			if (!prefer_idle_cores || cpu == prev_cpu || is_core_idle(cpu))
+				continue;
+
+			util_min = p_util_min;
+			util_max = p_util_max;
+			util = cpu_util(cpu, p, cpu, 0);
+			cpu_cap = capacity_of(cpu);
+
+			if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+				rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+				rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+				util_min = max(rq_util_min, p_util_min);
+				util_max = max(rq_util_max, p_util_max);
+			}
+
+			fits = util_fits_cpu(util, util_min, util_max, cpu);
+			if (!fits)
+				continue;
+
+			lsub_positive(&cpu_cap, util);
+
+			if ((fits > max_fits_fallback) ||
+			    ((fits == max_fits_fallback) &&
+			     ((long)cpu_cap > max_spare_cap_fallback))) {
+				max_spare_cap_fallback = cpu_cap;
+				max_spare_cap_cpu_fallback = cpu;
+				max_fits_fallback = fits;
+			}
+		}
+
+		if (max_spare_cap_cpu < 0 && max_spare_cap_cpu_fallback >= 0) {
+			max_spare_cap = max_spare_cap_fallback;
+			max_spare_cap_cpu = max_spare_cap_cpu_fallback;
+			max_fits = max_fits_fallback;
 		}
 
 		if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 061f8c85f5552..cb060fe56aec1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -232,15 +232,6 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
 		return false;
 	}
 
-	/* EAS definitely does *not* handle SMT */
-	if (sched_smt_active()) {
-		if (sched_debug()) {
-			pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
-				cpumask_pr_args(cpu_mask));
-		}
-		return false;
-	}
-
 	if (!arch_scale_freq_invariant()) {
 		if (sched_debug()) {
 			pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
-- 
2.53.0
Re: [PATCH 3/4] sched/fair: Enable EAS with SMT on SD_ASYM_CPUCAPACITY systems
Posted by Vincent Guittot 6 days, 12 hours ago
On Thu, 26 Mar 2026 at 16:12, Andrea Righi <arighi@nvidia.com> wrote:
>
> Drop the sched_is_eas_possible() guard that rejects EAS whenever SMT is
> active. This allows EAS to be enabled and perf-domain setup to succeed on
> SD_ASYM_CPUCAPACITY topologies with SMT enabled.

I don't think that we want to enable EAS with SMT. So keep EAS and SMT
exclusive, at least for now


Re: [PATCH 3/4] sched/fair: Enable EAS with SMT on SD_ASYM_CPUCAPACITY systems
Posted by Andrea Righi 6 days, 10 hours ago
On Fri, Mar 27, 2026 at 09:09:35AM +0100, Vincent Guittot wrote:
> On Thu, 26 Mar 2026 at 16:12, Andrea Righi <arighi@nvidia.com> wrote:
> >
> > Drop the sched_is_eas_possible() guard that rejects EAS whenever SMT is
> > active. This allows EAS to be enabled and perf-domain setup to succeed on
> > SD_ASYM_CPUCAPACITY topologies with SMT enabled.
> 
> I don't think that we want to enable EAS with SMT. So keep EAS and SMT
> exclusive, at least for now

Ack.

Thanks,
-Andrea
