[PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection

Andrea Righi posted 4 patches 1 week ago
[PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Andrea Righi 1 week ago
On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting
different per-core frequencies), the wakeup path uses
select_idle_capacity() and prioritizes idle CPUs with higher capacity
for better task placement. However, when those CPUs belong to SMT cores,
their effective capacity can be much lower than the nominal capacity
when the sibling thread is busy: SMT siblings compete for shared
resources, so a "high capacity" CPU that is idle but whose sibling is
busy does not deliver its full capacity. This effective capacity
reduction cannot be modeled by the static capacity value alone.

Introduce SMT awareness in the asym-capacity idle selection policy: when
SMT is active prefer fully-idle SMT cores over partially-idle ones. A
two-phase selection first tries only CPUs on fully idle cores, then
falls back to any idle CPU if none fit.

Prioritizing fully-idle SMT cores yields better task placement because
the effective capacity of partially-idle SMT cores is reduced; always
preferring them when available leads to more accurate capacity usage on
task wakeup.

On an SMT system with asymmetric CPU capacities, SMT-aware idle
selection has been shown to improve throughput by around 15-18% for
CPU-bound workloads, running a number of tasks equal to the number of
SMT cores.

Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Christian Loehle <christian.loehle@arm.com>
Cc: Koba Ko <kobak@nvidia.com>
Reported-by: Felix Abecassis <fabecassis@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/fair.c | 86 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 75 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d57c02e82f3a1..9a95628669851 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7940,14 +7940,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
  * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
  * the task fits. If no CPU is big enough, but there are idle ones, try to
  * maximize capacity.
+ *
+ * When @prefer_idle_cores is true (asym + SMT and idle cores exist), prefer
+ * CPUs on fully-idle cores over partially-idle ones in a single pass: track
+ * the best candidate among idle-core CPUs and the best among any idle CPU,
+ * then return the idle-core candidate if found, else the best any-idle.
  */
 static int
-select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target,
+		     bool prefer_idle_cores)
 {
-	unsigned long task_util, util_min, util_max, best_cap = 0;
-	int fits, best_fits = 0;
-	int cpu, best_cpu = -1;
+	unsigned long task_util, util_min, util_max, best_cap = 0, best_cap_core = 0;
+	int fits, best_fits = 0, best_fits_core = 0;
+	int cpu, best_cpu = -1, best_cpu_core = -1;
 	struct cpumask *cpus;
+	bool on_idle_core;
 
 	cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
@@ -7962,16 +7969,58 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		if (!choose_idle_cpu(cpu, p))
 			continue;
 
+		on_idle_core = is_core_idle(cpu);
+		if (prefer_idle_cores && !on_idle_core) {
+			/* Track best among any idle CPU for fallback */
+			fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+			if (fits > 0) {
+				/*
+				 * Full fit: strictly better than fits 0 / -1;
+				 * among several, prefer higher capacity.
+				 */
+				if (best_cpu < 0 || best_fits <= 0 ||
+				    (best_fits > 0 && cpu_cap > best_cap)) {
+					best_cap = cpu_cap;
+					best_cpu = cpu;
+					best_fits = fits;
+				}
+				continue;
+			}
+			if (best_fits > 0)
+				continue;
+			if (fits < 0)
+				cpu_cap = get_actual_cpu_capacity(cpu);
+			if ((fits < best_fits) ||
+			    ((fits == best_fits) && (cpu_cap > best_cap))) {
+				best_cap = cpu_cap;
+				best_cpu = cpu;
+				best_fits = fits;
+			}
+			continue;
+		}
+
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
 
 		/* This CPU fits with all requirements */
-		if (fits > 0)
-			return cpu;
+		if (fits > 0) {
+			if (prefer_idle_cores && on_idle_core)
+				return cpu;
+			if (!prefer_idle_cores)
+				return cpu;
+			/*
+			 * Prefer idle cores: record and keep looking for
+			 * idle-core fit.
+			 */
+			best_cap = cpu_cap;
+			best_cpu = cpu;
+			best_fits = fits;
+			continue;
+		}
 		/*
 		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
 		 * Look for the CPU with best capacity.
 		 */
-		else if (fits < 0)
+		if (fits < 0)
 			cpu_cap = get_actual_cpu_capacity(cpu);
 
 		/*
@@ -7984,8 +8033,17 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 			best_cpu = cpu;
 			best_fits = fits;
 		}
+		if (prefer_idle_cores && on_idle_core &&
+		    ((fits < best_fits_core) ||
+		     ((fits == best_fits_core) && (cpu_cap > best_cap_core)))) {
+			best_cap_core = cpu_cap;
+			best_cpu_core = cpu;
+			best_fits_core = fits;
+		}
 	}
 
+	if (prefer_idle_cores && best_cpu_core >= 0)
+		return best_cpu_core;
 	return best_cpu;
 }
 
@@ -7994,12 +8052,17 @@ static inline bool asym_fits_cpu(unsigned long util,
 				 unsigned long util_max,
 				 int cpu)
 {
-	if (sched_asym_cpucap_active())
+	if (sched_asym_cpucap_active()) {
 		/*
 		 * Return true only if the cpu fully fits the task requirements
 		 * which include the utilization and the performance hints.
+		 *
+		 * When SMT is active, also require that the core has no busy
+		 * siblings.
 		 */
-		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+		return (!sched_smt_active() || is_core_idle(cpu)) &&
+		       (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+	}
 
 	return true;
 }
@@ -8097,8 +8160,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		 * capacity path.
 		 */
 		if (sd) {
-			i = select_idle_capacity(p, sd, target);
-			return ((unsigned)i < nr_cpumask_bits) ? i : target;
+			i = select_idle_capacity(p, sd, target,
+				sched_smt_active() && test_idle_cores(target));
+			return ((unsigned int)i < nr_cpumask_bits) ? i : target;
 		}
 	}
 
-- 
2.53.0
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by K Prateek Nayak 6 days, 9 hours ago
Hello Andrea,

On 3/26/2026 8:32 PM, Andrea Righi wrote:
>  		/* This CPU fits with all requirements */
> -		if (fits > 0)
> -			return cpu;
> +		if (fits > 0) {
> +			if (prefer_idle_cores && on_idle_core)
> +				return cpu;
> +			if (!prefer_idle_cores)
> +				return cpu;

nit.

Can the above two be rewritten as:

    if (!prefer_idle_cores || on_idle_core)
        return cpu; 

since they are equivalent.

-- 
Thanks and Regards,
Prateek
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Andrea Righi 6 days, 9 hours ago
Hi Prateek,

On Fri, Mar 27, 2026 at 04:14:57PM +0530, K Prateek Nayak wrote:
> Hello Andrea,
> 
> On 3/26/2026 8:32 PM, Andrea Righi wrote:
> >  		/* This CPU fits with all requirements */
> > -		if (fits > 0)
> > -			return cpu;
> > +		if (fits > 0) {
> > +			if (prefer_idle_cores && on_idle_core)
> > +				return cpu;
> > +			if (!prefer_idle_cores)
> > +				return cpu;
> 
> nit.
> 
> Can the above two be re-wittern as:
> 
>     if (!prefer_idle_cores || on_idle_core)
>         return cpu; 
> 
> since they are equivalent.

Oh yes, indeed.

Thanks,
-Andrea
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by K Prateek Nayak 6 days, 9 hours ago
Hello Andrea,

On 3/27/2026 4:28 PM, Andrea Righi wrote:
> On Fri, Mar 27, 2026 at 04:14:57PM +0530, K Prateek Nayak wrote:
>> Hello Andrea,
>>
>> On 3/26/2026 8:32 PM, Andrea Righi wrote:
>>>             /* This CPU fits with all requirements */
>>> -           if (fits > 0)
>>> -                   return cpu;
>>> +           if (fits > 0) {
>>> +                   if (prefer_idle_cores && on_idle_core)
>>> +                           return cpu;
>>> +                   if (!prefer_idle_cores)
>>> +                           return cpu;
>>
>> nit.
>>
>> Can the above two be re-wittern as:
>>
>>     if (!prefer_idle_cores || on_idle_core)
>>         return cpu;
>>
>> since they are equivalent.
> 
> Oh yes, indeed.

Also, can we just rewrite this Patch as:

  (Includes feedback from Vincent; Only build tested)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 700d0f145ca6..cffd5649b54e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7946,6 +7946,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
+	bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
 	unsigned long task_util, util_min, util_max, best_cap = 0;
 	int fits, best_fits = 0;
 	int cpu, best_cpu = -1;
@@ -7959,6 +7960,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
 	for_each_cpu_wrap(cpu, cpus, target) {
+		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
 		unsigned long cpu_cap = capacity_of(cpu);
 
 		if (!choose_idle_cpu(cpu, p))
@@ -7967,7 +7969,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
 
 		/* This CPU fits with all requirements */
-		if (fits > 0)
+		if (fits > 0 && preferred_core)
 			return cpu;
 		/*
 		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
@@ -7976,6 +7978,14 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		else if (fits < 0)
 			cpu_cap = get_actual_cpu_capacity(cpu);
 
+		/*
+		 * If we are on a preferred core, translate the range of fits
+		 * from [-1, 1] to [-4, -2]. This ensures that an idle core
+		 * is always given priority over a (partially) busy core.
+		 */
+		if (preferred_core)
+			fits -= 3;
+
 		/*
 		 * First, select CPU which fits better (-1 being better than 0).
 		 * Then, select the one with best capacity at same level.
---

My naive eyes say it should be equivalent of what you have but maybe
I'm wrong?

-- 
Thanks and Regards,
Prateek
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Andrea Righi 6 days, 3 hours ago
Hi Prateek,

On Fri, Mar 27, 2026 at 04:44:01PM +0530, K Prateek Nayak wrote:
> Hello Andrea,
> 
> On 3/27/2026 4:28 PM, Andrea Righi wrote:
> > On Fri, Mar 27, 2026 at 04:14:57PM +0530, K Prateek Nayak wrote:
> >> Hello Andrea,
> >>
> >> On 3/26/2026 8:32 PM, Andrea Righi wrote:
> >>>             /* This CPU fits with all requirements */
> >>> -           if (fits > 0)
> >>> -                   return cpu;
> >>> +           if (fits > 0) {
> >>> +                   if (prefer_idle_cores && on_idle_core)
> >>> +                           return cpu;
> >>> +                   if (!prefer_idle_cores)
> >>> +                           return cpu;
> >>
> >> nit.
> >>
> >> Can the above two be re-wittern as:
> >>
> >>     if (!prefer_idle_cores || on_idle_core)
> >>         return cpu;
> >>
> >> since they are equivalent.
> > 
> > Oh yes, indeed.
> 
> Also, can we just rewrite this Patch as:
> 
>   (Includes feedback from Vincent; Only build tested)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 700d0f145ca6..cffd5649b54e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7946,6 +7946,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  static int
>  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  {
> +	bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
>  	unsigned long task_util, util_min, util_max, best_cap = 0;
>  	int fits, best_fits = 0;
>  	int cpu, best_cpu = -1;
> @@ -7959,6 +7960,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  	util_max = uclamp_eff_value(p, UCLAMP_MAX);
>  
>  	for_each_cpu_wrap(cpu, cpus, target) {
> +		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
>  		unsigned long cpu_cap = capacity_of(cpu);
>  
>  		if (!choose_idle_cpu(cpu, p))
> @@ -7967,7 +7969,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
>  
>  		/* This CPU fits with all requirements */
> -		if (fits > 0)
> +		if (fits > 0 && preferred_core)
>  			return cpu;
>  		/*
>  		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> @@ -7976,6 +7978,14 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  		else if (fits < 0)
>  			cpu_cap = get_actual_cpu_capacity(cpu);
>  
> +		/*
> +		 * If we are on an preferred core, translate the range of fits
> +		 * from [-1, 1] to [-4, -2]. This ensures that an idle core
> +		 * is always given priority over (paritally) busy core.
> +		 */
> +		if (preferred_core)
> +			fits -= 3;
> +

Ah, I like this trick. Yes, this definitely makes the patch more compact.

>  		/*
>  		 * First, select CPU which fits better (-1 being better than 0).
>  		 * Then, select the one with best capacity at same level.
> ---
> 
> My naive eyes say it should be equivalent of what you have but maybe
> I'm wrong?

It seems correct to my naive eyes as well. Will test this out to make sure.

Unfortunately I just lost access to my system (bummer), I found another
Vera machine, but this one has a version of the firmware that exposes all
CPUs with the same highest_perf... so I can still do some testing, but not
the same one with SD_ASYM_CPUCAPACITY + SMT. I should get access to the
previous system with the different highest_perf values on Monday.

Thanks,
-Andrea
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by K Prateek Nayak 3 days, 10 hours ago
Hello Andrea,

On 3/27/2026 10:09 PM, Andrea Righi wrote:
>> My naive eyes say it should be equivalent of what you have but maybe
>> I'm wrong?
> 
> It seems correct to my naive eyes as well. Will test this out to make sure.

So I found one small problem with fits > 0 && !preferred_core where even
though it is an ideal target, we don't end up preferring it because of
the larger "fits" value.

Here is an updated diff:

  (Only build tested)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 226509231e67..580218656865 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7949,6 +7949,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
+	bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
 	unsigned long task_util, util_min, util_max, best_cap = 0;
 	int fits, best_fits = 0;
 	int cpu, best_cpu = -1;
@@ -7962,6 +7963,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
 	for_each_cpu_wrap(cpu, cpus, target) {
+		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
 		unsigned long cpu_cap = capacity_of(cpu);
 
 		if (!choose_idle_cpu(cpu, p))
@@ -7970,7 +7972,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
 
 		/* This CPU fits with all requirements */
-		if (fits > 0)
+		if (fits > 0 && preferred_core)
 			return cpu;
 		/*
 		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
@@ -7978,9 +7980,30 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		 */
 		else if (fits < 0)
 			cpu_cap = get_actual_cpu_capacity(cpu);
+		/*
+		 * fits > 0 implies we are not on a preferred core
+		 * but the util fits CPU capacity. Set fits to -2 so
+		 * the effective range becomes [-2, 0] where:
+		 *    0 - does not fit
+		 *   -1 - fits with the exception of UCLAMP_MIN
+		 *   -2 - fits with the exception of preferred_core
+		 */
+		else if (fits > 0)
+			fits = -2;
+
+		/*
+		 * If we are on a preferred core, translate the range of fits
+		 * of [-1, 0] to [-4, -3]. This ensures that an idle core
+		 * is always given priority over a (partially) busy core.
+		 *
+		 * A fully fitting idle core would have returned early and hence
+		 * fits > 0 for preferred_core need not be dealt with.
+		 */
+		if (preferred_core)
+			fits -= 3;
 
 		/*
-		 * First, select CPU which fits better (-1 being better than 0).
+		 * First, select CPU which fits better (lower is more preferred).
 		 * Then, select the one with best capacity at same level.
 		 */
 		if ((fits < best_fits) ||
---

Sorry for the oversight but this should now be equivalent to your
Patch 1. I'll let Vincent comment if he prefers this to the original
or not :-)

-- 
Thanks and Regards,
Prateek
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Andrea Righi 3 days, 7 hours ago
Hi Prateek,

On Mon, Mar 30, 2026 at 03:47:07PM +0530, K Prateek Nayak wrote:
> Hello Andrea,
> 
> On 3/27/2026 10:09 PM, Andrea Righi wrote:
> >> My naive eyes say it should be equivalent of what you have but maybe
> >> I'm wrong?
> > 
> > It seems correct to my naive eyes as well. Will test this out to make sure.
> 
> So I found one small problem with fits > 0 && !preferred_core where even
> though it is an ideal target, we don't end up preferring it because of
> the larger "fits" value.
> 
> Here is an updated diff:
> 
>   (Only build tested)

I'm getting worse performance with this one (but better than mainline).
I'm trying to understand why.

BTW, we also need to fix asym_fits_cpu() to do something like this:

	return (!sched_smt_active() || is_core_idle(cpu)) &&
	       (util_fits_cpu(util, util_min, util_max, cpu) > 0);

...or we'd return early from select_idle_sibling() with busy SMT cores.

Thanks,
-Andrea

> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 226509231e67..580218656865 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7949,6 +7949,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  static int
>  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  {
> +	bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
>  	unsigned long task_util, util_min, util_max, best_cap = 0;
>  	int fits, best_fits = 0;
>  	int cpu, best_cpu = -1;
> @@ -7962,6 +7963,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  	util_max = uclamp_eff_value(p, UCLAMP_MAX);
>  
>  	for_each_cpu_wrap(cpu, cpus, target) {
> +		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
>  		unsigned long cpu_cap = capacity_of(cpu);
>  
>  		if (!choose_idle_cpu(cpu, p))
> @@ -7970,7 +7972,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
>  
>  		/* This CPU fits with all requirements */
> -		if (fits > 0)
> +		if (fits > 0 && preferred_core)
>  			return cpu;
>  		/*
>  		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> @@ -7978,9 +7980,30 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  		 */
>  		else if (fits < 0)
>  			cpu_cap = get_actual_cpu_capacity(cpu);
> +		/*
> +		 * fits > 0 implies we are not on a preferred core
> +		 * but the util fits CPU capacity. Set fits to -2 so
> +		 * the effective range becomes [-2, 0] where:
> +		 *    0 - does not fit
> +		 *   -1 - fits with the exception of UCLAMP_MIN
> +		 *   -2 - fits with the exception of preferred_core
> +		 */
> +		else if (fits > 0)
> +			fits = -2;
> +
> +		/*
> +		 * If we are on an preferred core, translate the range of fits
> +		 * of [-1, 0] to [-4, -3]. This ensures that an idle core
> +		 * is always given priority over (partially) busy core.
> +		 *
> +		 * A fully fitting idle core would have returned early and hence
> +		 * fits > 0 for preferred_core need not be dealt with.
> +		 */
> +		if (preferred_core)
> +			fits -= 3;
>  
>  		/*
> -		 * First, select CPU which fits better (-1 being better than 0).
> +		 * First, select CPU which fits better (lower is more preferred).
>  		 * Then, select the one with best capacity at same level.
>  		 */
>  		if ((fits < best_fits) ||
> ---
> 
> Sorry for the oversight but this should now be equivalent to your
> Patch 1. I'll let Vincent comment if he prefers this to the original
> or not :-)
> 
> -- 
> Thanks and Regards,
> Prateek
>
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Andrea Righi 3 days, 6 hours ago
On Mon, Mar 30, 2026 at 03:22:27PM +0200, Andrea Righi wrote:
> Hi Prateek,
> 
> On Mon, Mar 30, 2026 at 03:47:07PM +0530, K Prateek Nayak wrote:
> > Hello Andrea,
> > 
> > On 3/27/2026 10:09 PM, Andrea Righi wrote:
> > >> My naive eyes say it should be equivalent of what you have but maybe
> > >> I'm wrong?
> > > 
> > > It seems correct to my naive eyes as well. Will test this out to make sure.
> > 
> > So I found one small problem with fits > 0 && !preferred_core where even
> > though it is an ideal target, we don't end up preferring it because of
> > the larger "fits" value.
> > 
> > Here is an updated diff:
> > 
> >   (Only build tested)
> 
> I'm getting worse performance with this one (but better than mainline).
> I'm trying to understand why.

Nevermind...

> 
> BTW, we also need to fix asym_fits_cpu() to do something like this:
> 
> 	return (!sched_smt_active() || is_core_idle(cpu)) &&
> 	       (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> 
> ...or we'd return early from select_idle_sibling() with busy SMT cores.

...I was actually missing this piece right here. So, everything looks good
with this extra change applied.

I'll repeat all my tests just in case and will send a new version with your
changes.

Thanks!
-Andrea
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Vincent Guittot 3 days, 7 hours ago
On Mon, 30 Mar 2026 at 12:17, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>
> Hello Andrea,
>
> On 3/27/2026 10:09 PM, Andrea Righi wrote:
> >> My naive eyes say it should be equivalent of what you have but maybe
> >> I'm wrong?
> >
> > It seems correct to my naive eyes as well. Will test this out to make sure.
>
> So I found one small problem with fits > 0 && !preferred_core where even
> though it is an ideal target, we don't end up preferring it because of
> the larger "fits" value.
>
> Here is an updated diff:
>
>   (Only build tested)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 226509231e67..580218656865 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7949,6 +7949,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  static int
>  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  {
> +       bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
>         unsigned long task_util, util_min, util_max, best_cap = 0;
>         int fits, best_fits = 0;
>         int cpu, best_cpu = -1;
> @@ -7962,6 +7963,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>         util_max = uclamp_eff_value(p, UCLAMP_MAX);
>
>         for_each_cpu_wrap(cpu, cpus, target) {
> +               bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
>                 unsigned long cpu_cap = capacity_of(cpu);
>
>                 if (!choose_idle_cpu(cpu, p))
> @@ -7970,7 +7972,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>                 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
>
>                 /* This CPU fits with all requirements */
> -               if (fits > 0)
> +               if (fits > 0 && preferred_core)
>                         return cpu;
>                 /*
>                  * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> @@ -7978,9 +7980,30 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>                  */
>                 else if (fits < 0)
>                         cpu_cap = get_actual_cpu_capacity(cpu);
> +               /*
> +                * fits > 0 implies we are not on a preferred core
> +                * but the util fits CPU capacity. Set fits to -2 so
> +                * the effective range becomes [-2, 0] where:
> +                *    0 - does not fit
> +                *   -1 - fits with the exception of UCLAMP_MIN
> +                *   -2 - fits with the exception of preferred_core
> +                */
> +               else if (fits > 0)
> +                       fits = -2;
> +
> +               /*
> +                * If we are on an preferred core, translate the range of fits
> +                * of [-1, 0] to [-4, -3]. This ensures that an idle core
> +                * is always given priority over (partially) busy core.
> +                *
> +                * A fully fitting idle core would have returned early and hence
> +                * fits > 0 for preferred_core need not be dealt with.
> +                */
> +               if (preferred_core)
> +                       fits -= 3;
>
>                 /*
> -                * First, select CPU which fits better (-1 being better than 0).
> +                * First, select CPU which fits better (lower is more preferred).
>                  * Then, select the one with best capacity at same level.
>                  */
>                 if ((fits < best_fits) ||
> ---
>
> Sorry for the oversight but this should now be equivalent to your
> Patch 1. I'll let Vincent comment if he prefers this to the original
> or not :-)

Yes, I prefer this version which keeps the same logic for selecting the best cpu

Thanks
Vincent

>
> --
> Thanks and Regards,
> Prateek
>
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Vincent Guittot 6 days, 12 hours ago
On Thu, 26 Mar 2026 at 16:12, Andrea Righi <arighi@nvidia.com> wrote:
>
> On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting
> different per-core frequencies), the wakeup path uses
> select_idle_capacity() and prioritizes idle CPUs with higher capacity
> for better task placement. However, when those CPUs belong to SMT cores,
> their effective capacity can be much lower than the nominal capacity
> when the sibling thread is busy: SMT siblings compete for shared
> resources, so a "high capacity" CPU that is idle but whose sibling is
> busy does not deliver its full capacity. This effective capacity
> reduction cannot be modeled by the static capacity value alone.
>
> Introduce SMT awareness in the asym-capacity idle selection policy: when
> SMT is active prefer fully-idle SMT cores over partially-idle ones. A
> two-phase selection first tries only CPUs on fully idle cores, then
> falls back to any idle CPU if none fit.
>
> Prioritizing fully-idle SMT cores yields better task placement because
> the effective capacity of partially-idle SMT cores is reduced; always
> preferring them when available leads to more accurate capacity usage on
> task wakeup.
>
> On an SMT system with asymmetric CPU capacities, SMT-aware idle
> selection has been shown to improve throughput by around 15-18% for
> CPU-bound workloads, running an amount of tasks equal to the amount of
> SMT cores.
>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Cc: Christian Loehle <christian.loehle@arm.com>
> Cc: Koba Ko <kobak@nvidia.com>
> Reported-by: Felix Abecassis <fabecassis@nvidia.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  kernel/sched/fair.c | 86 +++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 75 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d57c02e82f3a1..9a95628669851 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7940,14 +7940,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>   * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
>   * the task fits. If no CPU is big enough, but there are idle ones, try to
>   * maximize capacity.
> + *
> + * When @prefer_idle_cores is true (asym + SMT and idle cores exist), prefer
> + * CPUs on fully-idle cores over partially-idle ones in a single pass: track
> + * the best candidate among idle-core CPUs and the best among any idle CPU,
> + * then return the idle-core candidate if found, else the best any-idle.
>   */
>  static int
> -select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target,
> +                    bool prefer_idle_cores)
>  {
> -       unsigned long task_util, util_min, util_max, best_cap = 0;
> -       int fits, best_fits = 0;
> -       int cpu, best_cpu = -1;
> +       unsigned long task_util, util_min, util_max, best_cap = 0, best_cap_core = 0;
> +       int fits, best_fits = 0, best_fits_core = 0;
> +       int cpu, best_cpu = -1, best_cpu_core = -1;
>         struct cpumask *cpus;
> +       bool on_idle_core;
>
>         cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
>         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> @@ -7962,16 +7969,58 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>                 if (!choose_idle_cpu(cpu, p))
>                         continue;
>
> +               on_idle_core = is_core_idle(cpu);
> +               if (prefer_idle_cores && !on_idle_core) {
> +                       /* Track best among any idle CPU for fallback */
> +                       fits = util_fits_cpu(task_util, util_min, util_max, cpu);

fits = util_fits_cpu(task_util, util_min, util_max, cpu); is always
called so call it once above this if condition

this will help factorize the selection of best_cpu and best_cpu_core

> +                       if (fits > 0) {
> +                               /*
> +                                * Full fit: strictly better than fits 0 / -1;
> +                                * among several, prefer higher capacity.
> +                                */
> +                               if (best_cpu < 0 || best_fits <= 0 ||
> +                                   (best_fits > 0 && cpu_cap > best_cap)) {
> +                                       best_cap = cpu_cap;
> +                                       best_cpu = cpu;
> +                                       best_fits = fits;
> +                               }
> +                               continue;
> +                       }
> +                       if (best_fits > 0)
> +                               continue;
> +                       if (fits < 0)
> +                               cpu_cap = get_actual_cpu_capacity(cpu);
> +                       if ((fits < best_fits) ||
> +                           ((fits == best_fits) && (cpu_cap > best_cap))) {
> +                               best_cap = cpu_cap;
> +                               best_cpu = cpu;
> +                               best_fits = fits;
> +                       }
> +                       continue;
> +               }
> +
>                 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
>
>                 /* This CPU fits with all requirements */
> -               if (fits > 0)
> -                       return cpu;
> +               if (fits > 0) {
> +                       if (prefer_idle_cores && on_idle_core)
> +                               return cpu;
> +                       if (!prefer_idle_cores)
> +                               return cpu;
> +                       /*
> +                        * Prefer idle cores: record and keep looking for
> +                        * idle-core fit.
> +                        */
> +                       best_cap = cpu_cap;
> +                       best_cpu = cpu;
> +                       best_fits = fits;
> +                       continue;
> +               }
>                 /*
>                  * Only the min performance hint (i.e. uclamp_min) doesn't fit.
>                  * Look for the CPU with best capacity.
>                  */
> -               else if (fits < 0)
> +               if (fits < 0)
>                         cpu_cap = get_actual_cpu_capacity(cpu);
>
>                 /*
> @@ -7984,8 +8033,17 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>                         best_cpu = cpu;
>                         best_fits = fits;
>                 }
> +               if (prefer_idle_cores && on_idle_core &&
> +                   ((fits < best_fits_core) ||
> +                    ((fits == best_fits_core) && (cpu_cap > best_cap_core)))) {
> +                       best_cap_core = cpu_cap;
> +                       best_cpu_core = cpu;
> +                       best_fits_core = fits;
> +               }
>         }
>
> +       if (prefer_idle_cores && best_cpu_core >= 0)
> +               return best_cpu_core;
>         return best_cpu;
>  }
>
> @@ -7994,12 +8052,17 @@ static inline bool asym_fits_cpu(unsigned long util,
>                                  unsigned long util_max,
>                                  int cpu)
>  {
> -       if (sched_asym_cpucap_active())
> +       if (sched_asym_cpucap_active()) {
>                 /*
>                  * Return true only if the cpu fully fits the task requirements
>                  * which include the utilization and the performance hints.
> +                *
> +                * When SMT is active, also require that the core has no busy
> +                * siblings.
>                  */
> -               return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> +               return (!sched_smt_active() || is_core_idle(cpu)) &&
> +                      (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> +       }
>
>         return true;
>  }
> @@ -8097,8 +8160,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>                  * capacity path.
>                  */
>                 if (sd) {
> -                       i = select_idle_capacity(p, sd, target);
> -                       return ((unsigned)i < nr_cpumask_bits) ? i : target;
> +                       i = select_idle_capacity(p, sd, target,
> +                               sched_smt_active() && test_idle_cores(target));

Move "sched_smt_active() && test_idle_cores(target)" inside
select_idle_capacity(); I don't see the benefit of making it a
parameter.
Alternatively, use has_idle_core for the parameter, like other SMT-related functions do.


> +                       return ((unsigned int)i < nr_cpumask_bits) ? i : target;
>                 }
>         }
>
> --
> 2.53.0
>
Re: [PATCH 1/4] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
Posted by Andrea Righi 6 days, 10 hours ago
Hi Vincent,

On Fri, Mar 27, 2026 at 09:09:24AM +0100, Vincent Guittot wrote:
> On Thu, 26 Mar 2026 at 16:12, Andrea Righi <arighi@nvidia.com> wrote:
> >
> > On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting
> > different per-core frequencies), the wakeup path uses
> > select_idle_capacity() and prioritizes idle CPUs with higher capacity
> > for better task placement. However, when those CPUs belong to SMT cores,
> > their effective capacity can be much lower than the nominal capacity
> > when the sibling thread is busy: SMT siblings compete for shared
> > resources, so a "high capacity" CPU that is idle but whose sibling is
> > busy does not deliver its full capacity. This effective capacity
> > reduction cannot be modeled by the static capacity value alone.
> >
> > Introduce SMT awareness in the asym-capacity idle selection policy: when
> > SMT is active prefer fully-idle SMT cores over partially-idle ones. A
> > two-phase selection first tries only CPUs on fully idle cores, then
> > falls back to any idle CPU if none fit.
> >
> > Prioritizing fully-idle SMT cores yields better task placement because
> > the effective capacity of partially-idle SMT cores is reduced; always
> > preferring them when available leads to more accurate capacity usage on
> > task wakeup.
> >
> > On an SMT system with asymmetric CPU capacities, SMT-aware idle
> > selection has been shown to improve throughput by around 15-18% for
> > CPU-bound workloads, running a number of tasks equal to the number of
> > SMT cores.
> >
> > Cc: Vincent Guittot <vincent.guittot@linaro.org>
> > Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> > Cc: Christian Loehle <christian.loehle@arm.com>
> > Cc: Koba Ko <kobak@nvidia.com>
> > Reported-by: Felix Abecassis <fabecassis@nvidia.com>
> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> > ---
> >  kernel/sched/fair.c | 86 +++++++++++++++++++++++++++++++++++++++------
> >  1 file changed, 75 insertions(+), 11 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d57c02e82f3a1..9a95628669851 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7940,14 +7940,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> >   * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
> >   * the task fits. If no CPU is big enough, but there are idle ones, try to
> >   * maximize capacity.
> > + *
> > + * When @prefer_idle_cores is true (asym + SMT and idle cores exist), prefer
> > + * CPUs on fully-idle cores over partially-idle ones in a single pass: track
> > + * the best candidate among idle-core CPUs and the best among any idle CPU,
> > + * then return the idle-core candidate if found, else the best any-idle.
> >   */
> >  static int
> > -select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target,
> > +                    bool prefer_idle_cores)
> >  {
> > -       unsigned long task_util, util_min, util_max, best_cap = 0;
> > -       int fits, best_fits = 0;
> > -       int cpu, best_cpu = -1;
> > +       unsigned long task_util, util_min, util_max, best_cap = 0, best_cap_core = 0;
> > +       int fits, best_fits = 0, best_fits_core = 0;
> > +       int cpu, best_cpu = -1, best_cpu_core = -1;
> >         struct cpumask *cpus;
> > +       bool on_idle_core;
> >
> >         cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
> >         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > @@ -7962,16 +7969,58 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >                 if (!choose_idle_cpu(cpu, p))
> >                         continue;
> >
> > +               on_idle_core = is_core_idle(cpu);
> > +               if (prefer_idle_cores && !on_idle_core) {
> > +                       /* Track best among any idle CPU for fallback */
> > +                       fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> 
> fits = util_fits_cpu(task_util, util_min, util_max, cpu); is always
> called, so call it once above this if condition.
> 
> This will help factorize the selection of best_cpu and best_cpu_core.

Makes sense.

> 
> > +                       if (fits > 0) {
> > +                               /*
> > +                                * Full fit: strictly better than fits 0 / -1;
> > +                                * among several, prefer higher capacity.
> > +                                */
> > +                               if (best_cpu < 0 || best_fits <= 0 ||
> > +                                   (best_fits > 0 && cpu_cap > best_cap)) {
> > +                                       best_cap = cpu_cap;
> > +                                       best_cpu = cpu;
> > +                                       best_fits = fits;
> > +                               }
> > +                               continue;
> > +                       }
> > +                       if (best_fits > 0)
> > +                               continue;
> > +                       if (fits < 0)
> > +                               cpu_cap = get_actual_cpu_capacity(cpu);
> > +                       if ((fits < best_fits) ||
> > +                           ((fits == best_fits) && (cpu_cap > best_cap))) {
> > +                               best_cap = cpu_cap;
> > +                               best_cpu = cpu;
> > +                               best_fits = fits;
> > +                       }
> > +                       continue;
> > +               }
> > +
> >                 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> >
> >                 /* This CPU fits with all requirements */
> > -               if (fits > 0)
> > -                       return cpu;
> > +               if (fits > 0) {
> > +                       if (prefer_idle_cores && on_idle_core)
> > +                               return cpu;
> > +                       if (!prefer_idle_cores)
> > +                               return cpu;
> > +                       /*
> > +                        * Prefer idle cores: record and keep looking for
> > +                        * idle-core fit.
> > +                        */
> > +                       best_cap = cpu_cap;
> > +                       best_cpu = cpu;
> > +                       best_fits = fits;
> > +                       continue;
> > +               }
> >                 /*
> >                  * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> >                  * Look for the CPU with best capacity.
> >                  */
> > -               else if (fits < 0)
> > +               if (fits < 0)
> >                         cpu_cap = get_actual_cpu_capacity(cpu);
> >
> >                 /*
> > @@ -7984,8 +8033,17 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >                         best_cpu = cpu;
> >                         best_fits = fits;
> >                 }
> > +               if (prefer_idle_cores && on_idle_core &&
> > +                   ((fits < best_fits_core) ||
> > +                    ((fits == best_fits_core) && (cpu_cap > best_cap_core)))) {
> > +                       best_cap_core = cpu_cap;
> > +                       best_cpu_core = cpu;
> > +                       best_fits_core = fits;
> > +               }
> >         }
> >
> > +       if (prefer_idle_cores && best_cpu_core >= 0)
> > +               return best_cpu_core;
> >         return best_cpu;
> >  }
> >
> > @@ -7994,12 +8052,17 @@ static inline bool asym_fits_cpu(unsigned long util,
> >                                  unsigned long util_max,
> >                                  int cpu)
> >  {
> > -       if (sched_asym_cpucap_active())
> > +       if (sched_asym_cpucap_active()) {
> >                 /*
> >                  * Return true only if the cpu fully fits the task requirements
> >                  * which include the utilization and the performance hints.
> > +                *
> > +                * When SMT is active, also require that the core has no busy
> > +                * siblings.
> >                  */
> > -               return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > +               return (!sched_smt_active() || is_core_idle(cpu)) &&
> > +                      (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > +       }
> >
> >         return true;
> >  }
> > @@ -8097,8 +8160,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> >                  * capacity path.
> >                  */
> >                 if (sd) {
> > -                       i = select_idle_capacity(p, sd, target);
> > -                       return ((unsigned)i < nr_cpumask_bits) ? i : target;
> > +                       i = select_idle_capacity(p, sd, target,
> > +                               sched_smt_active() && test_idle_cores(target));
> 
> Move "sched_smt_active() && test_idle_cores(target)" inside
> select_idle_capacity(); I don't see the benefit of making it a
> parameter.
> Alternatively, use has_idle_core for the parameter, like other SMT-related functions do.

And also makes sense.

> 
> 
> > +                       return ((unsigned int)i < nr_cpumask_bits) ? i : target;
> >                 }
> >         }
> >
> > --
> > 2.53.0
> >

Thanks,
-Andrea