CPUs whose rq only has SCHED_IDLE tasks running are considered to be
equivalent to truly idle CPUs during the wakeup path. For fork and exec
SCHED_IDLE CPUs are even preferred.
This is based on the assumption that the SCHED_IDLE CPU is not in an
idle state and might be in a higher P-state, allowing the task/wakee
to run immediately without sharing the rq.
However this assumption doesn't hold if the wakee has SCHED_IDLE policy
itself, as it will share the rq with existing SCHED_IDLE tasks. In this
case, we are better off continuing to look for a truly idle CPU.
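A SCHED_IDLE wakee here is simply a task whose policy was switched to
SCHED_IDLE from userspace, roughly what `chrt --idle 0` does. A minimal
sketch, assuming the standard glibc sched_setscheduler(2) wrapper
(illustration only, not part of this patch):

/*
 * Illustration only: make the calling task SCHED_IDLE. SCHED_IDLE takes
 * sched_priority 0; children forked afterwards inherit the policy.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 0 };

        if (sched_setscheduler(0, SCHED_IDLE, &sp)) {   /* pid 0 == self */
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy is now %d (SCHED_IDLE == %d)\n",
               sched_getscheduler(0), SCHED_IDLE);
        return 0;
}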
On an Intel Xeon 2-socket system with 64 logical cores in total, this
yields the following for kernel compilation using SCHED_IDLE:
+---------+----------------------+----------------------+--------+
| workers | mainline (seconds) | patch (seconds) | delta% |
+=========+======================+======================+========+
| 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
| 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
| 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
| 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
| 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
| 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
| 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
| 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
+---------+----------------------+----------------------+--------+
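(Here delta% = (patch - mainline) / mainline * 100; e.g. for 1 worker,
(3843.250 - 4384.728) / 4384.728 * 100 ≈ -12.35.)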
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
v2: Reword commit message: SCHED_IDLE CPUs aren't always preferred,
    but rather treated as equivalent
    Factor out choose_sched_idle_rq() too (both suggested by Vincent)
kernel/sched/fair.c | 32 +++++++++++++++++++-------------
1 file changed, 19 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3eaeceda71b0..6510ab6eb44b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
{
- return sched_idle_rq(cpu_rq(cpu));
+ return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
+{
+ return available_idle_cpu(cpu) ||
+ choose_sched_idle_rq(cpu_rq(cpu), p);
}
static void
@@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
- if (sched_idle_cpu(i))
+ if (choose_sched_idle_rq(rq, p))
return i;
if (available_idle_cpu(i)) {
@@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
- sched_cpu_cookie_match(cpu_rq(cpu), p))
+ if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+ cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
- if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
- if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
- busy = !idle && !sched_idle_cpu(cpu);
+ busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
--
2.34.1
On Tue, 3 Feb 2026, Christian Loehle wrote:

> [...]
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>

I’ve been testing this patch on an 80-core Ampere Altra (Neoverse-N1) and
the results look very solid. On these high-core-count ARM systems, we
definitely see the benefit of being pickier about where we place
SCHED_IDLE tasks.

Treating an occupied SCHED_IDLE rq as idle seems to cause unnecessary
packing that shows up in the tail latency. By spreading these background
tasks to truly idle cores, I'm seeing a nice boost in both background
compilation and AI inference throughput.

The reduction in sys time confirms that the domain balancing remains
stable despite the refactor to sched_idle_rq(rq) as you and Prateek
mentioned.

1. Background Kernel Compilation:

I ran `time nice -n 19 make -j$nproc` to see how it handles a heavy
background load. We saved nearly 3 minutes of 'sys' time showing lower
scheduler overhead.

Mainline (6.19.0-rc8):
  real 9m28.403s
  sys  219m21.591s

Patched:
  real 9m16.167s (-12.2s)
  sys  216m28.323s (-2m53s)

I was initially concerned about the impact on domain balancing, but the
significant reduction in 'sys' time during the kernel build confirms that
we aren't seeing any regressive balancing overhead.

2. AI Inference (llama-batched-bench):

For background LLM inference, the patch consistently delivered about 8.7%
more throughput when we're running near core saturation.

51 Threads: 30.03 t/s (vs 27.62 on Mainline) -> +8.7%
80 Threads: 27.20 t/s (vs 25.01 on Mainline) -> +8.7%

3. Scheduler Latency using schbench:

The biggest win was in the p99.9 tail latency. Under a locked workload,
the latency spikes dropped significantly.

4 Threads (Locking): 10085 us (vs 12421 us) -> -18.8%
8 Threads (Locking):  9563 us (vs 11589 us) -> -17.5%

The patch really helps clean up the noise for background tasks on these
large ARM platforms. Nice work.

Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>

Regards,
Shubhang Kaushik
On Thu, 5 Feb 2026 at 01:00, Shubhang Kaushik
<shubhang@os.amperecomputing.com> wrote:
>
> [...]
>
> 1. Background Kernel Compilation:
>
> I ran `time nice -n 19 make -j$nproc` to see how it handles a heavy
> background load. We saved nearly 3 minutes of 'sys' time showing lower
> scheduler overhead.

nice -n 19 uses SCHED_OTHER at nice 19 and not SCHED_IDLE, so I'm curious
how you can see a difference?
Or something is missing in your test description.
Or we have a bug somewhere.
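For reference when reproducing the above, a minimal check of which policy
and nice value a given pid is actually running under (illustration only;
it assumes the standard glibc sched_getscheduler(2) and getpriority(2)
wrappers):

/*
 * Illustration only: tells a SCHED_OTHER-at-nice-19 process apart from a
 * SCHED_IDLE one. Pass a pid, or no argument to query the caller itself.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

int main(int argc, char **argv)
{
        pid_t pid = argc > 1 ? atoi(argv[1]) : 0;       /* 0 == self */
        int policy = sched_getscheduler(pid);
        int niceval = getpriority(PRIO_PROCESS, pid);

        printf("pid %d: policy %s, nice %d\n", (int)pid,
               policy == SCHED_IDLE  ? "SCHED_IDLE"  :
               policy == SCHED_BATCH ? "SCHED_BATCH" :
               policy == SCHED_OTHER ? "SCHED_OTHER" : "other",
               niceval);
        return 0;
}

Running this against one of the compiler processes during the build shows
whether the jobs really inherited SCHED_IDLE (as `chrt --idle 0 make ...`
would give) or stayed SCHED_OTHER at nice 19.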
On Thu, 5 Feb 2026, Vincent Guittot wrote:

> nice -n 19 uses SCHED_OTHER at nice 19 and not SCHED_IDLE, so I'm curious
> how you can see a difference?
> Or something is missing in your test description.
> Or we have a bug somewhere.

Okay, I realized I had used nice -n 19 (SCHED_OTHER) for the initial
build, which wouldn't have directly triggered the SCHED_IDLE logic. But, I
did use chrt for the schbench runs, which is why those p99 wins were so
consistent.

I've re-run the kernel build using the correct chrt --idle 0 policy. On
Ampere Altra, the throughput is along the same lines as mainline.

Metric   Mainline        Patched         Delta
Real     9m 20.120s      9m 18.472s      -1.6s
User     382m 24.966s    380m 41.716s    -1m 43s
Sys      218m 26.192s    218m 44.908s    +18.7s
On 2/5/26 18:52, Shubhang Kaushik wrote:
> I've re-run the kernel build using the correct chrt --idle 0 policy. On
> Ampere Altra, the throughput is along the same lines as mainline.
>
> Metric   Mainline        Patched         Delta
> Real     9m 20.120s      9m 18.472s      -1.6s
> User     382m 24.966s    380m 41.716s    -1m 43s
> Sys      218m 26.192s    218m 44.908s    +18.7s

Thanks for testing Shubhang, although I find it a bit surprising that your
kernel compilation under SCHED_IDLE doesn't improve.
Are you running with CONFIG_SCHED_CLUSTER=y? I'll try to reproduce.
Anyway, at least you see a schbench improvement, so I'm assuming I'll keep
your Tested-by?
On Fri, 6 Feb 2026, Christian Loehle wrote:

> Thanks for testing Shubhang, although I find it a bit surprising that your
> kernel compilation under SCHED_IDLE doesn't improve.
> Are you running with CONFIG_SCHED_CLUSTER=y? I'll try to reproduce.
> Anyway, at least you see a schbench improvement, so I'm assuming I'll keep
> your Tested-by?

Yes, that's right, CONFIG_SCHED_CLUSTER=y is enabled. That likely explains
why the build throughput isn't shifting as much as your Xeon results,
though the drop in the user time still suggests better efficiency.

Feel free to keep the Tested-by tag.

Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
On Tue, 3 Feb 2026 at 19:49, Christian Loehle <christian.loehle@arm.com> wrote:
>
> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
> equivalent to truly idle CPUs during wakeup path. For fork and exec
> SCHED_IDLE is even preferred.
> This is based on the assumption that the SCHED_IDLE CPU is not in an
> idle state and might be in a higher P-state, allowing the task/wakee
> to run immediately without sharing the rq.
>
> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
> case, we are better off continuing to look for a truly idle CPU.
>
> On a Intel Xeon 2-socket with 64 logical cores in total this yields
> for kernel compilation using SCHED_IDLE:
>
> +---------+----------------------+----------------------+--------+
> | workers | mainline (seconds) | patch (seconds) | delta% |
> +=========+======================+======================+========+
> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
> +---------+----------------------+----------------------+--------+
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> v2: Reword commit message, SCHED_IDLE aren't always preferred,
> but rather equivalent
> Factor out choose_sched_idle_rq() too (Both Vincent)
>
> kernel/sched/fair.c | 32 +++++++++++++++++++-------------
> 1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3eaeceda71b0..6510ab6eb44b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
> rq->nr_running);
> }
>
> -static int sched_idle_cpu(int cpu)
> +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
> {
> - return sched_idle_rq(cpu_rq(cpu));
> + return sched_idle_rq(rq) && !task_has_idle_policy(p);
> +}
> +
> +static int choose_idle_cpu(int cpu, struct task_struct *p)
> +{
> + return available_idle_cpu(cpu) ||
> + choose_sched_idle_rq(cpu_rq(cpu), p);
> }
>
> static void
> @@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
> if (!sched_core_cookie_match(rq, p))
> continue;
>
> - if (sched_idle_cpu(i))
> + if (choose_sched_idle_rq(rq, p))
> return i;
>
> if (available_idle_cpu(i)) {
> @@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>
> static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> {
> - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
> - sched_cpu_cookie_match(cpu_rq(cpu), p))
> + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
> return cpu;
>
> return -1;
> @@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
> if (!available_idle_cpu(cpu)) {
> idle = false;
> if (*idle_cpu == -1) {
> - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
> + if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
> + cpumask_test_cpu(cpu, cpus)) {
> *idle_cpu = cpu;
> break;
> }
> @@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
> */
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> + if (choose_idle_cpu(cpu, p))
> return cpu;
> }
>
> @@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> for_each_cpu_wrap(cpu, cpus, target) {
> unsigned long cpu_cap = capacity_of(cpu);
>
> - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> + if (!choose_idle_cpu(cpu, p))
> continue;
>
> fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> @@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> */
> lockdep_assert_irqs_disabled();
>
> - if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> + if (choose_idle_cpu(target, p) &&
> asym_fits_cpu(task_util, util_min, util_max, target))
> return target;
>
> @@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> * If the previous CPU is cache affine and idle, don't be stupid:
> */
> if (prev != target && cpus_share_cache(prev, target) &&
> - (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> + choose_idle_cpu(prev, p) &&
> asym_fits_cpu(task_util, util_min, util_max, prev)) {
>
> if (!static_branch_unlikely(&sched_cluster_active) ||
> @@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> if (recent_used_cpu != prev &&
> recent_used_cpu != target &&
> cpus_share_cache(recent_used_cpu, target) &&
> - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> + choose_idle_cpu(recent_used_cpu, p) &&
> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>
> @@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> {
> int continue_balancing = 1;
> int cpu = rq->cpu;
> - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
> + int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
> unsigned long interval;
> struct sched_domain *sd;
> /* Earliest time when we have to do rebalance again */
> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> * state even if we migrated tasks. Update it.
> */
> idle = idle_cpu(cpu);
> - busy = !idle && !sched_idle_cpu(cpu);
> + busy = !idle && !sched_idle_rq(rq);
> }
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> --
> 2.34.1
>
Hello Chris,
On 2/4/2026 12:19 AM, Christian Loehle wrote:
> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
> equivalent to truly idle CPUs during wakeup path. For fork and exec
> SCHED_IDLE is even preferred.
> This is based on the assumption that the SCHED_IDLE CPU is not in an
> idle state and might be in a higher P-state, allowing the task/wakee
> to run immediately without sharing the rq.
>
> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
> case, we are better off continuing to look for a truly idle CPU.
>
> On a Intel Xeon 2-socket with 64 logical cores in total this yields
> for kernel compilation using SCHED_IDLE:
>
> +---------+----------------------+----------------------+--------+
> | workers | mainline (seconds) | patch (seconds) | delta% |
> +=========+======================+======================+========+
> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
> +---------+----------------------+----------------------+--------+
I couldn't spot much difference for kernel compilation on my
3rd Generation EPYC system, likely due to the smaller LLC size. For
sched-messaging, I found the following interesting trend when
running with SCHED_IDLE:
(Normalized runtime [Var%]; %diff - higher the better)
tip/sched:core +patch (%diff)
1-group 1.00 [5.00%] 0.88 [10.78%] 11.80%
2-group 1.00 [5.15%] 0.93 [26.06%] 6.99%
4-group 1.00 [5.48%] 0.89 [11.03%] 11.13%
8-group 1.00 [6.62%] 1.21 [12.37%] -21.30%
16-group 1.00 [9.46%] 1.28 [ 9.42%] -27.59%
There is a good improvement at lower utilization. Once the
system is trending towards overutilized but the SIS_UTIL cut-off
is still non-zero, we search a little bit longer for a fully
idle CPU even though the probability of finding one is actually
low.
I suppose the scenario where we only have SCHED_IDLE tasks that
care about throughput on a busy system is rare enough that this
won't actually be noticed, but it was worth pointing out.
Feel free to include:
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
--
Thanks and Regards,
Prateek
On 2/4/26 07:48, K Prateek Nayak wrote:
> There is a good improvement at lower utilization. Once the
> system is trending towards overutilized but the SIS_UTIL cut-off
> is still non-zero, we search a little bit longer for a fully
> idle CPU even though the probability of finding one is actually
> low.
>
> I suppose the scenario where we only have SCHED_IDLE tasks that
> care about throughput on a busy system is rare enough that this
> won't actually be noticed, but it was worth pointing out.

If we're unlikely to find a good candidate then doing anything on wakeup
is kind of a waste of time, especially for sched messaging. So I guess
without $PATCH it will basically always bail out when looking at the
first few CPUs because it sees SCHED_IDLE sched messaging :)

> Feel free to include:
>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

Thanks for testing!
Hi Christian Loehle,
On Tue, 3 Feb 2026, Christian Loehle wrote:
> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
> equivalent to truly idle CPUs during wakeup path. For fork and exec
> SCHED_IDLE is even preferred.
> This is based on the assumption that the SCHED_IDLE CPU is not in an
> idle state and might be in a higher P-state, allowing the task/wakee
> to run immediately without sharing the rq.
>
> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
> case, we are better off continuing to look for a truly idle CPU.
>
> On a Intel Xeon 2-socket with 64 logical cores in total this yields
> for kernel compilation using SCHED_IDLE:
>
> +---------+----------------------+----------------------+--------+
> | workers | mainline (seconds) | patch (seconds) | delta% |
> +=========+======================+======================+========+
> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
> +---------+----------------------+----------------------+--------+
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
> ---
> v2: Reword commit message, SCHED_IDLE aren't always preferred,
> but rather equivalent
> Factor out choose_sched_idle_rq() too (Both Vincent)
>
> kernel/sched/fair.c | 32 +++++++++++++++++++-------------
> 1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3eaeceda71b0..6510ab6eb44b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
> rq->nr_running);
> }
>
> -static int sched_idle_cpu(int cpu)
> +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
> {
> - return sched_idle_rq(cpu_rq(cpu));
> + return sched_idle_rq(rq) && !task_has_idle_policy(p);
> +}
> +
> +static int choose_idle_cpu(int cpu, struct task_struct *p)
> +{
> + return available_idle_cpu(cpu) ||
> + choose_sched_idle_rq(cpu_rq(cpu), p);
> }
>
> static void
> @@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
> if (!sched_core_cookie_match(rq, p))
> continue;
>
> - if (sched_idle_cpu(i))
> + if (choose_sched_idle_rq(rq, p))
> return i;
>
> if (available_idle_cpu(i)) {
> @@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>
> static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> {
> - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
> - sched_cpu_cookie_match(cpu_rq(cpu), p))
> + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
> return cpu;
>
> return -1;
> @@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
> if (!available_idle_cpu(cpu)) {
> idle = false;
> if (*idle_cpu == -1) {
> - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
> + if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
> + cpumask_test_cpu(cpu, cpus)) {
> *idle_cpu = cpu;
> break;
> }
> @@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
> */
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> + if (choose_idle_cpu(cpu, p))
> return cpu;
> }
>
> @@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> for_each_cpu_wrap(cpu, cpus, target) {
> unsigned long cpu_cap = capacity_of(cpu);
>
> - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> + if (!choose_idle_cpu(cpu, p))
> continue;
>
> fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> @@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> */
> lockdep_assert_irqs_disabled();
>
> - if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> + if (choose_idle_cpu(target, p) &&
> asym_fits_cpu(task_util, util_min, util_max, target))
> return target;
>
> @@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> * If the previous CPU is cache affine and idle, don't be stupid:
> */
> if (prev != target && cpus_share_cache(prev, target) &&
> - (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> + choose_idle_cpu(prev, p) &&
> asym_fits_cpu(task_util, util_min, util_max, prev)) {
>
> if (!static_branch_unlikely(&sched_cluster_active) ||
> @@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> if (recent_used_cpu != prev &&
> recent_used_cpu != target &&
> cpus_share_cache(recent_used_cpu, target) &&
> - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> + choose_idle_cpu(recent_used_cpu, p) &&
> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>
> @@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> {
> int continue_balancing = 1;
> int cpu = rq->cpu;
> - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
> + int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
> unsigned long interval;
> struct sched_domain *sd;
> /* Earliest time when we have to do rebalance again */
> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> * state even if we migrated tasks. Update it.
> */
> idle = idle_cpu(cpu);
> - busy = !idle && !sched_idle_cpu(cpu);
> + busy = !idle && !sched_idle_rq(rq);
Usually sched_idle rqs were treated as not busy in several balancing
decisions to avoid yielding to background load. Does this change alter
that interpretation at the domain balancing level?
> }
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> --
> 2.34.1
>
>
Regards,
Shubhang Kaushik
On 2/4/26 02:08, Shubhang Kaushik wrote:
> Hi Christian Loehle,
>
> On Tue, 3 Feb 2026, Christian Loehle wrote:
>
>> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
>> equivalent to truly idle CPUs during wakeup path. For fork and exec
>> SCHED_IDLE is even preferred.
>> This is based on the assumption that the SCHED_IDLE CPU is not in an
>> idle state and might be in a higher P-state, allowing the task/wakee
>> to run immediately without sharing the rq.
>>
>> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
>> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
>> case, we are better off continuing to look for a truly idle CPU.
>>
>> On a Intel Xeon 2-socket with 64 logical cores in total this yields
>> for kernel compilation using SCHED_IDLE:
>>
>> +---------+----------------------+----------------------+--------+
>> | workers | mainline (seconds) | patch (seconds) | delta% |
>> +=========+======================+======================+========+
>> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
>> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
>> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
>> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
>> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
>> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
>> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
>> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
>> +---------+----------------------+----------------------+--------+
>>
>> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
>> ---
>> v2: Reword commit message, SCHED_IDLE aren't always preferred,
>> but rather equivalent
>> Factor out choose_sched_idle_rq() too (Both Vincent)
>>
>> kernel/sched/fair.c | 32 +++++++++++++++++++-------------
>> 1 file changed, 19 insertions(+), 13 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 3eaeceda71b0..6510ab6eb44b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
>> rq->nr_running);
>> }
>>
>> -static int sched_idle_cpu(int cpu)
>> +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
>> {
>> - return sched_idle_rq(cpu_rq(cpu));
>> + return sched_idle_rq(rq) && !task_has_idle_policy(p);
>> +}
>> +
>> +static int choose_idle_cpu(int cpu, struct task_struct *p)
>> +{
>> + return available_idle_cpu(cpu) ||
>> + choose_sched_idle_rq(cpu_rq(cpu), p);
>> }
>>
>> static void
>> @@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>> if (!sched_core_cookie_match(rq, p))
>> continue;
>>
>> - if (sched_idle_cpu(i))
>> + if (choose_sched_idle_rq(rq, p))
>> return i;
>>
>> if (available_idle_cpu(i)) {
>> @@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>>
>> static inline int __select_idle_cpu(int cpu, struct task_struct *p)
>> {
>> - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
>> - sched_cpu_cookie_match(cpu_rq(cpu), p))
>> + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
>> return cpu;
>>
>> return -1;
>> @@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
>> if (!available_idle_cpu(cpu)) {
>> idle = false;
>> if (*idle_cpu == -1) {
>> - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
>> + if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
>> + cpumask_test_cpu(cpu, cpus)) {
>> *idle_cpu = cpu;
>> break;
>> }
>> @@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
>> */
>> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>> continue;
>> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
>> + if (choose_idle_cpu(cpu, p))
>> return cpu;
>> }
>>
>> @@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>> for_each_cpu_wrap(cpu, cpus, target) {
>> unsigned long cpu_cap = capacity_of(cpu);
>>
>> - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
>> + if (!choose_idle_cpu(cpu, p))
>> continue;
>>
>> fits = util_fits_cpu(task_util, util_min, util_max, cpu);
>> @@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>> */
>> lockdep_assert_irqs_disabled();
>>
>> - if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>> + if (choose_idle_cpu(target, p) &&
>> asym_fits_cpu(task_util, util_min, util_max, target))
>> return target;
>>
>> @@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>> * If the previous CPU is cache affine and idle, don't be stupid:
>> */
>> if (prev != target && cpus_share_cache(prev, target) &&
>> - (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
>> + choose_idle_cpu(prev, p) &&
>> asym_fits_cpu(task_util, util_min, util_max, prev)) {
>>
>> if (!static_branch_unlikely(&sched_cluster_active) ||
>> @@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>> if (recent_used_cpu != prev &&
>> recent_used_cpu != target &&
>> cpus_share_cache(recent_used_cpu, target) &&
>> - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
>> + choose_idle_cpu(recent_used_cpu, p) &&
>> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
>> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>>
>> @@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>> {
>> int continue_balancing = 1;
>> int cpu = rq->cpu;
>> - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
>> + int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
>> unsigned long interval;
>> struct sched_domain *sd;
>> /* Earliest time when we have to do rebalance again */
>> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>> * state even if we migrated tasks. Update it.
>> */
>> idle = idle_cpu(cpu);
>> - busy = !idle && !sched_idle_cpu(cpu);
>> + busy = !idle && !sched_idle_rq(rq);
>
> Usually sched_idle_rqs were treated as not-busy in several balancing decisions to avoid yielding to background load. Does this change alter that interpretation at the domain balancing level ?
>
Like Prateek already mentioned, the load-balancing didn't change.
(I did s/sched_idle_cpu(cpu)/sched_idle_rq(rq)/ though because
sched_idle_cpu didn't have many callers left and removed it
entirely).
Hello Shubhang,

On 2/4/2026 7:38 AM, Shubhang Kaushik wrote:
>> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>> * state even if we migrated tasks. Update it.
>> */
>> idle = idle_cpu(cpu);
>> - busy = !idle && !sched_idle_cpu(cpu);
>> + busy = !idle && !sched_idle_rq(rq);
>
> Usually sched_idle rqs were treated as not busy in several balancing
> decisions to avoid yielding to background load. Does this change alter
> that interpretation at the domain balancing level?

I don't think anything changes in the load-balancer path since we still
check for sched_idle_rq() here. Only the wakeup path will consider both
the waking task's policy and whether the CPU is completely idle vs
sched_idle_rq().

--
Thanks and Regards,
Prateek