CPUs whose rq only has SCHED_IDLE tasks running are considered to be
equivalent to truly idle CPUs during the wakeup path. For fork and exec
SCHED_IDLE CPUs are even preferred.
This is based on the assumption that the SCHED_IDLE CPU is not in an
idle state and might be in a higher P-state, allowing the task/wakee
to run immediately without sharing the rq.
However this assumption doesn't hold if the wakee has SCHED_IDLE policy
itself, as it will share the rq with existing SCHED_IDLE tasks. In this
case, we are better off continuing to look for a truly idle CPU.
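A SCHED_IDLE wakee here is simply a task whose policy was switched to
SCHED_IDLE from userspace, roughly what `chrt --idle 0` does. A minimal
sketch, assuming the standard glibc sched_setscheduler(2) wrapper
(illustration only, not part of this patch):

/*
 * Illustration only: make the calling task SCHED_IDLE. SCHED_IDLE takes
 * sched_priority 0; children forked afterwards inherit the policy.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 0 };

        if (sched_setscheduler(0, SCHED_IDLE, &sp)) {   /* pid 0 == self */
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy is now %d (SCHED_IDLE == %d)\n",
               sched_getscheduler(0), SCHED_IDLE);
        return 0;
}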
On an Intel Xeon 2-socket system with 64 logical cores in total, this
yields the following for kernel compilation using SCHED_IDLE:
+---------+----------------------+----------------------+--------+
| workers | mainline (seconds) | patch (seconds) | delta% |
+=========+======================+======================+========+
| 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
| 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
| 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
| 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
| 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
| 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
| 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
| 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
+---------+----------------------+----------------------+--------+
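(Here delta% = (patch - mainline) / mainline * 100; e.g. for 1 worker,
(3843.250 - 4384.728) / 4384.728 * 100 ≈ -12.35.)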
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
v2: Reword commit message: SCHED_IDLE CPUs aren't always preferred,
    but rather treated as equivalent
    Factor out choose_sched_idle_rq() too (both suggested by Vincent)
kernel/sched/fair.c | 32 +++++++++++++++++++-------------
1 file changed, 19 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3eaeceda71b0..6510ab6eb44b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
{
- return sched_idle_rq(cpu_rq(cpu));
+ return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
+{
+ return available_idle_cpu(cpu) ||
+ choose_sched_idle_rq(cpu_rq(cpu), p);
}
static void
@@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
- if (sched_idle_cpu(i))
+ if (choose_sched_idle_rq(rq, p))
return i;
if (available_idle_cpu(i)) {
@@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
- sched_cpu_cookie_match(cpu_rq(cpu), p))
+ if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+ cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
- if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
- if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
- busy = !idle && !sched_idle_cpu(cpu);
+ busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
--
2.34.1
On Tue, 3 Feb 2026, Christian Loehle wrote:

> [...]
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>

I’ve been testing this patch on an 80-core Ampere Altra (Neoverse-N1) and
the results look very solid. On these high-core-count ARM systems, we
definitely see the benefit of being pickier about where we place
SCHED_IDLE tasks.

Treating an occupied SCHED_IDLE rq as idle seems to cause unnecessary
packing that shows up in the tail latency. By spreading these background
tasks to truly idle cores, I'm seeing a nice boost in both background
compilation and AI inference throughput.

The reduction in sys time confirms that the domain balancing remains
stable despite the refactor to sched_idle_rq(rq) as you and Prateek
mentioned.

1. Background Kernel Compilation:

I ran `time nice -n 19 make -j$nproc` to see how it handles a heavy
background load. We saved nearly 3 minutes of 'sys' time showing lower
scheduler overhead.

Mainline (6.19.0-rc8):
  real 9m28.403s
  sys  219m21.591s

Patched:
  real 9m16.167s (-12.2s)
  sys  216m28.323s (-2m53s)

I was initially concerned about the impact on domain balancing, but the
significant reduction in 'sys' time during the kernel build confirms that
we aren't seeing any regressive balancing overhead.

2. AI Inference (llama-batched-bench):

For background LLM inference, the patch consistently delivered about 8.7%
more throughput when we're running near core saturation.

51 Threads: 30.03 t/s (vs 27.62 on Mainline) -> +8.7%
80 Threads: 27.20 t/s (vs 25.01 on Mainline) -> +8.7%

3. Scheduler Latency using schbench:

The biggest win was in the p99.9 tail latency. Under a locked workload,
the latency spikes dropped significantly.

4 Threads (Locking): 10085 us (vs 12421 us) -> -18.8%
8 Threads (Locking):  9563 us (vs 11589 us) -> -17.5%

The patch really helps clean up the noise for background tasks on these
large ARM platforms. Nice work.

Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>

Regards,
Shubhang Kaushik
On Thu, 5 Feb 2026 at 01:00, Shubhang Kaushik
<shubhang@os.amperecomputing.com> wrote:
>
> [...]
>
> 1. Background Kernel Compilation:
>
> I ran `time nice -n 19 make -j$nproc` to see how it handles a heavy
> background load. We saved nearly 3 minutes of 'sys' time showing lower
> scheduler overhead.

nice -n 19 uses SCHED_OTHER at nice 19 and not SCHED_IDLE, so I'm curious
how you can see a difference?
Or something is missing in your test description.
Or we have a bug somewhere.
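For reference when reproducing the above, a minimal check of which policy
and nice value a given pid is actually running under (illustration only;
it assumes the standard glibc sched_getscheduler(2) and getpriority(2)
wrappers):

/*
 * Illustration only: tells a SCHED_OTHER-at-nice-19 process apart from a
 * SCHED_IDLE one. Pass a pid, or no argument to query the caller itself.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

int main(int argc, char **argv)
{
        pid_t pid = argc > 1 ? atoi(argv[1]) : 0;       /* 0 == self */
        int policy = sched_getscheduler(pid);
        int niceval = getpriority(PRIO_PROCESS, pid);

        printf("pid %d: policy %s, nice %d\n", (int)pid,
               policy == SCHED_IDLE  ? "SCHED_IDLE"  :
               policy == SCHED_BATCH ? "SCHED_BATCH" :
               policy == SCHED_OTHER ? "SCHED_OTHER" : "other",
               niceval);
        return 0;
}

Running this against one of the compiler processes during the build shows
whether the jobs really inherited SCHED_IDLE (as `chrt --idle 0 make ...`
would give) or stayed SCHED_OTHER at nice 19.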
On Thu, 5 Feb 2026, Vincent Guittot wrote:

> nice -n 19 uses SCHED_OTHER at nice 19 and not SCHED_IDLE, so I'm curious
> how you can see a difference?
> Or something is missing in your test description.
> Or we have a bug somewhere.

Okay, I realized I had used nice -n 19 (SCHED_OTHER) for the initial
build, which wouldn't have directly triggered the SCHED_IDLE logic. But, I
did use chrt for the schbench runs, which is why those p99 wins were so
consistent.

I've re-run the kernel build using the correct chrt --idle 0 policy. On
Ampere Altra, the throughput is along the same lines as mainline.

Metric   Mainline        Patched         Delta
Real     9m 20.120s      9m 18.472s      -1.6s
User     382m 24.966s    380m 41.716s    -1m 43s
Sys      218m 26.192s    218m 44.908s    +18.7s
On 2/5/26 18:52, Shubhang Kaushik wrote:
> I've re-run the kernel build using the correct chrt --idle 0 policy. On
> Ampere Altra, the throughput is along the same lines as mainline.
>
> Metric   Mainline        Patched         Delta
> Real     9m 20.120s      9m 18.472s      -1.6s
> User     382m 24.966s    380m 41.716s    -1m 43s
> Sys      218m 26.192s    218m 44.908s    +18.7s

Thanks for testing Shubhang, although I find it a bit surprising that your
kernel compilation under SCHED_IDLE doesn't improve.
Are you running with CONFIG_SCHED_CLUSTER=y? I'll try to reproduce.
Anyway, at least you see a schbench improvement, so I'm assuming I'll keep
your Tested-by?
On Fri, 6 Feb 2026, Christian Loehle wrote:

> Thanks for testing Shubhang, although I find it a bit surprising that your
> kernel compilation under SCHED_IDLE doesn't improve.
> Are you running with CONFIG_SCHED_CLUSTER=y? I'll try to reproduce.
> Anyway, at least you see a schbench improvement, so I'm assuming I'll keep
> your Tested-by?

Yes, that's right, CONFIG_SCHED_CLUSTER=y is enabled. That likely explains
why the build throughput isn't shifting as much as your Xeon results,
though the drop in the user time still suggests better efficiency.

Feel free to keep the Tested-by tag.

Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
On Tue, 3 Feb 2026 at 19:49, Christian Loehle <christian.loehle@arm.com> wrote:
>
> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
> equivalent to truly idle CPUs during wakeup path. For fork and exec
> SCHED_IDLE is even preferred.
> This is based on the assumption that the SCHED_IDLE CPU is not in an
> idle state and might be in a higher P-state, allowing the task/wakee
> to run immediately without sharing the rq.
>
> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
> case, we are better off continuing to look for a truly idle CPU.
>
> On a Intel Xeon 2-socket with 64 logical cores in total this yields
> for kernel compilation using SCHED_IDLE:
>
> +---------+----------------------+----------------------+--------+
> | workers | mainline (seconds) | patch (seconds) | delta% |
> +=========+======================+======================+========+
> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
> +---------+----------------------+----------------------+--------+
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> v2: Reword commit message, SCHED_IDLE aren't always preferred,
> but rather equivalent
> Factor out choose_sched_idle_rq() too (Both Vincent)
>
> kernel/sched/fair.c | 32 +++++++++++++++++++-------------
> 1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3eaeceda71b0..6510ab6eb44b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
> rq->nr_running);
> }
>
> -static int sched_idle_cpu(int cpu)
> +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
> {
> - return sched_idle_rq(cpu_rq(cpu));
> + return sched_idle_rq(rq) && !task_has_idle_policy(p);
> +}
> +
> +static int choose_idle_cpu(int cpu, struct task_struct *p)
> +{
> + return available_idle_cpu(cpu) ||
> + choose_sched_idle_rq(cpu_rq(cpu), p);
> }
>
> static void
> @@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
> if (!sched_core_cookie_match(rq, p))
> continue;
>
> - if (sched_idle_cpu(i))
> + if (choose_sched_idle_rq(rq, p))
> return i;
>
> if (available_idle_cpu(i)) {
> @@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>
> static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> {
> - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
> - sched_cpu_cookie_match(cpu_rq(cpu), p))
> + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
> return cpu;
>
> return -1;
> @@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
> if (!available_idle_cpu(cpu)) {
> idle = false;
> if (*idle_cpu == -1) {
> - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
> + if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
> + cpumask_test_cpu(cpu, cpus)) {
> *idle_cpu = cpu;
> break;
> }
> @@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
> */
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> + if (choose_idle_cpu(cpu, p))
> return cpu;
> }
>
> @@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> for_each_cpu_wrap(cpu, cpus, target) {
> unsigned long cpu_cap = capacity_of(cpu);
>
> - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> + if (!choose_idle_cpu(cpu, p))
> continue;
>
> fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> @@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> */
> lockdep_assert_irqs_disabled();
>
> - if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> + if (choose_idle_cpu(target, p) &&
> asym_fits_cpu(task_util, util_min, util_max, target))
> return target;
>
> @@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> * If the previous CPU is cache affine and idle, don't be stupid:
> */
> if (prev != target && cpus_share_cache(prev, target) &&
> - (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> + choose_idle_cpu(prev, p) &&
> asym_fits_cpu(task_util, util_min, util_max, prev)) {
>
> if (!static_branch_unlikely(&sched_cluster_active) ||
> @@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> if (recent_used_cpu != prev &&
> recent_used_cpu != target &&
> cpus_share_cache(recent_used_cpu, target) &&
> - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> + choose_idle_cpu(recent_used_cpu, p) &&
> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>
> @@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> {
> int continue_balancing = 1;
> int cpu = rq->cpu;
> - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
> + int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
> unsigned long interval;
> struct sched_domain *sd;
> /* Earliest time when we have to do rebalance again */
> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> * state even if we migrated tasks. Update it.
> */
> idle = idle_cpu(cpu);
> - busy = !idle && !sched_idle_cpu(cpu);
> + busy = !idle && !sched_idle_rq(rq);
> }
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> --
> 2.34.1
>
Hello Chris,
On 2/4/2026 12:19 AM, Christian Loehle wrote:
> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
> equivalent to truly idle CPUs during wakeup path. For fork and exec
> SCHED_IDLE is even preferred.
> This is based on the assumption that the SCHED_IDLE CPU is not in an
> idle state and might be in a higher P-state, allowing the task/wakee
> to run immediately without sharing the rq.
>
> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
> case, we are better off continuing to look for a truly idle CPU.
>
> On a Intel Xeon 2-socket with 64 logical cores in total this yields
> for kernel compilation using SCHED_IDLE:
>
> +---------+----------------------+----------------------+--------+
> | workers | mainline (seconds) | patch (seconds) | delta% |
> +=========+======================+======================+========+
> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
> +---------+----------------------+----------------------+--------+
I couldn't spot much difference for kernel compilation on my
3rd Generation EPYC system, likely due to the smaller LLC size. For
sched-messaging, I found the following interesting trend when
running with SCHED_IDLE:
(Normalized runtime [Var%]; %diff - higher the better)
tip/sched:core +patch (%diff)
1-group 1.00 [5.00%] 0.88 [10.78%] 11.80%
2-group 1.00 [5.15%] 0.93 [26.06%] 6.99%
4-group 1.00 [5.48%] 0.89 [11.03%] 11.13%
8-group 1.00 [6.62%] 1.21 [12.37%] -21.30%
16-group 1.00 [9.46%] 1.28 [ 9.42%] -27.59%
There is a good improvement at lower utilization. Once the
system is trending towards overutilized but the SIS_UTIL cut-off
is still non-zero, we search a little bit longer for a fully
idle CPU even though the probability of finding one is actually
low.
I suppose the scenario where we only have SCHED_IDLE tasks that
care about throughput on a busy system is rare enough that this
won't actually be noticed, but it was worth pointing out.
Feel free to include:
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
--
Thanks and Regards,
Prateek
On 2/4/26 07:48, K Prateek Nayak wrote:
> There is a good improvement at lower utilization. Once the
> system is trending towards overutilized but the SIS_UTIL cut-off
> is still non-zero, we search a little bit longer for a fully
> idle CPU even though the probability of finding one is actually
> low.
>
> I suppose the scenario where we only have SCHED_IDLE tasks that
> care about throughput on a busy system is rare enough that this
> won't actually be noticed, but it was worth pointing out.

If we're unlikely to find a good candidate then doing anything on wakeup
is kind of a waste of time, especially for sched messaging. So I guess
without $PATCH it will basically always bail out when looking at the
first few CPUs because it sees SCHED_IDLE sched messaging :)

> Feel free to include:
>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

Thanks for testing!
Hi Christian Loehle,
On Tue, 3 Feb 2026, Christian Loehle wrote:
> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
> equivalent to truly idle CPUs during wakeup path. For fork and exec
> SCHED_IDLE is even preferred.
> This is based on the assumption that the SCHED_IDLE CPU is not in an
> idle state and might be in a higher P-state, allowing the task/wakee
> to run immediately without sharing the rq.
>
> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
> case, we are better off continuing to look for a truly idle CPU.
>
> On a Intel Xeon 2-socket with 64 logical cores in total this yields
> for kernel compilation using SCHED_IDLE:
>
> +---------+----------------------+----------------------+--------+
> | workers | mainline (seconds) | patch (seconds) | delta% |
> +=========+======================+======================+========+
> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
> +---------+----------------------+----------------------+--------+
>
> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
> ---
> v2: Reword commit message, SCHED_IDLE aren't always preferred,
> but rather equivalent
> Factor out choose_sched_idle_rq() too (Both Vincent)
>
> kernel/sched/fair.c | 32 +++++++++++++++++++-------------
> 1 file changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3eaeceda71b0..6510ab6eb44b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
> rq->nr_running);
> }
>
> -static int sched_idle_cpu(int cpu)
> +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
> {
> - return sched_idle_rq(cpu_rq(cpu));
> + return sched_idle_rq(rq) && !task_has_idle_policy(p);
> +}
> +
> +static int choose_idle_cpu(int cpu, struct task_struct *p)
> +{
> + return available_idle_cpu(cpu) ||
> + choose_sched_idle_rq(cpu_rq(cpu), p);
> }
>
> static void
> @@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
> if (!sched_core_cookie_match(rq, p))
> continue;
>
> - if (sched_idle_cpu(i))
> + if (choose_sched_idle_rq(rq, p))
> return i;
>
> if (available_idle_cpu(i)) {
> @@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>
> static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> {
> - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
> - sched_cpu_cookie_match(cpu_rq(cpu), p))
> + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
> return cpu;
>
> return -1;
> @@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
> if (!available_idle_cpu(cpu)) {
> idle = false;
> if (*idle_cpu == -1) {
> - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
> + if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
> + cpumask_test_cpu(cpu, cpus)) {
> *idle_cpu = cpu;
> break;
> }
> @@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
> */
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> + if (choose_idle_cpu(cpu, p))
> return cpu;
> }
>
> @@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> for_each_cpu_wrap(cpu, cpus, target) {
> unsigned long cpu_cap = capacity_of(cpu);
>
> - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> + if (!choose_idle_cpu(cpu, p))
> continue;
>
> fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> @@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> */
> lockdep_assert_irqs_disabled();
>
> - if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> + if (choose_idle_cpu(target, p) &&
> asym_fits_cpu(task_util, util_min, util_max, target))
> return target;
>
> @@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> * If the previous CPU is cache affine and idle, don't be stupid:
> */
> if (prev != target && cpus_share_cache(prev, target) &&
> - (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> + choose_idle_cpu(prev, p) &&
> asym_fits_cpu(task_util, util_min, util_max, prev)) {
>
> if (!static_branch_unlikely(&sched_cluster_active) ||
> @@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> if (recent_used_cpu != prev &&
> recent_used_cpu != target &&
> cpus_share_cache(recent_used_cpu, target) &&
> - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> + choose_idle_cpu(recent_used_cpu, p) &&
> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>
> @@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> {
> int continue_balancing = 1;
> int cpu = rq->cpu;
> - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
> + int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
> unsigned long interval;
> struct sched_domain *sd;
> /* Earliest time when we have to do rebalance again */
> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
> * state even if we migrated tasks. Update it.
> */
> idle = idle_cpu(cpu);
> - busy = !idle && !sched_idle_cpu(cpu);
> + busy = !idle && !sched_idle_rq(rq);
Usually sched_idle rqs were treated as not busy in several balancing
decisions to avoid yielding to background load. Does this change alter
that interpretation at the domain balancing level?
> }
> sd->last_balance = jiffies;
> interval = get_sd_balance_interval(sd, busy);
> --
> 2.34.1
>
>
Regards,
Shubhang Kaushik
On 2/4/26 02:08, Shubhang Kaushik wrote:
> Hi Christian Loehle,
>
> On Tue, 3 Feb 2026, Christian Loehle wrote:
>
>> CPUs whose rq only have SCHED_IDLE tasks running are considered to be
>> equivalent to truly idle CPUs during wakeup path. For fork and exec
>> SCHED_IDLE is even preferred.
>> This is based on the assumption that the SCHED_IDLE CPU is not in an
>> idle state and might be in a higher P-state, allowing the task/wakee
>> to run immediately without sharing the rq.
>>
>> However this assumption doesn't hold if the wakee has SCHED_IDLE policy
>> itself, as it will share the rq with existing SCHED_IDLE tasks. In this
>> case, we are better off continuing to look for a truly idle CPU.
>>
>> On a Intel Xeon 2-socket with 64 logical cores in total this yields
>> for kernel compilation using SCHED_IDLE:
>>
>> +---------+----------------------+----------------------+--------+
>> | workers | mainline (seconds) | patch (seconds) | delta% |
>> +=========+======================+======================+========+
>> | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
>> | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
>> | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
>> | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
>> | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
>> | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
>> | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
>> | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
>> +---------+----------------------+----------------------+--------+
>>
>> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
>> ---
>> v2: Reword commit message, SCHED_IDLE aren't always preferred,
>> but rather equivalent
>> Factor out choose_sched_idle_rq() too (Both Vincent)
>>
>> kernel/sched/fair.c | 32 +++++++++++++++++++-------------
>> 1 file changed, 19 insertions(+), 13 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 3eaeceda71b0..6510ab6eb44b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6832,9 +6832,15 @@ static int sched_idle_rq(struct rq *rq)
>> rq->nr_running);
>> }
>>
>> -static int sched_idle_cpu(int cpu)
>> +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
>> {
>> - return sched_idle_rq(cpu_rq(cpu));
>> + return sched_idle_rq(rq) && !task_has_idle_policy(p);
>> +}
>> +
>> +static int choose_idle_cpu(int cpu, struct task_struct *p)
>> +{
>> + return available_idle_cpu(cpu) ||
>> + choose_sched_idle_rq(cpu_rq(cpu), p);
>> }
>>
>> static void
>> @@ -7400,7 +7406,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>> if (!sched_core_cookie_match(rq, p))
>> continue;
>>
>> - if (sched_idle_cpu(i))
>> + if (choose_sched_idle_rq(rq, p))
>> return i;
>>
>> if (available_idle_cpu(i)) {
>> @@ -7491,8 +7497,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>>
>> static inline int __select_idle_cpu(int cpu, struct task_struct *p)
>> {
>> - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
>> - sched_cpu_cookie_match(cpu_rq(cpu), p))
>> + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
>> return cpu;
>>
>> return -1;
>> @@ -7565,7 +7570,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
>> if (!available_idle_cpu(cpu)) {
>> idle = false;
>> if (*idle_cpu == -1) {
>> - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
>> + if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
>> + cpumask_test_cpu(cpu, cpus)) {
>> *idle_cpu = cpu;
>> break;
>> }
>> @@ -7600,7 +7606,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
>> */
>> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>> continue;
>> - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
>> + if (choose_idle_cpu(cpu, p))
>> return cpu;
>> }
>>
>> @@ -7722,7 +7728,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>> for_each_cpu_wrap(cpu, cpus, target) {
>> unsigned long cpu_cap = capacity_of(cpu);
>>
>> - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
>> + if (!choose_idle_cpu(cpu, p))
>> continue;
>>
>> fits = util_fits_cpu(task_util, util_min, util_max, cpu);
>> @@ -7793,7 +7799,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>> */
>> lockdep_assert_irqs_disabled();
>>
>> - if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
>> + if (choose_idle_cpu(target, p) &&
>> asym_fits_cpu(task_util, util_min, util_max, target))
>> return target;
>>
>> @@ -7801,7 +7807,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>> * If the previous CPU is cache affine and idle, don't be stupid:
>> */
>> if (prev != target && cpus_share_cache(prev, target) &&
>> - (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
>> + choose_idle_cpu(prev, p) &&
>> asym_fits_cpu(task_util, util_min, util_max, prev)) {
>>
>> if (!static_branch_unlikely(&sched_cluster_active) ||
>> @@ -7833,7 +7839,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>> if (recent_used_cpu != prev &&
>> recent_used_cpu != target &&
>> cpus_share_cache(recent_used_cpu, target) &&
>> - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
>> + choose_idle_cpu(recent_used_cpu, p) &&
>> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
>> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
>>
>> @@ -12261,7 +12267,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>> {
>> int continue_balancing = 1;
>> int cpu = rq->cpu;
>> - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
>> + int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
>> unsigned long interval;
>> struct sched_domain *sd;
>> /* Earliest time when we have to do rebalance again */
>> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>> * state even if we migrated tasks. Update it.
>> */
>> idle = idle_cpu(cpu);
>> - busy = !idle && !sched_idle_cpu(cpu);
>> + busy = !idle && !sched_idle_rq(rq);
>
> Usually sched_idle_rqs were treated as not-busy in several balancing decisions to avoid yielding to background load. Does this change alter that interpretation at the domain balancing level ?
>
Like Prateek already mentioned, the load-balancing didn't change.
(I did s/sched_idle_cpu(cpu)/sched_idle_rq(rq)/ though because
sched_idle_cpu didn't have many callers left and removed it
entirely).
Hello Shubhang,

On 2/4/2026 7:38 AM, Shubhang Kaushik wrote:
>> @@ -12299,7 +12305,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
>> * state even if we migrated tasks. Update it.
>> */
>> idle = idle_cpu(cpu);
>> - busy = !idle && !sched_idle_cpu(cpu);
>> + busy = !idle && !sched_idle_rq(rq);
>
> Usually sched_idle rqs were treated as not busy in several balancing
> decisions to avoid yielding to background load. Does this change alter
> that interpretation at the domain balancing level?

I don't think anything changes in the load-balancer path since we still
check for sched_idle_rq() here. Only the wakeup path will consider both
the waking task's policy and whether the CPU is completely idle vs
sched_idle_rq().

--
Thanks and Regards,
Prateek