The fast path in select_idle_sibling() can place tasks on CPUs without
considering core scheduling constraints, potentially causing immediate
force idle when the sibling runs an incompatible task.
Add cookie compatibility checks before selecting a CPU in the fast path.
This prevents placing waking tasks on CPUs where the sibling is running
an incompatible task, reducing force idle occurrences.
Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
kernel/sched/fair.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 78b36225a039..a9cbb0e9bb43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7578,7 +7578,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		 */
 		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (__select_idle_cpu(cpu, p) != -1)
 			return cpu;
 	}
 
@@ -7771,7 +7771,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	lockdep_assert_irqs_disabled();
 
-	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	if ((__select_idle_cpu(target, p) != -1) &&
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
@@ -7779,7 +7779,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+	    (__select_idle_cpu(prev, p) != -1) &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 
 		if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7811,7 +7811,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+	    (__select_idle_cpu(recent_used_cpu, p) != -1) &&
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 
--
2.43.0
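
[For context on the helpers involved: __select_idle_cpu() already folds a core-scheduling
cookie check into the idle test, and sched_core_cookie_match() is the part that walks the
SMT mask. The sketch below is reproduced from the upstream sources (kernel/sched/fair.c
and kernel/sched/sched.h) as the reviewers describe them in this thread; minor details may
differ from the exact base this series applies to.

static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
	/* Idle (or SCHED_IDLE-only) CPU whose core cookie is compatible with @p. */
	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
	    sched_core_cookie_match(cpu_rq(cpu), p))
		return cpu;

	return -1;
}

static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
{
	bool idle_core = true;
	int cpu;

	/* Ignore cookie match if core scheduler is not enabled on the CPU. */
	if (!sched_core_enabled(rq))
		return true;

	/* Walk the SMT siblings; this loop is the overhead discussed below. */
	for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
		if (!available_idle_cpu(cpu)) {
			idle_core = false;
			break;
		}
	}

	/*
	 * A CPU in an idle core is always the best choice for tasks with
	 * cookies.
	 */
	return idle_core || rq->core->core_cookie == p->core_cookie;
}

Switching the select_idle_sibling() fast paths to __select_idle_cpu() therefore trades a
cheap idle test for one that may iterate over the SMT mask, which is the concern raised in
the reply that follows.]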
Hello Fernand,

On 9/22/2025 6:09 PM, Fernand Sieber wrote:
> The fast path in select_idle_sibling() can place tasks on CPUs without
> considering core scheduling constraints, potentially causing immediate
> force idle when the sibling runs an incompatible task.
> 
> Add cookie compatibility checks before selecting a CPU in the fast path.
> This prevents placing waking tasks on CPUs where the sibling is running
> an incompatible task, reducing force idle occurrences.
> 
> Signed-off-by: Fernand Sieber <sieberf@amazon.com>
> ---
>  kernel/sched/fair.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 78b36225a039..a9cbb0e9bb43 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7578,7 +7578,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
>  		 */
>  		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>  			continue;
> -		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +		if (__select_idle_cpu(cpu, p) != -1)

So with Patch 1, you already check for cookie matching while entering
select_idle_smt() and now, each pass of the loop again does a
sched_core_cookie_match() which internally loops through the smt mask
again! Seems wasteful.

>  			return cpu;
>  	}
> 
> @@ -7771,7 +7771,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	 */
>  	lockdep_assert_irqs_disabled();
> 
> -	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> +	if ((__select_idle_cpu(target, p) != -1) &&
>  	    asym_fits_cpu(task_util, util_min, util_max, target))
>  		return target;
> 
> @@ -7779,7 +7779,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	 * If the previous CPU is cache affine and idle, don't be stupid:
>  	 */
>  	if (prev != target && cpus_share_cache(prev, target) &&
> -	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> +	    (__select_idle_cpu(prev, p) != -1) &&
>  	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
> 
>  		if (!static_branch_unlikely(&sched_cluster_active) ||
> @@ -7811,7 +7811,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if (recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
>  	    cpus_share_cache(recent_used_cpu, target) &&
> -	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> +	    (__select_idle_cpu(recent_used_cpu, p) != -1) &&

On an SMT-8 system, all the looping over smt mask per wakeup will add
up. Is that not a concern? A single task with core cookie enabled will
add massive overhead for all wakeups in the system.

>  	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
>  	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
> 

-- 
Thanks and Regards,
Prateek
Hi Prateek,

On 9/23/2025 2:25 PM, K Prateek Nayak wrote:
> So with Patch 1, you already check for cookie matching while entering
> select_idle_smt() and now, each pass of the loop again does a
> sched_core_cookie_match() which internally loops through the smt mask
> again! Seems wasteful.

Right. The change in select_idle_smt() is unnecessary.

> On an SMT-8 system, all the looping over smt mask per wakeup will add
> up. Is that not a concern? A single task with core cookie enabled will
> add massive overhead for all wakeups in the system.

In such a scenario there should generally be no looping, because I introduced
an early return in patch 3 in __sched_core_cookie_match(). Perhaps it's worth
extracting this early return as a standalone optimization patch? Something
like this:

@@ -1404,10 +1404,12 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
 	if (!sched_core_enabled(rq))
 		return true;
 
+	if (rq->core->core_cookie == p->core_cookie)
+		return true;
+
 	for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
 		if (!available_idle_cpu(cpu)) {
-			idle_core = false;
-			break;
+			return false;
 		}
 	}
 
@@ -1415,7 +1417,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
 	 * A CPU in an idle core is always the best choice for tasks with
 	 * cookies.
 	 */
-	return idle_core || rq->core->core_cookie == p->core_cookie;
+	return true;
 }

Thanks,
Fernand
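
[For readability, folding the early return sketched above into the helper would leave
sched_core_cookie_match() reading roughly as below. This is only a sketch of the proposed
shape, with the now-unused idle_core local dropped, not the actual patch 3 from the series.

static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
{
	int cpu;

	/* Ignore cookie match if core scheduler is not enabled on the CPU. */
	if (!sched_core_enabled(rq))
		return true;

	/* Early return: the core already runs a compatible cookie, no SMT walk needed. */
	if (rq->core->core_cookie == p->core_cookie)
		return true;

	/* Otherwise the target is only acceptable if the whole core is idle. */
	for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
		if (!available_idle_cpu(cpu))
			return false;
	}

	/*
	 * A CPU in an idle core is always the best choice for tasks with
	 * cookies.
	 */
	return true;
}

With this in place the per-wakeup SMT-mask walk only runs when the task's cookie differs
from the core's current cookie, which is what keeps the common no-cookie wakeup cheap, as
argued above.]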
Hello Fernand,

On 9/23/2025 3:00 PM, Fernand Sieber wrote:
> Hi Prateek,
> 
> On 9/23/2025 2:25 PM, K Prateek Nayak wrote:
>> So with Patch 1, you already check for cookie matching while entering
>> select_idle_smt() and now, each pass of the loop again does a
>> sched_core_cookie_match() which internally loops through the smt mask
>> again! Seems wasteful.
> 
> Right. The change in select_idle_smt() is unnecessary.
> 
>> On an SMT-8 system, all the looping over smt mask per wakeup will add
>> up. Is that not a concern? A single task with core cookie enabled will
>> add massive overhead for all wakeups in the system.
> 
> In such a scenario there should generally be no looping, because I introduced
> an early return in patch 3 in __sched_core_cookie_match(). Perhaps it's worth
> extracting this early return as a standalone optimization patch? Something
> like this:

Yes, that would be great! Thank you. Please also include some benchmark
numbers, either improved core utilization or benchmark results that actually
improve with these changes. It would be great to know how much things
improve by :)

-- 
Thanks and Regards,
Prateek