kernel/sched/fair.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-)
Pre-calculate the base maximum utilization of each performance domain during the
main loop of find_energy_efficient_cpu() and cache it in the local
'energy_env' structure.
By caching this base value, the maximum utilization for candidate CPU
placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
O(1) time, eliminating redundant scans of the performance domain. This
optimizes the energy estimation path by reducing the number of scans per
performance domain from three to one.
This change significantly reduces wake-up latency on systems with high core
counts or complex performance domain topologies by minimizing the overall
complexity of the Energy-Aware Scheduling (EAS) calculation.
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
v2:
- Ensure RCU safety by using local 'energy_env' for caching instead of
modifying the shared 'perf_domain' structure.
- Consolidate pre-calculation into the main loop to avoid an extra pass
over the performance domains.
v1:
- Optimize energy calculation by pre-calculating performance domain max utilization.
- Add max_util and max_spare_cap_cpu to struct perf_domain.
- Reduce inner loop complexity from O(N) to O(1) for energy estimation.
kernel/sched/fair.c | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..5c114c49c202 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8148,6 +8148,7 @@ struct energy_env {
unsigned long pd_busy_time;
unsigned long cpu_cap;
unsigned long pd_cap;
+ unsigned long pd_max_util;
};
/*
@@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
* exceed @eenv->cpu_cap.
*/
static inline unsigned long
-eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
struct task_struct *p, int dst_cpu)
{
- unsigned long max_util = 0;
- int cpu;
+ unsigned long max_util = eenv->pd_max_util;
- for_each_cpu(cpu, pd_cpus) {
- struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
+ unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
unsigned long eff_util, min, max;
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the min
- * utilization can be max OPP.
- */
- eff_util = effective_cpu_util(cpu, util, &min, &max);
+ eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
/* Task's uclamp can modify min and max value */
- if (tsk && uclamp_is_used()) {
+ if (uclamp_is_used()) {
min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
/*
* If there is no active max uclamp constraint,
* directly use task's one, otherwise keep max.
*/
- if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
max = uclamp_eff_value(p, UCLAMP_MAX);
else
max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
}
- eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
max_util = max(max_util, eff_util);
}
@@ -8265,7 +8257,7 @@ static inline unsigned long
compute_energy(struct energy_env *eenv, struct perf_domain *pd,
struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
{
- unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
unsigned long energy;
@@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
+ eenv.pd_max_util = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
+ unsigned long util_b, eff_util_b, min_b, max_b;
eenv.pd_cap += cpu_actual_cap;
+ /* Pre-calculate base max utilization for the performance domain */
+ util_b = cpu_util(cpu, p, -1, 1);
+ eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
+ eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
+ eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);
+
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
--
2.51.0
On Mon, 2 Feb 2026 at 04:05, Qiliang Yuan <realwujing@gmail.com> wrote:
>
> Pre-calculate the base maximum utilization of each performance domain during the
> main loop of find_energy_efficient_cpu() and cache it in the local
> 'energy_env' structure.
>
> By caching this base value, the maximum utilization for candidate CPU
> placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
> O(1) time, eliminating redundant scans of the performance domain. This
> optimizes the energy estimation path by reducing the number of scans per
> performance domain from three to one.
Ok, but the whole feec() remains O(n)
>
> This change significantly reduces wake-up latency on systems with high core
> counts or complex performance domain topologies by minimizing the overall
> complexity of the Energy-Aware Scheduling (EAS) calculation.
Could you add some figures to highlight the statement above ?
>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> v2:
> - Ensure RCU safety by using local 'energy_env' for caching instead of
> modifying the shared 'perf_domain' structure.
> - Consolidate pre-calculation into the main loop to avoid an extra pass
> over the performance domains.
> v1:
> - Optimize energy calculation by pre-calculating performance domain max utilization.
> - Add max_util and max_spare_cap_cpu to struct perf_domain.
> - Reduce inner loop complexity from O(N) to O(1) for energy estimation.
>
> kernel/sched/fair.c | 36 ++++++++++++++++++------------------
> 1 file changed, 18 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e71302282671..5c114c49c202 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8148,6 +8148,7 @@ struct energy_env {
> unsigned long pd_busy_time;
> unsigned long cpu_cap;
> unsigned long pd_cap;
> + unsigned long pd_max_util;
> };
>
> /*
> @@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
> * exceed @eenv->cpu_cap.
> */
> static inline unsigned long
> -eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
> +eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
> struct task_struct *p, int dst_cpu)
> {
> - unsigned long max_util = 0;
> - int cpu;
> + unsigned long max_util = eenv->pd_max_util;
>
> - for_each_cpu(cpu, pd_cpus) {
> - struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
> - unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
> + if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
> + unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
> unsigned long eff_util, min, max;
>
> - /*
> - * Performance domain frequency: utilization clamping
> - * must be considered since it affects the selection
> - * of the performance domain frequency.
> - * NOTE: in case RT tasks are running, by default the min
> - * utilization can be max OPP.
> - */
> - eff_util = effective_cpu_util(cpu, util, &min, &max);
> + eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
>
> /* Task's uclamp can modify min and max value */
> - if (tsk && uclamp_is_used()) {
> + if (uclamp_is_used()) {
> min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
>
> /*
> * If there is no active max uclamp constraint,
> * directly use task's one, otherwise keep max.
> */
> - if (uclamp_rq_is_idle(cpu_rq(cpu)))
> + if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
> max = uclamp_eff_value(p, UCLAMP_MAX);
> else
> max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
> }
>
> - eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
> + eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
> max_util = max(max_util, eff_util);
> }
>
> @@ -8265,7 +8257,7 @@ static inline unsigned long
> compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> {
> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> + unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
> unsigned long busy_time = eenv->pd_busy_time;
> unsigned long energy;
>
> @@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
> eenv.cpu_cap = cpu_actual_cap;
> eenv.pd_cap = 0;
> + eenv.pd_max_util = 0;
>
> for_each_cpu(cpu, cpus) {
> struct rq *rq = cpu_rq(cpu);
> + unsigned long util_b, eff_util_b, min_b, max_b;
>
> eenv.pd_cap += cpu_actual_cap;
>
> + /* Pre-calculate base max utilization for the performance domain */
> + util_b = cpu_util(cpu, p, -1, 1);
> + eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
> + eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
> + eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);
You pre calculate the above for all CPUs (each CPU of each PD)
in order to not do the above 2-3 times for PDs with a CPU that could fit.
So this will be a win if all PDs have a CPU that fits and we have to
compute_energy for them but not if only few/one PDs have a CPU that
fits
> +
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
>
> --
> 2.51.0
>
Hi Christian, Vincent,

Thank you for the detailed feedback.

On Mon, Feb 02, 2026 at 10:48:04AM +0000, Christian Loehle wrote:
> Which is still O(n), I think the title is misleading.

On Tue, Feb 03, 2026 at 06:16:27PM +0100, Vincent Guittot wrote:
> Ok, but the whole feec() remains O(n)

You are absolutely right. While the per-candidate CPU energy estimation was
optimized, the overall complexity of find_energy_efficient_cpu() remains O(N).
I've renamed the patch in v3 to "Optimize EAS by reducing redundant
performance domain scans" to more accurately reflect the scope of the
improvement.

On Mon, Feb 02, 2026 at 10:48:04AM +0000, Christian Loehle wrote:
> I don't think this is actually true. EAS doesn't really work with a large
> number of PDs because of the expensive wakeup path.
> I don't think there's an EAS system out there where this would actually make
> a measurable impact.

On Tue, Feb 03, 2026 at 06:16:27PM +0100, Vincent Guittot wrote:
> Could you add some figures to highlight the statement above ?

In v3, I've further optimized the path by consolidating the 'pd_max_util' and
'pd_busy_time' calculations into the same loop that finds the
'max_spare_cap_cpu'. This reduces the total number of full PD scans from three
down to one per performance domain.

I agree that the impact on current mobile systems with 2-3 PDs might be
subtle. However, as topologies grow and the wake-up path becomes more
sensitive to cache misses, reducing redundant scans of task structures and rq
utilization is a worthwhile constant-factor improvement. I'm investigating
synthetic benchmarks on systems with higher core counts to provide more
concrete figures.

I've sent out v3 which includes these further logic consolidations.

v3 link: https://lore.kernel.org/all/20260204120509.3950227-1-realwujing@gmail.com/

Thanks,
Qiliang
On 2/2/26 03:05, Qiliang Yuan wrote:
> Pre-calculate the base maximum utilization of each performance domain during the
> main loop of find_energy_efficient_cpu() and cache it in the local
> 'energy_env' structure.
>
> By caching this base value, the maximum utilization for candidate CPU
> placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
> O(1) time, eliminating redundant scans of the performance domain. This
> optimizes the energy estimation path by reducing the number of scans per
> performance domain from three to one.
Which is still O(n), I think the title is misleading.
>
> This change significantly reduces wake-up latency on systems with high core
> counts or complex performance domain topologies by minimizing the overall
> complexity of the Energy-Aware Scheduling (EAS) calculation.
I don't think this is actually true. EAS doesn't really work with a large
number of PDs because of the expensive wakeup path.
I don't think there's an EAS system out there where this would actually make a
measurable impact. Most have 2 or 3, the highest number of PDs I'm aware of
is 5, but FWIW the actual change looks correct to me.
>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> v2:
> - Ensure RCU safety by using local 'energy_env' for caching instead of
> modifying the shared 'perf_domain' structure.
> - Consolidate pre-calculation into the main loop to avoid an extra pass
> over the performance domains.
> v1:
> - Optimize energy calculation by pre-calculating performance domain max utilization.
> - Add max_util and max_spare_cap_cpu to struct perf_domain.
> - Reduce inner loop complexity from O(N) to O(1) for energy estimation.
>
> kernel/sched/fair.c | 36 ++++++++++++++++++------------------
> 1 file changed, 18 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e71302282671..5c114c49c202 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8148,6 +8148,7 @@ struct energy_env {
> unsigned long pd_busy_time;
> unsigned long cpu_cap;
> unsigned long pd_cap;
> + unsigned long pd_max_util;
> };
>
> /*
> @@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
> * exceed @eenv->cpu_cap.
> */
> static inline unsigned long
> -eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
> +eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
> struct task_struct *p, int dst_cpu)
> {
> - unsigned long max_util = 0;
> - int cpu;
> + unsigned long max_util = eenv->pd_max_util;
>
> - for_each_cpu(cpu, pd_cpus) {
> - struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
> - unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
> + if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
> + unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
> unsigned long eff_util, min, max;
>
> - /*
> - * Performance domain frequency: utilization clamping
> - * must be considered since it affects the selection
> - * of the performance domain frequency.
> - * NOTE: in case RT tasks are running, by default the min
> - * utilization can be max OPP.
> - */
> - eff_util = effective_cpu_util(cpu, util, &min, &max);
> + eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
>
> /* Task's uclamp can modify min and max value */
> - if (tsk && uclamp_is_used()) {
> + if (uclamp_is_used()) {
> min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
>
> /*
> * If there is no active max uclamp constraint,
> * directly use task's one, otherwise keep max.
> */
> - if (uclamp_rq_is_idle(cpu_rq(cpu)))
> + if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
> max = uclamp_eff_value(p, UCLAMP_MAX);
> else
> max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
> }
>
> - eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
> + eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
> max_util = max(max_util, eff_util);
> }
>
> @@ -8265,7 +8257,7 @@ static inline unsigned long
> compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> {
> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> + unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
> unsigned long busy_time = eenv->pd_busy_time;
> unsigned long energy;
>
> @@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
> eenv.cpu_cap = cpu_actual_cap;
> eenv.pd_cap = 0;
> + eenv.pd_max_util = 0;
>
> for_each_cpu(cpu, cpus) {
> struct rq *rq = cpu_rq(cpu);
> + unsigned long util_b, eff_util_b, min_b, max_b;
>
> eenv.pd_cap += cpu_actual_cap;
>
> + /* Pre-calculate base max utilization for the performance domain */
> + util_b = cpu_util(cpu, p, -1, 1);
> + eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
> + eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
> + eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);
> +
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
>
© 2016 - 2026 Red Hat, Inc.