[v2] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop

[PATCH v2 RSEND] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop

Posted by Qiliang Yuan 5 days ago

Pre-calculate the base maximum utilization of each performance domain during the
main loop of find_energy_efficient_cpu() and cache it in the local
'energy_env' structure.

By caching this base value, the maximum utilization for candidate CPU
placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
O(1) time, eliminating redundant scans of the performance domain. This
optimizes the energy estimation path by reducing the number of scans per
performance domain from three to one.

This change significantly reduces wake-up latency on systems with high core
counts or complex performance domain topologies by minimizing the overall
complexity of the Energy-Aware Scheduling (EAS) calculation.

Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
v2:
 - Ensure RCU safety by using local 'energy_env' for caching instead of
   modifying the shared 'perf_domain' structure.
 - Consolidate pre-calculation into the main loop to avoid an extra pass
   over the performance domains.
v1:
 - Optimize energy calculation by pre-calculating performance domain max utilization.
 - Add max_util and max_spare_cap_cpu to struct perf_domain.
 - Reduce inner loop complexity from O(N) to O(1) for energy estimation.

 kernel/sched/fair.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..5c114c49c202 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8148,6 +8148,7 @@ struct energy_env {
 	unsigned long pd_busy_time;
 	unsigned long cpu_cap;
 	unsigned long pd_cap;
+	unsigned long pd_max_util;
 };
 
 /*
@@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
  * exceed @eenv->cpu_cap.
  */
 static inline unsigned long
-eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
 		 struct task_struct *p, int dst_cpu)
 {
-	unsigned long max_util = 0;
-	int cpu;
+	unsigned long max_util = eenv->pd_max_util;
 
-	for_each_cpu(cpu, pd_cpus) {
-		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
-		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+	if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
+		unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
 		unsigned long eff_util, min, max;
 
-		/*
-		 * Performance domain frequency: utilization clamping
-		 * must be considered since it affects the selection
-		 * of the performance domain frequency.
-		 * NOTE: in case RT tasks are running, by default the min
-		 * utilization can be max OPP.
-		 */
-		eff_util = effective_cpu_util(cpu, util, &min, &max);
+		eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
 
 		/* Task's uclamp can modify min and max value */
-		if (tsk && uclamp_is_used()) {
+		if (uclamp_is_used()) {
 			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
 
 			/*
 			 * If there is no active max uclamp constraint,
 			 * directly use task's one, otherwise keep max.
 			 */
-			if (uclamp_rq_is_idle(cpu_rq(cpu)))
+			if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
 				max = uclamp_eff_value(p, UCLAMP_MAX);
 			else
 				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
 		}
 
-		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+		eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
 		max_util = max(max_util, eff_util);
 	}
 
@@ -8265,7 +8257,7 @@ static inline unsigned long
 compute_energy(struct energy_env *eenv, struct perf_domain *pd,
 	       struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
 {
-	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+	unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
 	unsigned long busy_time = eenv->pd_busy_time;
 	unsigned long energy;
 
@@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
 		eenv.cpu_cap = cpu_actual_cap;
 		eenv.pd_cap = 0;
+		eenv.pd_max_util = 0;
 
 		for_each_cpu(cpu, cpus) {
 			struct rq *rq = cpu_rq(cpu);
+			unsigned long util_b, eff_util_b, min_b, max_b;
 
 			eenv.pd_cap += cpu_actual_cap;
 
+			/* Pre-calculate base max utilization for the performance domain */
+			util_b = cpu_util(cpu, p, -1, 1);
+			eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
+			eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
+			eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);
+
 			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 				continue;
 
-- 
2.51.0

Re: [PATCH v2 RSEND] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop

Posted by Vincent Guittot 3 days, 10 hours ago

On Mon, 2 Feb 2026 at 04:05, Qiliang Yuan <realwujing@gmail.com> wrote:
>
> Pre-calculate the base maximum utilization of each performance domain during the
> main loop of find_energy_efficient_cpu() and cache it in the local
> 'energy_env' structure.
>
> By caching this base value, the maximum utilization for candidate CPU
> placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
> O(1) time, eliminating redundant scans of the performance domain. This
> optimizes the energy estimation path by reducing the number of scans per
> performance domain from three to one.

Ok, but the whole feec() remains O(n)

>
> This change significantly reduces wake-up latency on systems with high core
> counts or complex performance domain topologies by minimizing the overall
> complexity of the Energy-Aware Scheduling (EAS) calculation.

Could you add some figures to highlight the statement above ?

>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> v2:
>  - Ensure RCU safety by using local 'energy_env' for caching instead of
>    modifying the shared 'perf_domain' structure.
>  - Consolidate pre-calculation into the main loop to avoid an extra pass
>    over the performance domains.
> v1:
>  - Optimize energy calculation by pre-calculating performance domain max utilization.
>  - Add max_util and max_spare_cap_cpu to struct perf_domain.
>  - Reduce inner loop complexity from O(N) to O(1) for energy estimation.
>
>  kernel/sched/fair.c | 36 ++++++++++++++++++------------------
>  1 file changed, 18 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e71302282671..5c114c49c202 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8148,6 +8148,7 @@ struct energy_env {
>         unsigned long pd_busy_time;
>         unsigned long cpu_cap;
>         unsigned long pd_cap;
> +       unsigned long pd_max_util;
>  };
>
>  /*
> @@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
>   * exceed @eenv->cpu_cap.
>   */
>  static inline unsigned long
> -eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
> +eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
>                  struct task_struct *p, int dst_cpu)
>  {
> -       unsigned long max_util = 0;
> -       int cpu;
> +       unsigned long max_util = eenv->pd_max_util;
>
> -       for_each_cpu(cpu, pd_cpus) {
> -               struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
> -               unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
> +       if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
> +               unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
>                 unsigned long eff_util, min, max;
>
> -               /*
> -                * Performance domain frequency: utilization clamping
> -                * must be considered since it affects the selection
> -                * of the performance domain frequency.
> -                * NOTE: in case RT tasks are running, by default the min
> -                * utilization can be max OPP.
> -                */
> -               eff_util = effective_cpu_util(cpu, util, &min, &max);
> +               eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
>
>                 /* Task's uclamp can modify min and max value */
> -               if (tsk && uclamp_is_used()) {
> +               if (uclamp_is_used()) {
>                         min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
>
>                         /*
>                          * If there is no active max uclamp constraint,
>                          * directly use task's one, otherwise keep max.
>                          */
> -                       if (uclamp_rq_is_idle(cpu_rq(cpu)))
> +                       if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
>                                 max = uclamp_eff_value(p, UCLAMP_MAX);
>                         else
>                                 max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
>                 }
>
> -               eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
> +               eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
>                 max_util = max(max_util, eff_util);
>         }
>
> @@ -8265,7 +8257,7 @@ static inline unsigned long
>  compute_energy(struct energy_env *eenv, struct perf_domain *pd,
>                struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
>  {
> -       unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> +       unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
>         unsigned long busy_time = eenv->pd_busy_time;
>         unsigned long energy;
>
> @@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
>                 eenv.cpu_cap = cpu_actual_cap;
>                 eenv.pd_cap = 0;
> +               eenv.pd_max_util = 0;
>
>                 for_each_cpu(cpu, cpus) {
>                         struct rq *rq = cpu_rq(cpu);
> +                       unsigned long util_b, eff_util_b, min_b, max_b;
>
>                         eenv.pd_cap += cpu_actual_cap;
>
> +                       /* Pre-calculate base max utilization for the performance domain */
> +                       util_b = cpu_util(cpu, p, -1, 1);
> +                       eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
> +                       eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
> +                       eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);

You pre calculate the above for all CPUs (each CPU of each PD)

in order to not do the above 2-3 times for PDs with a CPU that could fit.

So this will be a win if all PDs have a CPU that fits and we have to
compute_energy for them but not if only few/one PDs have a CPU that
fits

> +
>                         if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>                                 continue;
>
> --
> 2.51.0
>

Re: [PATCH v2 RSEND] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop

Posted by Qiliang Yuan 2 days, 15 hours ago

Hi Christian, Vincent,

Thank you for the detailed feedback.

On Mon, Feb 02, 2026 at 10:48:04AM +0000, Christian Loehle wrote:
> Which is still O(n), I think the title is misleading.

On Tue, Feb 03, 2026 at 06:16:27PM +0100, Vincent Guittot wrote:
> Ok, but the whole feec() remains O(n)

You are absolutely right. While the per-candidate CPU energy estimation was 
optimized, the overall complexity of find_energy_efficient_cpu() remains 
O(N). I've renamed the patch in v3 to "Optimize EAS by reducing redundant 
performance domain scans" to more accurately reflect the scope of the 
improvement.

On Mon, Feb 02, 2026 at 10:48:04AM +0000, Christian Loehle wrote:
> I don't think this is actually true. EAS doesn't really work with a large 
> number of PDs because of the expensive wakeup path.
> I don't think there's an EAS system out there where this would actually make 
> a measurable impact.

On Tue, Feb 03, 2026 at 06:16:27PM +0100, Vincent Guittot wrote:
> Could you add some figures to highlight the statement above ?

In v3, I've further optimized the path by consolidating the 'pd_max_util' and 
'pd_busy_time' calculations into the same loop that finds the 
'max_spare_cap_cpu'. This reduces the total number of full PD scans from three 
down to one per performance domain.

I agree that the impact on current mobile systems with 2-3 PDs might be subtle. 
However, as topologies grow and the wake-up path becomes more sensitive to 
cache misses, reducing redundant scans of task structures and rq utilization 
is a worthwhile constant-factor improvement. I'm investigating synthetic 
benchmarks on systems with higher core counts to provide more concrete figures.

I've sent out v3 which includes these further logic consolidations.

v3 link: https://lore.kernel.org/all/20260204120509.3950227-1-realwujing@gmail.com/

Thanks,
Qiliang

Re: [PATCH v2 RSEND] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop

Posted by Christian Loehle 4 days, 16 hours ago

On 2/2/26 03:05, Qiliang Yuan wrote:
> Pre-calculate the base maximum utilization of each performance domain during the
> main loop of find_energy_efficient_cpu() and cache it in the local
> 'energy_env' structure.
> 
> By caching this base value, the maximum utilization for candidate CPU
> placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
> O(1) time, eliminating redundant scans of the performance domain. This
> optimizes the energy estimation path by reducing the number of scans per
> performance domain from three to one.

Which is still O(n), I think the title is misleading.

> 
> This change significantly reduces wake-up latency on systems with high core
> counts or complex performance domain topologies by minimizing the overall
> complexity of the Energy-Aware Scheduling (EAS) calculation.

I don't think this is actually true. EAS doesn't really work with a large
number of PDs because of the expensive wakeup path.
I don't think there's an EAS system out there where this would actually make a
measurable impact. Most have 2 or 3, the highest number of PDs I'm aware of
is 5, but FWIW the actual change looks correct to me.

> 
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> v2:
>  - Ensure RCU safety by using local 'energy_env' for caching instead of
>    modifying the shared 'perf_domain' structure.
>  - Consolidate pre-calculation into the main loop to avoid an extra pass
>    over the performance domains.
> v1:
>  - Optimize energy calculation by pre-calculating performance domain max utilization.
>  - Add max_util and max_spare_cap_cpu to struct perf_domain.
>  - Reduce inner loop complexity from O(N) to O(1) for energy estimation.
> 
>  kernel/sched/fair.c | 36 ++++++++++++++++++------------------
>  1 file changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e71302282671..5c114c49c202 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8148,6 +8148,7 @@ struct energy_env {
>  	unsigned long pd_busy_time;
>  	unsigned long cpu_cap;
>  	unsigned long pd_cap;
> +	unsigned long pd_max_util;
>  };
>  
>  /*
> @@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
>   * exceed @eenv->cpu_cap.
>   */
>  static inline unsigned long
> -eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
> +eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
>  		 struct task_struct *p, int dst_cpu)
>  {
> -	unsigned long max_util = 0;
> -	int cpu;
> +	unsigned long max_util = eenv->pd_max_util;
>  
> -	for_each_cpu(cpu, pd_cpus) {
> -		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
> -		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
> +	if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
> +		unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
>  		unsigned long eff_util, min, max;
>  
> -		/*
> -		 * Performance domain frequency: utilization clamping
> -		 * must be considered since it affects the selection
> -		 * of the performance domain frequency.
> -		 * NOTE: in case RT tasks are running, by default the min
> -		 * utilization can be max OPP.
> -		 */
> -		eff_util = effective_cpu_util(cpu, util, &min, &max);
> +		eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
>  
>  		/* Task's uclamp can modify min and max value */
> -		if (tsk && uclamp_is_used()) {
> +		if (uclamp_is_used()) {
>  			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
>  
>  			/*
>  			 * If there is no active max uclamp constraint,
>  			 * directly use task's one, otherwise keep max.
>  			 */
> -			if (uclamp_rq_is_idle(cpu_rq(cpu)))
> +			if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
>  				max = uclamp_eff_value(p, UCLAMP_MAX);
>  			else
>  				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
>  		}
>  
> -		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
> +		eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
>  		max_util = max(max_util, eff_util);
>  	}
>  
> @@ -8265,7 +8257,7 @@ static inline unsigned long
>  compute_energy(struct energy_env *eenv, struct perf_domain *pd,
>  	       struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
>  {
> -	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> +	unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
>  	unsigned long busy_time = eenv->pd_busy_time;
>  	unsigned long energy;
>  
> @@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  
>  		eenv.cpu_cap = cpu_actual_cap;
>  		eenv.pd_cap = 0;
> +		eenv.pd_max_util = 0;
>  
>  		for_each_cpu(cpu, cpus) {
>  			struct rq *rq = cpu_rq(cpu);
> +			unsigned long util_b, eff_util_b, min_b, max_b;
>  
>  			eenv.pd_cap += cpu_actual_cap;
>  
> +			/* Pre-calculate base max utilization for the performance domain */
> +			util_b = cpu_util(cpu, p, -1, 1);
> +			eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
> +			eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
> +			eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);
> +
>  			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>  				continue;
>