 kernel/sched/fair.c     | 46 ++++++++++++++++++++++++-----------------
 kernel/sched/sched.h    |  4 ++++
 kernel/sched/topology.c |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)
By pre-calculating the base max utilization of each performance domain
at the start of find_energy_efficient_cpu(), we can avoid the repetitive
O(M) scan in eenv_pd_max_util() for every candidate CPU.
This reduces the complexity of energy calculation from O(P*N) to O(N + P^2),
where P is the number of performance domains and N is the total number of
CPUs. For systems with many
performance domains or high core counts, this results in significant
reduction in wakeup latency.
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
---
kernel/sched/fair.c | 46 ++++++++++++++++++++++++-----------------
kernel/sched/sched.h | 4 ++++
kernel/sched/topology.c | 1 +
3 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 458324d240e9..de5bfdfe553f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8236,41 +8236,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
* exceed @eenv->cpu_cap.
*/
static inline unsigned long
-eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
struct task_struct *p, int dst_cpu)
{
- unsigned long max_util = 0;
- int cpu;
+ unsigned long max_util = pd->max_util;
- for_each_cpu(cpu, pd_cpus) {
- struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
+ unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
unsigned long eff_util, min, max;
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the min
- * utilization can be max OPP.
- */
- eff_util = effective_cpu_util(cpu, util, &min, &max);
+ eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
/* Task's uclamp can modify min and max value */
- if (tsk && uclamp_is_used()) {
+ if (uclamp_is_used()) {
min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
/*
* If there is no active max uclamp constraint,
* directly use task's one, otherwise keep max.
*/
- if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
max = uclamp_eff_value(p, UCLAMP_MAX);
else
max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
}
- eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
max_util = max(max_util, eff_util);
}
@@ -8286,7 +8277,7 @@ static inline unsigned long
compute_energy(struct energy_env *eenv, struct perf_domain *pd,
struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
{
- unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
unsigned long energy;
@@ -8351,7 +8342,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
unsigned long best_actual_cap = 0;
unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
- struct perf_domain *pd;
+ struct perf_domain *pd, *tmp_pd;
struct energy_env eenv;
rcu_read_lock();
@@ -8377,6 +8368,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv_task_busy_time(&eenv, p, prev_cpu);
+ /* Algorithmic Optimization: Pre-calculate max_util for O(1) energy estimation */
+ for (tmp_pd = pd; tmp_pd; tmp_pd = tmp_pd->next) {
+ unsigned long max_u = 0;
+ int i;
+
+ for_each_cpu(i, perf_domain_span(tmp_pd)) {
+ unsigned long util = cpu_util(i, p, -1, 1);
+ unsigned long eff_util, min, max;
+
+ eff_util = effective_cpu_util(i, util, &min, &max);
+ eff_util = sugov_effective_cpu_perf(i, eff_util, min, max);
+ if (eff_util > max_u)
+ max_u = eff_util;
+ }
+ tmp_pd->max_util = max_u;
+ }
+
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_actual_cap, util;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..f308d335ca77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -972,6 +972,10 @@ struct perf_domain {
struct em_perf_domain *em_pd;
struct perf_domain *next;
struct rcu_head rcu;
+
+ /* O(1) optimization hints */
+ unsigned long max_util;
+ int max_spare_cap_cpu;
};
/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5a3f29a26bdb..b9de022ddb53 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -354,6 +354,7 @@ static struct perf_domain *pd_init(int cpu)
if (!pd)
return NULL;
pd->em_pd = obj;
+ pd->max_spare_cap_cpu = -1;
return pd;
}
--
2.51.0
Hi Qiliang,
kernel test robot noticed the following build warnings:
[auto build test WARNING on tip/sched/core]
[also build test WARNING on tip/master peterz-queue/sched/core linus/master v6.16-rc1 next-20260122]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Qiliang-Yuan/sched-fair-Optimize-EAS-energy-calculation-complexity-from-O-N-to-O-1-inside-inner-loop/20260123-000924
base: tip/sched/core
patch link: https://lore.kernel.org/r/20260122154227.130767-1-realwujing%40gmail.com
patch subject: [PATCH] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop
config: s390-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260123/202601231209.7Aqyec0v-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260123/202601231209.7Aqyec0v-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601231209.7Aqyec0v-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from ./include/linux/bitmap.h:11,
from ./include/linux/cpumask.h:11,
from ./include/linux/energy_model.h:4,
from kernel/sched/fair.c:23:
kernel/sched/fair.c: In function 'find_energy_efficient_cpu':
./include/linux/cpumask_types.h:18:37: warning: dereferencing 'void *' pointer
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^~
./include/linux/find.h:586:48: note: in definition of macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
./include/linux/cpumask.h:380:31: note: in expansion of macro 'cpumask_bits'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
./include/linux/cpumask_types.h:18:37: error: request for member 'bits' in something not a structure or union
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^~
./include/linux/find.h:586:48: note: in definition of macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
./include/linux/cpumask.h:380:31: note: in expansion of macro 'cpumask_bits'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
>> ./include/linux/find.h:586:69: warning: left-hand operand of comma expression has no effect [-Wunused-value]
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^
./include/linux/cpumask.h:380:9: note: in expansion of macro 'for_each_set_bit'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
vim +586 ./include/linux/find.h
6b8ecb84f8f640 include/asm-generic/bitops/find.h Yury Norov 2021-08-14 584
bc9d6635c293a2 include/linux/find.h Yury Norov 2021-08-14 585 #define for_each_set_bit(bit, addr, size) \
fdae96a3fc7f70 include/linux/find.h Yury Norov 2022-09-19 @586 for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
bc9d6635c293a2 include/linux/find.h Yury Norov 2021-08-14 587
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Qiliang,
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master peterz-queue/sched/core linus/master v6.19-rc6 next-20260122]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Qiliang-Yuan/sched-fair-Optimize-EAS-energy-calculation-complexity-from-O-N-to-O-1-inside-inner-loop/20260123-000924
base: tip/sched/core
patch link: https://lore.kernel.org/r/20260122154227.130767-1-realwujing%40gmail.com
patch subject: [PATCH] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop
config: sh-defconfig (https://download.01.org/0day-ci/archive/20260123/202601230700.0p7YsNA0-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260123/202601230700.0p7YsNA0-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601230700.0p7YsNA0-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from include/linux/bitmap.h:11,
from include/linux/cpumask.h:11,
from include/linux/energy_model.h:4,
from kernel/sched/fair.c:23:
kernel/sched/fair.c: In function 'find_energy_efficient_cpu':
include/linux/cpumask_types.h:18:37: warning: dereferencing 'void *' pointer
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^~
include/linux/find.h:586:48: note: in definition of macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
include/linux/cpumask.h:380:31: note: in expansion of macro 'cpumask_bits'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
>> include/linux/cpumask_types.h:18:37: error: request for member 'bits' in something not a structure or union
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^~
include/linux/find.h:586:48: note: in definition of macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
include/linux/cpumask.h:380:31: note: in expansion of macro 'cpumask_bits'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
include/linux/find.h:586:69: warning: left-hand operand of comma expression has no effect [-Wunused-value]
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^
include/linux/cpumask.h:380:9: note: in expansion of macro 'for_each_set_bit'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
vim +/bits +18 include/linux/cpumask_types.h
eb4faa36d674ee Yury Norov 2024-05-27 10
eb4faa36d674ee Yury Norov 2024-05-27 11 /**
eb4faa36d674ee Yury Norov 2024-05-27 12 * cpumask_bits - get the bits in a cpumask
eb4faa36d674ee Yury Norov 2024-05-27 13 * @maskp: the struct cpumask *
eb4faa36d674ee Yury Norov 2024-05-27 14 *
eb4faa36d674ee Yury Norov 2024-05-27 15 * You should only assume nr_cpu_ids bits of this mask are valid. This is
eb4faa36d674ee Yury Norov 2024-05-27 16 * a macro so it's const-correct.
eb4faa36d674ee Yury Norov 2024-05-27 17 */
eb4faa36d674ee Yury Norov 2024-05-27 @18 #define cpumask_bits(maskp) ((maskp)->bits)
eb4faa36d674ee Yury Norov 2024-05-27 19
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Qiliang,
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master peterz-queue/sched/core linus/master v6.19-rc6 next-20260122]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Qiliang-Yuan/sched-fair-Optimize-EAS-energy-calculation-complexity-from-O-N-to-O-1-inside-inner-loop/20260123-000924
base: tip/sched/core
patch link: https://lore.kernel.org/r/20260122154227.130767-1-realwujing%40gmail.com
patch subject: [PATCH] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop
config: x86_64-randconfig-161-20260123 (https://download.01.org/0day-ci/archive/20260123/202601230609.zsX7Ojfn-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
smatch version: v0.5.0-8994-gd50c5a4c
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260123/202601230609.zsX7Ojfn-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601230609.zsX7Ojfn-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/sched/fair.c:8345:3: error: member reference base type 'void' is not a structure or union
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/cpumask.h:380:24: note: expanded from macro 'for_each_cpu'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/cpumask_types.h:18:37: note: expanded from macro 'cpumask_bits'
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^ ~~~~
include/linux/find.h:586:41: note: expanded from macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
1 error generated.
vim +/void +8345 kernel/sched/fair.c
8262
8263 /*
8264 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
8265 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
8266 * spare capacity in each performance domain and uses it as a potential
8267 * candidate to execute the task. Then, it uses the Energy Model to figure
8268 * out which of the CPU candidates is the most energy-efficient.
8269 *
8270 * The rationale for this heuristic is as follows. In a performance domain,
8271 * all the most energy efficient CPU candidates (according to the Energy
8272 * Model) are those for which we'll request a low frequency. When there are
8273 * several CPUs for which the frequency request will be the same, we don't
8274 * have enough data to break the tie between them, because the Energy Model
8275 * only includes active power costs. With this model, if we assume that
8276 * frequency requests follow utilization (e.g. using schedutil), the CPU with
8277 * the maximum spare capacity in a performance domain is guaranteed to be among
8278 * the best candidates of the performance domain.
8279 *
8280 * In practice, it could be preferable from an energy standpoint to pack
8281 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
8282 * but that could also hurt our chances to go cluster idle, and we have no
8283 * ways to tell with the current Energy Model if this is actually a good
8284 * idea or not. So, find_energy_efficient_cpu() basically favors
8285 * cluster-packing, and spreading inside a cluster. That should at least be
8286 * a good thing for latency, and this is consistent with the idea that most
8287 * of the energy savings of EAS come from the asymmetry of the system, and
8288 * not so much from breaking the tie between identical CPUs. That's also the
8289 * reason why EAS is enabled in the topology code only for systems where
8290 * SD_ASYM_CPUCAPACITY is set.
8291 *
8292 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
8293 * they don't have any useful utilization data yet and it's not possible to
8294 * forecast their impact on energy consumption. Consequently, they will be
8295 * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
8296 * to be energy-inefficient in some use-cases. The alternative would be to
8297 * bias new tasks towards specific types of CPUs first, or to try to infer
8298 * their util_avg from the parent task, but those heuristics could hurt
8299 * other use-cases too. So, until someone finds a better way to solve this,
8300 * let's keep things simple by re-using the existing slow path.
8301 */
8302 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
8303 {
8304 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
8305 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
8306 unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
8307 unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
8308 struct root_domain *rd = this_rq()->rd;
8309 int cpu, best_energy_cpu, target = -1;
8310 int prev_fits = -1, best_fits = -1;
8311 unsigned long best_actual_cap = 0;
8312 unsigned long prev_actual_cap = 0;
8313 struct sched_domain *sd;
8314 struct perf_domain *pd, *tmp_pd;
8315 struct energy_env eenv;
8316
8317 rcu_read_lock();
8318 pd = rcu_dereference_all(rd->pd);
8319 if (!pd)
8320 goto unlock;
8321
8322 /*
8323 * Energy-aware wake-up happens on the lowest sched_domain starting
8324 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
8325 */
8326 sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
8327 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
8328 sd = sd->parent;
8329 if (!sd)
8330 goto unlock;
8331
8332 target = prev_cpu;
8333
8334 sync_entity_load_avg(&p->se);
8335 if (!task_util_est(p) && p_util_min == 0)
8336 goto unlock;
8337
8338 eenv_task_busy_time(&eenv, p, prev_cpu);
8339
8340 /* Algorithmic Optimization: Pre-calculate max_util for O(1) energy estimation */
8341 for (tmp_pd = pd; tmp_pd; tmp_pd = tmp_pd->next) {
8342 unsigned long max_u = 0;
8343 int i;
8344
> 8345 for_each_cpu(i, perf_domain_span(tmp_pd)) {
8346 unsigned long util = cpu_util(i, p, -1, 1);
8347 unsigned long eff_util, min, max;
8348
8349 eff_util = effective_cpu_util(i, util, &min, &max);
8350 eff_util = sugov_effective_cpu_perf(i, eff_util, min, max);
8351 if (eff_util > max_u)
8352 max_u = eff_util;
8353 }
8354 tmp_pd->max_util = max_u;
8355 }
8356
8357 for (; pd; pd = pd->next) {
8358 unsigned long util_min = p_util_min, util_max = p_util_max;
8359 unsigned long cpu_cap, cpu_actual_cap, util;
8360 long prev_spare_cap = -1, max_spare_cap = -1;
8361 unsigned long rq_util_min, rq_util_max;
8362 unsigned long cur_delta, base_energy;
8363 int max_spare_cap_cpu = -1;
8364 int fits, max_fits = -1;
8365
8366 if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask))
8367 continue;
8368
8369 /* Account external pressure for the energy estimation */
8370 cpu = cpumask_first(cpus);
8371 cpu_actual_cap = get_actual_cpu_capacity(cpu);
8372
8373 eenv.cpu_cap = cpu_actual_cap;
8374 eenv.pd_cap = 0;
8375
8376 for_each_cpu(cpu, cpus) {
8377 struct rq *rq = cpu_rq(cpu);
8378
8379 eenv.pd_cap += cpu_actual_cap;
8380
8381 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
8382 continue;
8383
8384 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
8385 continue;
8386
8387 util = cpu_util(cpu, p, cpu, 0);
8388 cpu_cap = capacity_of(cpu);
8389
8390 /*
8391 * Skip CPUs that cannot satisfy the capacity request.
8392 * IOW, placing the task there would make the CPU
8393 * overutilized. Take uclamp into account to see how
8394 * much capacity we can get out of the CPU; this is
8395 * aligned with sched_cpu_util().
8396 */
8397 if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8398 /*
8399 * Open code uclamp_rq_util_with() except for
8400 * the clamp() part. I.e.: apply max aggregation
8401 * only. util_fits_cpu() logic requires to
8402 * operate on non clamped util but must use the
8403 * max-aggregated uclamp_{min, max}.
8404 */
8405 rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
8406 rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
8407
8408 util_min = max(rq_util_min, p_util_min);
8409 util_max = max(rq_util_max, p_util_max);
8410 }
8411
8412 fits = util_fits_cpu(util, util_min, util_max, cpu);
8413 if (!fits)
8414 continue;
8415
8416 lsub_positive(&cpu_cap, util);
8417
8418 if (cpu == prev_cpu) {
8419 /* Always use prev_cpu as a candidate. */
8420 prev_spare_cap = cpu_cap;
8421 prev_fits = fits;
8422 } else if ((fits > max_fits) ||
8423 ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8424 /*
8425 * Find the CPU with the maximum spare capacity
8426 * among the remaining CPUs in the performance
8427 * domain.
8428 */
8429 max_spare_cap = cpu_cap;
8430 max_spare_cap_cpu = cpu;
8431 max_fits = fits;
8432 }
8433 }
8434
8435 if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
8436 continue;
8437
8438 eenv_pd_busy_time(&eenv, cpus, p);
8439 /* Compute the 'base' energy of the pd, without @p */
8440 base_energy = compute_energy(&eenv, pd, cpus, p, -1);
8441
8442 /* Evaluate the energy impact of using prev_cpu. */
8443 if (prev_spare_cap > -1) {
8444 prev_delta = compute_energy(&eenv, pd, cpus, p,
8445 prev_cpu);
8446 /* CPU utilization has changed */
8447 if (prev_delta < base_energy)
8448 goto unlock;
8449 prev_delta -= base_energy;
8450 prev_actual_cap = cpu_actual_cap;
8451 best_delta = min(best_delta, prev_delta);
8452 }
8453
8454 /* Evaluate the energy impact of using max_spare_cap_cpu. */
8455 if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
8456 /* Current best energy cpu fits better */
8457 if (max_fits < best_fits)
8458 continue;
8459
8460 /*
8461 * Both don't fit performance hint (i.e. uclamp_min)
8462 * but best energy cpu has better capacity.
8463 */
8464 if ((max_fits < 0) &&
8465 (cpu_actual_cap <= best_actual_cap))
8466 continue;
8467
8468 cur_delta = compute_energy(&eenv, pd, cpus, p,
8469 max_spare_cap_cpu);
8470 /* CPU utilization has changed */
8471 if (cur_delta < base_energy)
8472 goto unlock;
8473 cur_delta -= base_energy;
8474
8475 /*
8476 * Both fit for the task but best energy cpu has lower
8477 * energy impact.
8478 */
8479 if ((max_fits > 0) && (best_fits > 0) &&
8480 (cur_delta >= best_delta))
8481 continue;
8482
8483 best_delta = cur_delta;
8484 best_energy_cpu = max_spare_cap_cpu;
8485 best_fits = max_fits;
8486 best_actual_cap = cpu_actual_cap;
8487 }
8488 }
8489 rcu_read_unlock();
8490
8491 if ((best_fits > prev_fits) ||
8492 ((best_fits > 0) && (best_delta < prev_delta)) ||
8493 ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
8494 target = best_energy_cpu;
8495
8496 return target;
8497
8498 unlock:
8499 rcu_read_unlock();
8500
8501 return target;
8502 }
8503
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Qiliang,
kernel test robot noticed the following build warnings:
[auto build test WARNING on tip/sched/core]
[also build test WARNING on tip/master peterz-queue/sched/core linus/master v6.19-rc6 next-20260122]
[cannot apply to tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Qiliang-Yuan/sched-fair-Optimize-EAS-energy-calculation-complexity-from-O-N-to-O-1-inside-inner-loop/20260123-000924
base: tip/sched/core
patch link: https://lore.kernel.org/r/20260122154227.130767-1-realwujing%40gmail.com
patch subject: [PATCH] sched/fair: Optimize EAS energy calculation complexity from O(N) to O(1) inside inner loop
config: sh-defconfig (https://download.01.org/0day-ci/archive/20260123/202601230422.1xw7nG4T-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260123/202601230422.1xw7nG4T-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601230422.1xw7nG4T-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from include/linux/bitmap.h:11,
from include/linux/cpumask.h:11,
from include/linux/energy_model.h:4,
from kernel/sched/fair.c:23:
kernel/sched/fair.c: In function 'find_energy_efficient_cpu':
>> include/linux/cpumask_types.h:18:37: warning: dereferencing 'void *' pointer
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^~
include/linux/find.h:586:48: note: in definition of macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
include/linux/cpumask.h:380:31: note: in expansion of macro 'cpumask_bits'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
include/linux/cpumask_types.h:18:37: error: request for member 'bits' in something not a structure or union
18 | #define cpumask_bits(maskp) ((maskp)->bits)
| ^~
include/linux/find.h:586:48: note: in definition of macro 'for_each_set_bit'
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^~~~
include/linux/cpumask.h:380:31: note: in expansion of macro 'cpumask_bits'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
>> include/linux/find.h:586:69: warning: left-hand operand of comma expression has no effect [-Wunused-value]
586 | for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
| ^
include/linux/cpumask.h:380:9: note: in expansion of macro 'for_each_set_bit'
380 | for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
| ^~~~~~~~~~~~~~~~
kernel/sched/fair.c:8345:17: note: in expansion of macro 'for_each_cpu'
8345 | for_each_cpu(i, perf_domain_span(tmp_pd)) {
| ^~~~~~~~~~~~
vim +18 include/linux/cpumask_types.h
eb4faa36d674ee Yury Norov 2024-05-27 10
eb4faa36d674ee Yury Norov 2024-05-27 11 /**
eb4faa36d674ee Yury Norov 2024-05-27 12 * cpumask_bits - get the bits in a cpumask
eb4faa36d674ee Yury Norov 2024-05-27 13 * @maskp: the struct cpumask *
eb4faa36d674ee Yury Norov 2024-05-27 14 *
eb4faa36d674ee Yury Norov 2024-05-27 15 * You should only assume nr_cpu_ids bits of this mask are valid. This is
eb4faa36d674ee Yury Norov 2024-05-27 16 * a macro so it's const-correct.
eb4faa36d674ee Yury Norov 2024-05-27 17 */
eb4faa36d674ee Yury Norov 2024-05-27 @18 #define cpumask_bits(maskp) ((maskp)->bits)
eb4faa36d674ee Yury Norov 2024-05-27 19
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 1/22/26 15:42, Qiliang Yuan wrote:
> By pre-calculating the base max utilization of each performance domain
> at the start of find_energy_efficient_cpu(), we can avoid the repetitive
> O(M) scan in eenv_pd_max_util() for every candidate CPU.
>
> This reduces the complexity of energy calculation from O(P*N) to O(N + P^2),
> where P is the number of performance domains. For systems with many
> performance domains or high core counts, this results in significant
> reduction in wakeup latency.
I don't think these complexity claims are correct, but also:
>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> ---
> kernel/sched/fair.c | 46 ++++++++++++++++++++++++-----------------
> kernel/sched/sched.h | 4 ++++
> kernel/sched/topology.c | 1 +
> 3 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 458324d240e9..de5bfdfe553f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8236,41 +8236,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
> * exceed @eenv->cpu_cap.
> */
> static inline unsigned long
> -eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
> +eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
> struct task_struct *p, int dst_cpu)
> {
> - unsigned long max_util = 0;
> - int cpu;
> + unsigned long max_util = pd->max_util;
>
> - for_each_cpu(cpu, pd_cpus) {
> - struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
> - unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
> + if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
> + unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
> unsigned long eff_util, min, max;
>
> - /*
> - * Performance domain frequency: utilization clamping
> - * must be considered since it affects the selection
> - * of the performance domain frequency.
> - * NOTE: in case RT tasks are running, by default the min
> - * utilization can be max OPP.
> - */
> - eff_util = effective_cpu_util(cpu, util, &min, &max);
> + eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
>
> /* Task's uclamp can modify min and max value */
> - if (tsk && uclamp_is_used()) {
> + if (uclamp_is_used()) {
> min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
>
> /*
> * If there is no active max uclamp constraint,
> * directly use task's one, otherwise keep max.
> */
> - if (uclamp_rq_is_idle(cpu_rq(cpu)))
> + if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
> max = uclamp_eff_value(p, UCLAMP_MAX);
> else
> max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
> }
>
> - eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
> + eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
> max_util = max(max_util, eff_util);
> }
>
> @@ -8286,7 +8277,7 @@ static inline unsigned long
> compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> {
> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> + unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
> unsigned long busy_time = eenv->pd_busy_time;
> unsigned long energy;
>
> @@ -8351,7 +8342,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> unsigned long best_actual_cap = 0;
> unsigned long prev_actual_cap = 0;
> struct sched_domain *sd;
> - struct perf_domain *pd;
> + struct perf_domain *pd, *tmp_pd;
> struct energy_env eenv;
>
> rcu_read_lock();
> @@ -8377,6 +8368,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
> eenv_task_busy_time(&eenv, p, prev_cpu);
>
> + /* Algorithmic Optimization: Pre-calculate max_util for O(1) energy estimation */
> + for (tmp_pd = pd; tmp_pd; tmp_pd = tmp_pd->next) {
> + unsigned long max_u = 0;
> + int i;
> +
> + for_each_cpu(i, perf_domain_span(tmp_pd)) {
> + unsigned long util = cpu_util(i, p, -1, 1);
> + unsigned long eff_util, min, max;
> +
> + eff_util = effective_cpu_util(i, util, &min, &max);
> + eff_util = sugov_effective_cpu_perf(i, eff_util, min, max);
> + if (eff_util > max_u)
> + max_u = eff_util;
> + }
> + tmp_pd->max_util = max_u;
You can't do this as there's no synchronisation for perf_domain apart from
the rcu_read_lock() here.
> + }
> +
> for (; pd; pd = pd->next) {
> unsigned long util_min = p_util_min, util_max = p_util_max;
> unsigned long cpu_cap, cpu_actual_cap, util;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d30cca6870f5..f308d335ca77 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -972,6 +972,10 @@ struct perf_domain {
> struct em_perf_domain *em_pd;
> struct perf_domain *next;
> struct rcu_head rcu;
> +
> + /* O(1) optimization hints */
> + unsigned long max_util;
> + int max_spare_cap_cpu;
> };
>
> /*
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 5a3f29a26bdb..b9de022ddb53 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -354,6 +354,7 @@ static struct perf_domain *pd_init(int cpu)
> if (!pd)
> return NULL;
> pd->em_pd = obj;
> + pd->max_spare_cap_cpu = -1;
>
> return pd;
> }
Pre-calculate the base maximum utilization of each performance domain during the
main loop of find_energy_efficient_cpu() and cache it in the local
'energy_env' structure.
By caching this base value, the maximum utilization for candidate CPU
placements (such as prev_cpu and max_spare_cap_cpu) can be determined in
O(1) time, eliminating redundant scans of the performance domain. This
optimizes the energy estimation path by reducing the number of scans per
performance domain from three to one.
This change significantly reduces wake-up latency on systems with high core
counts or complex performance domain topologies by minimizing the overall
complexity of the Energy-Aware Scheduling (EAS) calculation.
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
v2:
- Ensure RCU safety by using local 'energy_env' for caching instead of
modifying the shared 'perf_domain' structure.
- Consolidate pre-calculation into the main loop to avoid an extra pass
over the performance domains.
v1:
- Optimize energy calculation by pre-calculating performance domain max utilization.
- Add max_util and max_spare_cap_cpu to struct perf_domain.
- Reduce inner loop complexity from O(N) to O(1) for energy estimation.
kernel/sched/fair.c | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..5c114c49c202 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8148,6 +8148,7 @@ struct energy_env {
unsigned long pd_busy_time;
unsigned long cpu_cap;
unsigned long pd_cap;
+ unsigned long pd_max_util;
};
/*
@@ -8215,41 +8216,32 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
* exceed @eenv->cpu_cap.
*/
static inline unsigned long
-eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+eenv_pd_max_util(struct energy_env *eenv, struct perf_domain *pd,
struct task_struct *p, int dst_cpu)
{
- unsigned long max_util = 0;
- int cpu;
+ unsigned long max_util = eenv->pd_max_util;
- for_each_cpu(cpu, pd_cpus) {
- struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ if (dst_cpu >= 0 && cpumask_test_cpu(dst_cpu, perf_domain_span(pd))) {
+ unsigned long util = cpu_util(dst_cpu, p, dst_cpu, 1);
unsigned long eff_util, min, max;
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the min
- * utilization can be max OPP.
- */
- eff_util = effective_cpu_util(cpu, util, &min, &max);
+ eff_util = effective_cpu_util(dst_cpu, util, &min, &max);
/* Task's uclamp can modify min and max value */
- if (tsk && uclamp_is_used()) {
+ if (uclamp_is_used()) {
min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
/*
* If there is no active max uclamp constraint,
* directly use task's one, otherwise keep max.
*/
- if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ if (uclamp_rq_is_idle(cpu_rq(dst_cpu)))
max = uclamp_eff_value(p, UCLAMP_MAX);
else
max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
}
- eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ eff_util = sugov_effective_cpu_perf(dst_cpu, eff_util, min, max);
max_util = max(max_util, eff_util);
}
@@ -8265,7 +8257,7 @@ static inline unsigned long
compute_energy(struct energy_env *eenv, struct perf_domain *pd,
struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
{
- unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long max_util = eenv_pd_max_util(eenv, pd, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
unsigned long energy;
@@ -8376,12 +8368,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
+ eenv.pd_max_util = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
+ unsigned long util_b, eff_util_b, min_b, max_b;
eenv.pd_cap += cpu_actual_cap;
+ /* Pre-calculate base max utilization for the performance domain */
+ util_b = cpu_util(cpu, p, -1, 1);
+ eff_util_b = effective_cpu_util(cpu, util_b, &min_b, &max_b);
+ eff_util_b = sugov_effective_cpu_perf(cpu, eff_util_b, min_b, max_b);
+ eenv.pd_max_util = max(eenv.pd_max_util, eff_util_b);
+
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
--
2.51.0
© 2016 - 2026 Red Hat, Inc.