feec() looks for the CPU with highest spare capacity in a PD assuming that
it will be the best CPU from an energy efficiency PoV because it will
require the smallest increase of OPP. Although this is true generally
speaking, this policy also filters out some other CPUs which will be just as
efficient because of using the same OPP.
In fact, we really care about the cost of the new OPP that will be
selected to handle the waking task. In many cases, several CPUs will end
up selecting the same OPP and as a result using the same energy cost. In
these cases, we can use other metrics to select the best CPU for the same
energy cost.
Rework feec() to look 1st for the lowest cost in a PD and then for the most
performant CPU among them.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
1 file changed, 244 insertions(+), 222 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e67d6029b269..2273eecf6086 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8081,29 +8081,37 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
/*
- * energy_env - Utilization landscape for energy estimation.
- * @task_busy_time: Utilization contribution by the task for which we test the
- * placement. Given by eenv_task_busy_time().
- * @pd_busy_time: Utilization of the whole perf domain without the task
- * contribution. Given by eenv_pd_busy_time().
- * @cpu_cap: Maximum CPU capacity for the perf domain.
- * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
- */
-struct energy_env {
- unsigned long task_busy_time;
- unsigned long pd_busy_time;
- unsigned long cpu_cap;
- unsigned long pd_cap;
+ * energy_cpu_stat - Utilization landscape for energy estimation.
+ * @idx : Index of the OPP in the performance domain
+ * @cost : Cost of the OPP
+ * @max_perf : Compute capacity of OPP
+ * @min_perf : Compute capacity of the previous OPP
+ * @capa : Capacity of the CPU
+ * @util : Utilization of the CPU
+ * @runnable : runnable_avg of the CPU
+ * @nr_running : number of cfs running tasks
+ * @fits : Fits level of the CPU
+ * @cpu : current best CPU
+ */
+struct energy_cpu_stat {
+ unsigned long idx;
+ unsigned long cost;
+ unsigned long max_perf;
+ unsigned long min_perf;
+ unsigned long capa;
+ unsigned long util;
+ unsigned long runnable;
+ unsigned int nr_running;
+ int fits;
+ int cpu;
};
/*
- * Compute the task busy time for compute_energy(). This time cannot be
- * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * Compute the task busy time for computing its energy impact. This time cannot
+ * be injected directly into effective_cpu_util() because of the IRQ scaling.
* The latter only makes sense with the most recent CPUs where the task has
* run.
*/
-static inline void eenv_task_busy_time(struct energy_env *eenv,
- struct task_struct *p, int prev_cpu)
+static inline unsigned long task_busy_time(struct task_struct *p, int prev_cpu)
{
unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
@@ -8113,124 +8121,152 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
else
busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
- eenv->task_busy_time = busy_time;
+ return busy_time;
}
-/*
- * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
- * utilization for each @pd_cpus, it however doesn't take into account
- * clamping since the ratio (utilization / cpu_capacity) is already enough to
- * scale the EM reported power consumption at the (eventually clamped)
- * cpu_capacity.
- *
- * The contribution of the task @p for which we want to estimate the
- * energy cost is removed (by cpu_util()) and must be calculated
- * separately (see eenv_task_busy_time). This ensures:
- *
- * - A stable PD utilization, no matter which CPU of that PD we want to place
- * the task on.
- *
- * - A fair comparison between CPUs as the task contribution (task_util())
- * will always be the same no matter which CPU utilization we rely on
- * (util_avg or util_est).
- *
- * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
- * exceed @eenv->pd_cap.
- */
-static inline void eenv_pd_busy_time(struct energy_env *eenv,
- struct cpumask *pd_cpus,
- struct task_struct *p)
+/* Estimate the utilization of the CPU that is then used to select the OPP */
+static unsigned long find_cpu_max_util(int cpu, struct task_struct *p, int dst_cpu)
{
- unsigned long busy_time = 0;
- int cpu;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util, min, max;
+
+ /*
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ */
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
- for_each_cpu(cpu, pd_cpus) {
- unsigned long util = cpu_util(cpu, p, -1, 0);
+ /* Task's uclamp can modify min and max value */
+ if (uclamp_is_used() && cpu == dst_cpu) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
- busy_time += effective_cpu_util(cpu, util, NULL, NULL);
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
}
- eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ return eff_util;
}
-/*
- * Compute the maximum utilization for compute_energy() when the task @p
- * is placed on the cpu @dst_cpu.
- *
- * Returns the maximum utilization among @eenv->cpus. This utilization can't
- * exceed @eenv->cpu_cap.
- */
-static inline unsigned long
-eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
- struct task_struct *p, int dst_cpu)
+/* Estimate the utilization of the CPU without the task */
+static unsigned long find_cpu_actual_util(int cpu, struct task_struct *p)
{
- unsigned long max_util = 0;
- int cpu;
+ unsigned long util = cpu_util(cpu, p, -1, 0);
+ unsigned long eff_util;
- for_each_cpu(cpu, pd_cpus) {
- struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
- unsigned long eff_util, min, max;
+ eff_util = effective_cpu_util(cpu, util, NULL, NULL);
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the min
- * utilization can be max OPP.
- */
- eff_util = effective_cpu_util(cpu, util, &min, &max);
+ return eff_util;
+}
- /* Task's uclamp can modify min and max value */
- if (tsk && uclamp_is_used()) {
- min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+/* Find the cost of a performance domain for the estimated utilization */
+static inline void find_pd_cost(struct em_perf_domain *pd,
+ unsigned long max_util,
+ struct energy_cpu_stat *stat)
+{
+ struct em_perf_table *em_table;
+ struct em_perf_state *ps;
+ int i;
- /*
- * If there is no active max uclamp constraint,
- * directly use task's one, otherwise keep max.
- */
- if (uclamp_rq_is_idle(cpu_rq(cpu)))
- max = uclamp_eff_value(p, UCLAMP_MAX);
- else
- max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
- }
+ /*
+ * Find the lowest performance state of the Energy Model above the
+ * requested performance.
+ */
+ em_table = rcu_dereference(pd->em_table);
+ i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
+ max_util, pd->flags);
+ ps = &em_table->state[i];
- eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
- max_util = max(max_util, eff_util);
+ /* Save the cost and performance range of the OPP */
+ stat->max_perf = ps->performance;
+ stat->cost = ps->cost;
+ i = em_pd_get_previous_state(em_table->state, pd->nr_perf_states,
+ i, pd->flags);
+ if (i < 0)
+ stat->min_perf = 0;
+ else {
+ ps = &em_table->state[i];
+ stat->min_perf = ps->performance;
}
-
- return min(max_util, eenv->cpu_cap);
}
-/*
- * compute_energy(): Use the Energy Model to estimate the energy that @pd would
- * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
- * contribution is ignored.
- */
-static inline unsigned long
-compute_energy(struct energy_env *eenv, struct perf_domain *pd,
- struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+/* Check if the CPU can handle the waking task */
+static int check_cpu_with_task(struct task_struct *p, int cpu)
{
- unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
- unsigned long busy_time = eenv->pd_busy_time;
- unsigned long energy;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
+ unsigned long util_min = p_util_min;
+ unsigned long util_max = p_util_max;
+ unsigned long util = cpu_util(cpu, p, cpu, 0);
+ struct rq *rq = cpu_rq(cpu);
- if (dst_cpu >= 0)
- busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with sched_cpu_util().
+ */
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ unsigned long rq_util_min, rq_util_max;
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. I.e.: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+ return util_fits_cpu(util, util_min, util_max, cpu);
+}
- energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+/* For the same cost, select the CPU that will provide the best performance for the task */
+static bool select_best_cpu(struct energy_cpu_stat *target,
+ struct energy_cpu_stat *min,
+ int prev, struct sched_domain *sd)
+{
+ /* Select the one with the least number of running tasks */
+ if (target->nr_running < min->nr_running)
+ return true;
+ if (target->nr_running > min->nr_running)
+ return false;
- trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+ /* Favor previous CPU otherwise */
+ if (target->cpu == prev)
+ return true;
+ if (min->cpu == prev)
+ return false;
- return energy;
+ /*
+ * Choose CPU with lowest contention. One might want to consider load instead of
+ * runnable but we are supposed to not be overutilized so there is enough compute
+ * capacity for everybody.
+ */
+ if ((target->runnable * min->capa * sd->imbalance_pct) >=
+ (min->runnable * target->capa * 100))
+ return false;
+
+ return true;
}
/*
* find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
- * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
- * spare capacity in each performance domain and uses it as a potential
- * candidate to execute the task. Then, it uses the Energy Model to figure
- * out which of the CPU candidates is the most energy-efficient.
+ * waking task. find_energy_efficient_cpu() looks for the CPU with the lowest
+ * power cost (usually with maximum spare capacity but not always) in each
+ * performance domain and uses it as a potential candidate to execute the task.
+ * Then, it uses the Energy Model to figure out which of the CPU candidates is
+ * the most energy-efficient.
*
* The rationale for this heuristic is as follows. In a performance domain,
* all the most energy efficient CPU candidates (according to the Energy
@@ -8267,17 +8303,14 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
- unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
- unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
+ unsigned long task_util;
+ unsigned long best_nrg = ULONG_MAX;
+ int best_fits = -1;
+ int best_cpu = -1;
struct root_domain *rd = this_rq()->rd;
- int cpu, best_energy_cpu, target = -1;
- int prev_fits = -1, best_fits = -1;
- unsigned long best_actual_cap = 0;
- unsigned long prev_actual_cap = 0;
+ int cpu, target = -1;
struct sched_domain *sd;
struct perf_domain *pd;
- struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
@@ -8296,20 +8329,21 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
- sync_entity_load_avg(&p->se);
- if (!task_util_est(p) && p_util_min == 0)
- goto unlock;
- eenv_task_busy_time(&eenv, p, prev_cpu);
+ sync_entity_load_avg(&p->se);
+ task_util = task_busy_time(p, prev_cpu);
for (; pd; pd = pd->next) {
- unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_actual_cap, util;
- long prev_spare_cap = -1, max_spare_cap = -1;
- unsigned long rq_util_min, rq_util_max;
- unsigned long cur_delta, base_energy;
- int max_spare_cap_cpu = -1;
- int fits, max_fits = -1;
+ unsigned long cpu_actual_cap, max_cost = 0;
+ unsigned long pd_actual_util = 0, delta_nrg = 0;
+ struct energy_cpu_stat target_stat;
+ struct energy_cpu_stat min_stat = {
+ .cost = ULONG_MAX,
+ .max_perf = ULONG_MAX,
+ .min_perf = ULONG_MAX,
+ .fits = -2,
+ .cpu = -1,
+ };
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -8320,13 +8354,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
cpu = cpumask_first(cpus);
cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_actual_cap;
- eenv.pd_cap = 0;
-
+ /* In a PD, the CPU with the lowest cost will be the most efficient */
for_each_cpu(cpu, cpus) {
- struct rq *rq = cpu_rq(cpu);
-
- eenv.pd_cap += cpu_actual_cap;
+ unsigned long target_perf;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8334,120 +8364,112 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util(cpu, p, cpu, 0);
- cpu_cap = capacity_of(cpu);
+ target_stat.fits = check_cpu_with_task(p, cpu);
+
+ if (!target_stat.fits)
+ continue;
+
+ /* 1st select the CPU that fits best */
+ if (target_stat.fits < min_stat.fits)
+ continue;
+
+ /* Then select the CPU with lowest cost */
+
+ /* Get the performance of the CPU w/ waking task. */
+ target_perf = find_cpu_max_util(cpu, p, cpu);
+ target_perf = min(target_perf, cpu_actual_cap);
+
+ /* Needing a higher OPP means a higher cost */
+ if (target_perf > min_stat.max_perf)
+ continue;
/*
- * Skip CPUs that cannot satisfy the capacity request.
- * IOW, placing the task there would make the CPU
- * overutilized. Take uclamp into account to see how
- * much capacity we can get out of the CPU; this is
- * aligned with sched_cpu_util().
+ * At this point, target's cost can be either equal or
+ * lower than the current minimum cost.
*/
- if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
- /*
- * Open code uclamp_rq_util_with() except for
- * the clamp() part. I.e.: apply max aggregation
- * only. util_fits_cpu() logic requires to
- * operate on non clamped util but must use the
- * max-aggregated uclamp_{min, max}.
- */
- rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
- rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
- util_min = max(rq_util_min, p_util_min);
- util_max = max(rq_util_max, p_util_max);
- }
+ /* Gather more statistics */
+ target_stat.cpu = cpu;
+ target_stat.runnable = cpu_runnable(cpu_rq(cpu));
+ target_stat.capa = capacity_of(cpu);
+ target_stat.nr_running = cpu_rq(cpu)->cfs.h_nr_running;
- fits = util_fits_cpu(util, util_min, util_max, cpu);
- if (!fits)
+ /* If the target needs a lower OPP, then look up for
+ * the corresponding OPP and its associated cost.
+ * Otherwise at same cost level, select the CPU which
+ * provides best performance.
+ */
+ if (target_perf < min_stat.min_perf)
+ find_pd_cost(pd->em_pd, target_perf, &target_stat);
+ else if (!select_best_cpu(&target_stat, &min_stat, prev_cpu, sd))
continue;
- lsub_positive(&cpu_cap, util);
-
- if (cpu == prev_cpu) {
- /* Always use prev_cpu as a candidate. */
- prev_spare_cap = cpu_cap;
- prev_fits = fits;
- } else if ((fits > max_fits) ||
- ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
- /*
- * Find the CPU with the maximum spare capacity
- * among the remaining CPUs in the performance
- * domain.
- */
- max_spare_cap = cpu_cap;
- max_spare_cap_cpu = cpu;
- max_fits = fits;
- }
+ /* Save the new most efficient CPU of the PD */
+ min_stat = target_stat;
}
- if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
+ if (min_stat.cpu == -1)
continue;
- eenv_pd_busy_time(&eenv, cpus, p);
- /* Compute the 'base' energy of the pd, without @p */
- base_energy = compute_energy(&eenv, pd, cpus, p, -1);
+ if (min_stat.fits < best_fits)
+ continue;
- /* Evaluate the energy impact of using prev_cpu. */
- if (prev_spare_cap > -1) {
- prev_delta = compute_energy(&eenv, pd, cpus, p,
- prev_cpu);
- /* CPU utilization has changed */
- if (prev_delta < base_energy)
- goto unlock;
- prev_delta -= base_energy;
- prev_actual_cap = cpu_actual_cap;
- best_delta = min(best_delta, prev_delta);
- }
+ /* Idle system costs nothing */
+ target_stat.max_perf = 0;
+ target_stat.cost = 0;
- /* Evaluate the energy impact of using max_spare_cap_cpu. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
- /* Current best energy cpu fits better */
- if (max_fits < best_fits)
- continue;
+ /* Estimate utilization and cost without p */
+ for_each_cpu(cpu, cpus) {
+ unsigned long target_util;
- /*
- * Both don't fit performance hint (i.e. uclamp_min)
- * but best energy cpu has better capacity.
- */
- if ((max_fits < 0) &&
- (cpu_actual_cap <= best_actual_cap))
- continue;
+ /* Accumulate actual utilization w/o task p */
+ pd_actual_util += find_cpu_actual_util(cpu, p);
- cur_delta = compute_energy(&eenv, pd, cpus, p,
- max_spare_cap_cpu);
- /* CPU utilization has changed */
- if (cur_delta < base_energy)
- goto unlock;
- cur_delta -= base_energy;
+ /* Get the max utilization of the CPU w/o task p */
+ target_util = find_cpu_max_util(cpu, p, -1);
+ target_util = min(target_util, cpu_actual_cap);
- /*
- * Both fit for the task but best energy cpu has lower
- * energy impact.
- */
- if ((max_fits > 0) && (best_fits > 0) &&
- (cur_delta >= best_delta))
+ /* Current OPP is enough */
+ if (target_util <= target_stat.max_perf)
continue;
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- best_fits = max_fits;
- best_actual_cap = cpu_actual_cap;
+ /* Compute and save the cost of the OPP */
+ find_pd_cost(pd->em_pd, target_util, &target_stat);
+ max_cost = target_stat.cost;
}
- }
- rcu_read_unlock();
- if ((best_fits > prev_fits) ||
- ((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
- target = best_energy_cpu;
+ /* Add the NRG cost of p */
+ delta_nrg = task_util * min_stat.cost;
- return target;
+ /* Compute the NRG cost of others running at higher OPP because of p */
+ if (min_stat.cost > max_cost)
+ delta_nrg += pd_actual_util * (min_stat.cost - max_cost);
+
+ /* nrg with p */
+ trace_sched_compute_energy_tp(p, min_stat.cpu, delta_nrg,
+ min_stat.max_perf, pd_actual_util + task_util);
+
+ /*
+ * The probability that delta NRGs are equal is almost nil. PDs being sorted
+ * by max capacity, keep the one with highest max capacity if this
+ * happens.
+ * TODO: add a margin in nrg cost and take into account other stats
+ */
+ if ((min_stat.fits == best_fits) &&
+ (delta_nrg >= best_nrg))
+ continue;
+
+ best_fits = min_stat.fits;
+ best_nrg = delta_nrg;
+ best_cpu = min_stat.cpu;
+ }
unlock:
rcu_read_unlock();
+ if (best_cpu >= 0)
+ target = best_cpu;
+
return target;
}
--
2.34.1
Hello Vincent,
On 8/30/24 15:03, Vincent Guittot wrote:
> feec() looks for the CPU with highest spare capacity in a PD assuming that
> it will be the best CPU from a energy efficiency PoV because it will
> require the smallest increase of OPP. Although this is true generally
> speaking, this policy also filters some others CPUs which will be as
> efficients because of using the same OPP.
> In fact, we really care about the cost of the new OPP that will be
> selected to handle the waking task. In many cases, several CPUs will end
> up selecting the same OPP and as a result using the same energy cost. In
> these cases, we can use other metrics to select the best CPU for the same
> energy cost.
>
> Rework feec() to look 1st for the lowest cost in a PD and then the most
> performant CPU between CPUs.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> 1 file changed, 244 insertions(+), 222 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e67d6029b269..2273eecf6086 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
[snip]
>
> -/*
> - * compute_energy(): Use the Energy Model to estimate the energy that @pd would
> - * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
> - * contribution is ignored.
> - */
> -static inline unsigned long
> -compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> - struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> +/*Check if the CPU can handle the waking task */
> +static int check_cpu_with_task(struct task_struct *p, int cpu)
> {
> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> - unsigned long busy_time = eenv->pd_busy_time;
> - unsigned long energy;
> + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
> + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> + unsigned long util_min = p_util_min;
> + unsigned long util_max = p_util_max;
> + unsigned long util = cpu_util(cpu, p, cpu, 0);
> + struct rq *rq = cpu_rq(cpu);
>
> - if (dst_cpu >= 0)
> - busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
I think you should mention that the energy computation is not capped anymore.
It used to be:
pd_util: sum of the CPU's util for the pd considered, without task_util
pd_cap: sum of the CPU's capacity for the pd
(a)
busy_time = min(pd_cap, pd_util);
prev_energy = busy_time * OPP[prev_max_util].cost;
busy_time = min(pd_cap, pd_util + task_util);
new_energy = busy_time * OPP[new_max_util].cost;
delta_energy = new_energy - prev_energy;
Note that when computing busy_time, task_util is not capped to one CPU's
max_cap. This means that in empty pd, a task can 'steal' capacity from
CPUs during the energy computation.
Cf. [1]
and it is now:
(b)
delta_energy = task_util * OPP[new_max_util].cost;
delta_energy += pd_util * (OPP[new_max_util].cost - OPP[prev_max_util].cost);
Note that the busy_time (task_util + pd_util) is now not capped by anything.
---
Not capping the task_util is discussed in [1][3] and [2] (at 21:15).
IIUC, UCLAMP_MAX tasks are the only case leveraging this. Indeed,
non-clamped tasks will not fit and be placed on a bigger CPU. Same for
UCLAMP_MIN tasks.
FWIU, not capping the utilization of tasks during the energy computation
allows to reflect that a task will (likely) run longer. However the
task's performance would likely decrease as the other tasks on the target
CPU are not taken into account (it is assumed the task to be placed will
receive the compute power it requires).
---
Case A:
Assuming we have an empty system with:
- 4 little CPUs (max_capa=512, first OPP as [capa=256, cost=10])
- 2 big CPUs (max_capa=1024, first OPP as [capa=512, cost=10])
i.e. the first OPP of all the CPU consumes the same amount of energy.
And a task with: [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
Then feec() would have no reason to prefer a big CPU over a little CPU,
even though the big CPU would provide more performance.
---
Case B:
(This is not especially related to this patch.)
Another case that might be problematic:
- 4 little CPUs (max_capa=512, first OPP as [capa=256])
- 2 big CPUs (max_capa=1024, first OPP as [capa=512])
- little CPUs consume less than big CPUs, but the highest OPP
of the little CPUs consume more than the lowest of the big CPUs.
And tasks:
- 3 tasks with [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
- 1 task with [UCLAMP_MIN=0, UCLAMP_MAX=1024, util = 50]
Then
- the 3 UCLAMP_MAX tasks will be placed on the little CPUs. Indeed,
due to the last patch of the serie, these tasks have now an opportunity
to run feec() and be placed on a more energy efficient CPU.
- the 'normal' task will be placed on a big CPU. Indeed, placing
it on a little CPU would raise the OPP of the little cluster.
This means that the 'normal' task is prevented to run the remaining little
CPU even though:
- the little CPU can provide the compute capacity
- the little CPU would consume less energy
In other terms, using UCLAMP_MAX on some tasks makes the system consume
more energy.
---
In my opinion, this last case comes from the difficulty of defining UCLAMP_MAX.
From sched-util-clamp.rst (about UCLAMP_MAX):
- Uclamp is a hinting mechanism that allows the scheduler to understand the
performance requirements and restrictions of the tasks
- The right way to view util clamp is as a mechanism to make request or hint on
performance constraints.
- some tasks should be restricted from consuming too
much resources and should not go above a specific performance point.
-
Another example is in Android where tasks are classified as background,
foreground, top-app, etc. Util clamp can be used to constrain how much
resources background tasks are consuming by capping the performance point they
can run at. This constraint helps reserve resources for important tasks, like
the ones belonging to the currently active app (top-app group). Beside this
helps in limiting how much power they consume. This can be more obvious in
heterogeneous systems (e.g. Arm big.LITTLE); the constraint will help bias the
background tasks to stay on the little cores which will ensure that:
1. The big cores are free to run top-app tasks immediately. top-app
tasks are the tasks the user is currently interacting with, hence
the most important tasks in the system.
2. They don't run on a power hungry core and drain battery even if they
are CPU intensive tasks.
-
For example, it can be handy to limit performance when running low on battery
or when the system wants to limit access to more energy hungry performance
levels when it's in idle state or screen is off.
"""
This constraint helps reserve resources for important tasks, like
the ones belonging to the currently active app (top-app group).
"""
It doesn't seem that UCLAMP_MAX does this. This looks more like bandwidth
control.
"""
They don't run on a power hungry core and drain battery even if they
are CPU intensive tasks.
"""
Avoiding mid/big CPUs could be done with tasksets,
I can understand that one might want to run a task at a higher OPP due to
timing constraints for instance. However I don't see why someone would want
to run a task at a lower OPP, regarding only the performance and not the
energy consumption. It thus means that UCLAMP_MAX is an energy hint rather
than a performance hint.
UCLAMP_MAX could be set for a task to make it spend less energy, but the
loss in performance would be unknown.
A case Hongyan mentioned in his uclamp sum aggregation serie [4] is that
an infinite task with [UCLAMP_MIN=0, UCLAMP_MAX=1, util = 1000] could fit
in a little CPU. The energy consumption would indeed be very low, but the
performance would also be very low.
With Hongyan's sum aggregation serie [5]:
- case B would not happen as the 'normal' task would not raise the OPP of
the whole cluster.
- the 'infinite UCLAMP_MAX tasks' case would not happen as each task would
account for 1 util
- case A would still happen, but could be solved in any case.
I know Hongyan's patchset has already been discussed, but I still don't
understand why max aggregation is preferred over sum aggregation.
The definition of UCLAMP_MAX values seems clearer and in effect results in
a simpler implementation and less corner cases. In simple words:
"When estimating the CPU frequency to set, for this task,
account for at most X util."
rather than:
"When estimating the CPU frequency to set, the task with the highest
UCLAMP_MAX of the CPU will cap the requested frequency."
Note that actually I think it's a good idea to run feec() regularly
and to take into account other parameters like nr_running. I just think
that UCLAMP_MAX's max aggregation creates corner cases that are difficult
to solve altogether.
Thanks in advance for the time you will take answering,
Regards,
Pierre
[1] https://lore.kernel.org/all/20240606070645.3295-1-xuewen.yan@unisoc.com/
[2] https://www.youtube.com/watch?v=PHEBAyxeM_M
[3] https://lore.kernel.org/all/CAKfTPtDPCPYvCi1c_Nh+Cn01ZVS7E=tAHQeNX-mArBt3BXdjYw@mail.gmail.com/
[4] https://lore.kernel.org/all/b81a5b1c-14de-4232-bee9-ee647355dd8c@arm.com/
[5] https://lore.kernel.org/all/cover.1706792708.git.hongyan.xia2@arm.com/#t
On Wed, 11 Sept 2024 at 16:03, Pierre Gondois <pierre.gondois@arm.com> wrote:
>
> Hello Vincent,
>
> On 8/30/24 15:03, Vincent Guittot wrote:
> > feec() looks for the CPU with highest spare capacity in a PD assuming that
> > it will be the best CPU from a energy efficiency PoV because it will
> > require the smallest increase of OPP. Although this is true generally
> > speaking, this policy also filters some others CPUs which will be as
> > efficients because of using the same OPP.
> > In fact, we really care about the cost of the new OPP that will be
> > selected to handle the waking task. In many cases, several CPUs will end
> > up selecting the same OPP and as a result using the same energy cost. In
> > these cases, we can use other metrics to select the best CPU for the same
> > energy cost.
> >
> > Rework feec() to look 1st for the lowest cost in a PD and then the most
> > performant CPU between CPUs.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> > kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> > 1 file changed, 244 insertions(+), 222 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index e67d6029b269..2273eecf6086 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
>
> [snip]
>
> >
> > -/*
> > - * compute_energy(): Use the Energy Model to estimate the energy that @pd would
> > - * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
> > - * contribution is ignored.
> > - */
> > -static inline unsigned long
> > -compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> > - struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> > +/*Check if the CPU can handle the waking task */
> > +static int check_cpu_with_task(struct task_struct *p, int cpu)
> > {
> > - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> > - unsigned long busy_time = eenv->pd_busy_time;
> > - unsigned long energy;
> > + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
> > + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > + unsigned long util_min = p_util_min;
> > + unsigned long util_max = p_util_max;
> > + unsigned long util = cpu_util(cpu, p, cpu, 0);
> > + struct rq *rq = cpu_rq(cpu);
> >
> > - if (dst_cpu >= 0)
> > - busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
>
> I think you should mention that the energy computation is not capped anymore.
> It used to be:
> pd_util: sum of the CPU's util for the pd considered, without task_util
> pd_cap: sum of the CPU's capacity for the pd
>
> (a)
> busy_time = min(pd_cap, pd_util);
> prev_energy = busy_time * OPP[prev_max_util].cost;
>
> busy_time = min(pd_cap, pd_util + task_util);
> new_energy = busy_time * OPP[new_max_util].cost;
>
> delta_energy = new_energy - prev_energy;
>
> Note that when computing busy_time, task_util is not capped to one CPU's
> max_cap. This means that in empty pd, a task can 'steal' capacity from
> CPUs during the energy computation.
> Cf. [1]
>
> and it is now:
> (b)
> delta_energy = task_util * OPP[new_max_util].cost;
> delta_energy += pd_util * (OPP[new_max_util].cost - OPP[prev_max_util].cost);
>
> Note that the busy_time (task_util + pd_util) is now not capped by anything.
As discussed in [1], capping utilization with max capacity is a
mistake because we lost the information that this will run longer
>
> ---
>
> Not capping the task_util is discussed in [1][3] and [2] (at 21:15).
> IIUC, UCLAMP_MAX tasks are the only case leveraging this. Indeed,
> non-clamped tasks will not fit and be placed on a bigger CPU. Same for
> UCLAMP_MIN tasks.
> FWIU, not capping the utilization of tasks during the energy computation
> allows to reflect that a task will (likely) run longer. However the
> task's performance would likely decrease as the other tasks on the target
> CPU are not taken into account (it is assumed the task to be placed will
> receive the compute power it requires).
>
> ---
> Case A:
>
> Assuming we have an empty system with:
> - 4 little CPUs (max_capa=512, first OPP as [capa=256, cost=10])
> - 2 big CPUs (max_capa=1024, first OPP as [capa=512, cost=10])
Quite an inefficient hardware design here where the big core provides
twice more capacity for the same cost of the little for their 1st OPP
!!!
> i.e. the first OPP of all the CPU consumes the same amount of energy.
> And a task with: [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
>
> Then feec() would have no reason to prefer a big CPU over a little CPU,
> even though the big CPU would provide more performance.
As mentioned in the comment of feec(), I don't expect to face a
situation where the delta energy is equal for 2 PDs especially with
the uWatt that has been introduced to prevent such situation. But if
this should happen, it is in the TODO to add margin and to take other
stats into account like compute capacity. Also, the PDs are sorted by
max capacity so we currently keep the one with highest capacity ie big
CPU in your case
>
> ---
> Case B:
>
> (This is not especially related to this patch.)
> Another case that might be problematic:
> - 4 little CPUs (max_capa=512, first OPP as [capa=256])
> - 2 big CPUs (max_capa=1024, first OPP as [capa=512])
> - little CPUs consume less than big CPUs, but the highest OPP
> of the little CPUs consume more than the lowest of the big CPUs.
> And tasks:
> - 3 tasks with [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
> - 1 task with [UCLAMP_MIN=0, UCLAMP_MAX=1024, util = 50]
>
> Then
> - the 3 UCLAMP_MAX tasks will be placed on the little CPUs. Indeed,
> due to the last patch of the serie, these tasks have now an opportunity
> to run feec() and be placed on a more energy efficient CPU.
> - the 'normal' task will be placed on a big CPU. Indeed, placing
> it on a little CPU would raise the OPP of the little cluster.
>
> This means that the 'normal' task is prevented to run the remaining little
> CPU even though:
> - the little CPU can provide the compute capacity
> - the little CPU would consume less energy
>
> In other terms, using UCLAMP_MAX on some tasks makes the system consume
> more energy.
You have probably noticed that this patchset doesn't make any
assumption about uclamp_max/min behavior and the below doesn't seem to
be related to this patchset but to the uclamp_max behavior so I don't
think it's the right place to discuss this. A talk or a BoF at LPC
would have been a better place
>
> ---
>
> In my opinion, this last case comes from the difficulty of defining UCLAMP_MAX.
> From sched-util-clamp.rst (about UCLAMP_MAX):
> - Uclamp is a hinting mechanism that allows the scheduler to understand the
> performance requirements and restrictions of the tasks
> - The right way to view util clamp is as a mechanism to make request or hint on
> performance constraints.
> - some tasks should be restricted from consuming too
> much resources and should not go above a specific performance point.
> -
> Another example is in Android where tasks are classified as background,
> foreground, top-app, etc. Util clamp can be used to constrain how much
> resources background tasks are consuming by capping the performance point they
> can run at. This constraint helps reserve resources for important tasks, like
> the ones belonging to the currently active app (top-app group). Beside this
> helps in limiting how much power they consume. This can be more obvious in
> heterogeneous systems (e.g. Arm big.LITTLE); the constraint will help bias the
> background tasks to stay on the little cores which will ensure that:
>
> 1. The big cores are free to run top-app tasks immediately. top-app
> tasks are the tasks the user is currently interacting with, hence
> the most important tasks in the system.
> 2. They don't run on a power hungry core and drain battery even if they
> are CPU intensive tasks.
> -
> For example, it can be handy to limit performance when running low on battery
> or when the system wants to limit access to more energy hungry performance
> levels when it's in idle state or screen is off.
>
> """
> This constraint helps reserve resources for important tasks, like
> the ones belonging to the currently active app (top-app group).
> """
> It doesn't seem that UCLAMP_MAX does this. This looks more like bandwidth
> control.
>
> """
> They don't run on a power hungry core and drain battery even if they
> are CPU intensive tasks.
> """
> Avoiding mid/big CPUs could be done with tasksets,
>
> I can understand that one might want to run a task at a higher OPP due to
> timing constraints for instance. However I don't see why someone would want
> to run a task at a lower OPP, regarding only the performance and not the
> energy consumption. It thus means that UCLAMP_MAX is an energy hint rather
> of a performance hint.
>
> UCLAMP_MAX could be set for a task to make it spend less energy, but the
> loss in performance would be unknown.
> A case Hongyan mentioned in his uclamp sum aggregation serie [4] is that
> an infinite task with [UCLAMP_MIN=0, UCLAMP_MAX=1, util = 1000] could fit
> in a little CPU. The energy consumption would indeed be very low, but the
> performance would also be very low.
>
> With Hongyan's sum aggregation serie [5]:
> - case B would not happen as the 'normal' task would not raise the OPP of
> the whole cluster.
> - the 'infinite UCLAMP_MAX tasks' case would not happen as each task would
> account for 1 util
> - case A would still happen, but could be solved in any case.
>
> I know Hongyan's patchset has already been discussed, but I still don't
> understand why max aggregation is preferred over sum aggregation.
> The definition of UCLAMP_MAX values seems clearer and in effect results in
> a simpler implementation and less corner cases. In simple words:
> "When estimating the CPU frequency to set, for this task,
> account for at most X util."
> rather than:
> "When estimating the CPU frequency to set, the task with the highest
> UCLAMP_MAX of the CPU will cap the requested frequency."
>
> Note that actually I think it's a good idea to run feec() regularly
> and to take into account other parameters like nr_running. I just think
> that UCLAMP_MAX's max aggregation creates corner cases that are difficult
> to solve altogether.
>
> Thanks in advance for the time you will take answering,
> Regards,
> Pierre
>
> [1] https://lore.kernel.org/all/20240606070645.3295-1-xuewen.yan@unisoc.com/
> [2] https://www.youtube.com/watch?v=PHEBAyxeM_M
> [3] https://lore.kernel.org/all/CAKfTPtDPCPYvCi1c_Nh+Cn01ZVS7E=tAHQeNX-mArBt3BXdjYw@mail.gmail.com/
> [4] https://lore.kernel.org/all/b81a5b1c-14de-4232-bee9-ee647355dd8c@arm.com/
> [5] https://lore.kernel.org/all/cover.1706792708.git.hongyan.xia2@arm.com/#t
Hello Vincent,
On 9/12/24 14:22, Vincent Guittot wrote:
> On Wed, 11 Sept 2024 at 16:03, Pierre Gondois <pierre.gondois@arm.com> wrote:
>>
>> Hello Vincent,
>>
>> On 8/30/24 15:03, Vincent Guittot wrote:
>>> feec() looks for the CPU with highest spare capacity in a PD assuming that
>>> it will be the best CPU from a energy efficiency PoV because it will
>>> require the smallest increase of OPP. Although this is true generally
>>> speaking, this policy also filters some others CPUs which will be as
>>> efficients because of using the same OPP.
>>> In fact, we really care about the cost of the new OPP that will be
>>> selected to handle the waking task. In many cases, several CPUs will end
>>> up selecting the same OPP and as a result using the same energy cost. In
>>> these cases, we can use other metrics to select the best CPU for the same
>>> energy cost.
>>>
>>> Rework feec() to look 1st for the lowest cost in a PD and then the most
>>> performant CPU between CPUs.
>>>
>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> ---
>>> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
>>> 1 file changed, 244 insertions(+), 222 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index e67d6029b269..2273eecf6086 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>
>> [snip]
>>
>>>
>>> -/*
>>> - * compute_energy(): Use the Energy Model to estimate the energy that @pd would
>>> - * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
>>> - * contribution is ignored.
>>> - */
>>> -static inline unsigned long
>>> -compute_energy(struct energy_env *eenv, struct perf_domain *pd,
>>> - struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
>>> +/*Check if the CPU can handle the waking task */
>>> +static int check_cpu_with_task(struct task_struct *p, int cpu)
>>> {
>>> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
>>> - unsigned long busy_time = eenv->pd_busy_time;
>>> - unsigned long energy;
>>> + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
>>> + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
>>> + unsigned long util_min = p_util_min;
>>> + unsigned long util_max = p_util_max;
>>> + unsigned long util = cpu_util(cpu, p, cpu, 0);
>>> + struct rq *rq = cpu_rq(cpu);
>>>
>>> - if (dst_cpu >= 0)
>>> - busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
>>
>> I think you should mention that the energy computation is not capped anymore.
>> It used to be:
>> pd_util: sum of the CPU's util for the pd considered, without task_util
>> pd_cap: sum of the CPU's capacity for the pd
>>
>> (a)
>> busy_time = min(pd_cap, pd_util);
>> prev_energy = busy_time * OPP[prev_max_util].cost;
>>
>> busy_time = min(pd_cap, pd_util + task_util);
>> new_energy = busy_time * OPP[new_max_util].cost;
>>
>> delta_energy = new_energy - prev_energy;
>>
>> Note that when computing busy_time, task_util is not capped to one CPU's
>> max_cap. This means that in empty pd, a task can 'steal' capacity from
>> CPUs during the energy computation.
>> Cf. [1]
>>
>> and it is now:
>> (b)
>> delta_energy = task_util * OPP[new_max_util].cost;
>> delta_energy += pd_util * (OPP[new_max_util].cost - OPP[prev_max_util].cost);
>>
>> Note that the busy_time (task_util + pd_util) is now not capped by anything.
>
> As discussed in [1], capping utilization with max capacity is a
> mistake because we lost the information that this will run longer
>
I think this comes down to the fact that uClampMax tasks are force fit into
CPUs that don't have the required spare capacity [1].
On a little CPU with a capacity=200, a task with util=600 will run longer,
but it will eventually finish. Such task should not fit a little CPU normally.
By setting the task with UCLAMP_MAX=100, the task now fits the little CPU
and consumes less energy.
As Quentin mentioned I think, EAS can place tasks if utilization values are
correct. The initial condition was to have a 20% margin on the CPU capacity,
but it seems the intent was more to detect situations where the CPU is always
running (i.e. no idle time anymore).
With [1], having a 20% margin (with uClampMax tasks) doesn't mean that there
is idle time anymore. When there is no idle time anymore, utilization values
are a reflection of the niceness of the task. I.e. the utilization represents
how much time the CFS scheduler allowed them to run rather than how much
compute power tasks require.
------- Example start -------
In this condition, EAS should not be able to make always accurate task
placements. For instance, on a platform with 1 big CPU (capa=1024) and one
little CPU (capa=200), and with X tasks such as:
- duty-cycle=60%
- UCLAMP_MAX=800
Tasks being CPU demanding, they are placed on the big CPU. Each task will have
a utilization of 1024/X. The system is not overutilized since tasks have
a UCLAMP_MAX setting.
The bigger X, the lower the task utilization. Eventually, tasks' utilization
will be low enough to have feec() migrating one of them to the little CPU.
The system then becomes overutilized.
With the present patchset, the task is migrated back to the big CPU. Indeed:
task_tick_fair()
\-check_update_overutilized_status() --> // setting overutilzed=1
\-check_misfit_cpu()
\-find_energy_efficient_cpu() --> task doesn't fit the little CPU anymore,
--> migrate back to the big CPU
--> // resetting overutilzed=0 later
So these UCLAMP_MAX tasks will bounce on the little CPU, transiently activating
the overutilized state.
Similarly, when there is no idle time, it is possible to play with the task
niceness to make the utilization smaller/bigger.
------- Example end -------
This behaviour was existing before this present patchset due to [1]. However,
it was not really visible since feec() only ran during task wakeup.
It seems that the condition in update_idle_rq_clock_pelt() would be a better
way to tag a CPU as overutilized.
Down migrating UCLAMP_MAX tasks makes sense IMO, but to avoid making a CPU
overutilized, throttling these tasks (like the bandwidth control seems to do)
could be a solution.
[1] https://lore.kernel.org/lkml/20220629194632.1117723-1-qais.yousef@arm.com/
Regards,
Pierre
(edit)
On 9/11/24 16:02, Pierre Gondois wrote:
> Hello Vincent,
>
> On 8/30/24 15:03, Vincent Guittot wrote:
>> feec() looks for the CPU with highest spare capacity in a PD assuming that
>> it will be the best CPU from a energy efficiency PoV because it will
>> require the smallest increase of OPP. Although this is true generally
>> speaking, this policy also filters some others CPUs which will be as
>> efficients because of using the same OPP.
>> In fact, we really care about the cost of the new OPP that will be
>> selected to handle the waking task. In many cases, several CPUs will end
>> up selecting the same OPP and as a result using the same energy cost. In
>> these cases, we can use other metrics to select the best CPU for the same
>> energy cost.
>>
>> Rework feec() to look 1st for the lowest cost in a PD and then the most
>> performant CPU between CPUs.
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
>> 1 file changed, 244 insertions(+), 222 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e67d6029b269..2273eecf6086 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>
> [snip]
>
>>
>> -/*
>> - * compute_energy(): Use the Energy Model to estimate the energy that @pd would
>> - * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
>> - * contribution is ignored.
>> - */
>> -static inline unsigned long
>> -compute_energy(struct energy_env *eenv, struct perf_domain *pd,
>> - struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
>> +/*Check if the CPU can handle the waking task */
>> +static int check_cpu_with_task(struct task_struct *p, int cpu)
>> {
>> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
>> - unsigned long busy_time = eenv->pd_busy_time;
>> - unsigned long energy;
>> + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
>> + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
>> + unsigned long util_min = p_util_min;
>> + unsigned long util_max = p_util_max;
>> + unsigned long util = cpu_util(cpu, p, cpu, 0);
>> + struct rq *rq = cpu_rq(cpu);
>>
>> - if (dst_cpu >= 0)
>> - busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
>
> I think you should mention that the energy computation is not capped anymore.
> It used to be:
> pd_util: sum of the CPU's util for the pd considered, without task_util
> pd_cap: sum of the CPU's capacity for the pd
>
> (a)
> busy_time = min(pd_cap, pd_util);
> prev_energy = busy_time * OPP[prev_max_util].cost;
>
> busy_time = min(pd_cap, pd_util + task_util);
> new_energy = busy_time * OPP[new_max_util].cost;
>
> delta_energy = new_energy - prev_energy;
>
> Note that when computing busy_time, task_util is not capped to one CPU's
> max_cap. This means that in empty pd, a task can 'steal' capacity from
> CPUs during the energy computation.
> Cf. [1]
>
> and it is now:
> (b)
> delta_energy = task_util * OPP[new_max_util].cost;
> delta_energy += pd_util * (OPP[new_max_util].cost - OPP[prev_max_util].cost);
>
> Note that the busy_time (task_util + pd_util) is now not capped by anything.
>
> ---
>
> Not capping the task_util is discussed in [1][3] and [2] (at 21:15).
> IIUC, UCLAMP_MAX tasks are the only case leveraging this. Indeed,
> non-clamped tasks will not fit and be placed on a bigger CPU. Same for
> UCLAMP_MIN tasks.
> FWIU, not capping the utilization of tasks during the energy computation
> allows to reflect that a task will (likely) run longer. However the
> task's performance would likely decrease as the other tasks on the target
> CPU are not taken into account (it is assumed the task to be placed will
> receive the compute power it requires).
>
> ---
> Case A:
>
> Assuming we have an empty system with:
> - 4 little CPUs (max_capa=512, first OPP as [capa=256, cost=10])
> - 2 big CPUs (max_capa=1024, first OPP as [capa=512, cost=10])
> i.e. the first OPP of all the CPU consumes the same amount of energy.
> And a task with: [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
>
> Then feec() would have no reason to prefer a big CPU over a little CPU,
> even though the big CPU would provide more performance.
>
> ---
> Case B:
>
> (This is not especially related to this patch.)
> Another case that might be problematic:
> - 4 little CPUs (max_capa=512, first OPP as [capa=256])
> - 2 big CPUs (max_capa=1024, first OPP as [capa=512])
> - little CPUs consume less than big CPUs, but the highest OPP
> of the little CPUs consume more than the lowest of the big CPUs.
> And tasks:
> - 3 tasks with [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
> - 1 task with [UCLAMP_MIN=0, UCLAMP_MAX=1024, util = 50]
>
> Then
> - the 3 UCLAMP_MAX tasks will be placed on the little CPUs. Indeed,
> due to the last patch of the serie, these tasks have now an opportunity
> to run feec() and be placed on a more energy efficient CPU.
> - the 'normal' task will be placed on a big CPU. Indeed, placing
> it on a little CPU would raise the OPP of the little cluster.
>
> This means that the 'normal' task is prevented to run the remaining little
> CPU even though:
> - the little CPU can provide the compute capacity
This behaviour is actually due to the little CPUs not being able to provide
the compute capacity for the normal task without raising the OPP of the cluster.
So this behaviour is expected.
I am providing another case instead:
Case B':
- 4 little CPUs (max_capa=512, first OPP as [capa=256])
- 2 big CPUs (max_capa=1024, first OPP as [capa=512])
And tasks:
- 4 tasks with [UCLAMP_MIN=0, UCLAMP_MAX=10, util = 1000]
- 1 task with [UCLAMP_MIN=0, UCLAMP_MAX=1024, util = 50]
The normal task will not fit any of the little CPU as the rq's UCLAMP_MAX
value would raise from 10 to 1024. If I m not mistaken (this time), the
normal task should be placed on a little CPU as:
- it consumes less power
- even though UCLAMP_MAX tasks consume the least power they can, it makes
other tasks consume more
Theoretically, placing the 4 UCLAMP_MAX tasks on one little CPU and using
another CPU for the normal task would:
- consume less energy
- satisfy the UCLAMP_MAX constraint
even though the performance of the workload would be less. This is a bit
hard to conceive for me.
> - the little CPU would consume less energy
>
> In other terms, using UCLAMP_MAX on some tasks makes the system consume
> more energy.
>
> ---
>
> In my opinion, this last case comes from the difficulty of defining UCLAMP_MAX.
> From sched-util-clamp.rst (about UCLAMP_MAX):
> - Uclamp is a hinting mechanism that allows the scheduler to understand the
> performance requirements and restrictions of the tasks
> - The right way to view util clamp is as a mechanism to make request or hint on
> performance constraints.
> - some tasks should be restricted from consuming too
> much resources and should not go above a specific performance point.
> -
> Another example is in Android where tasks are classified as background,
> foreground, top-app, etc. Util clamp can be used to constrain how much
> resources background tasks are consuming by capping the performance point they
> can run at. This constraint helps reserve resources for important tasks, like
> the ones belonging to the currently active app (top-app group). Beside this
> helps in limiting how much power they consume. This can be more obvious in
> heterogeneous systems (e.g. Arm big.LITTLE); the constraint will help bias the
> background tasks to stay on the little cores which will ensure that:
>
> 1. The big cores are free to run top-app tasks immediately. top-app
> tasks are the tasks the user is currently interacting with, hence
> the most important tasks in the system.
> 2. They don't run on a power hungry core and drain battery even if they
> are CPU intensive tasks.
> -
> For example, it can be handy to limit performance when running low on battery
> or when the system wants to limit access to more energy hungry performance
> levels when it's in idle state or screen is off.
>
> """
> This constraint helps reserve resources for important tasks, like
> the ones belonging to the currently active app (top-app group).
> """
> It doesn't seem that UCLAMP_MAX does this. This looks more like bandwidth
> control.
>
> """
> They don't run on a power hungry core and drain battery even if they
> are CPU intensive tasks.
> """
> Avoiding mid/big CPUs could be done with tasksets,
>
> I can understand that one might want to run a task at a higher OPP due to
> timing constraints for instance. However I don't see why someone would want
> to run a task at a lower OPP, regarding only the performance and not the
> energy consumption. It thus means that UCLAMP_MAX is an energy hint rather
> of a performance hint.
>
> UCLAMP_MAX could be set for a task to make it spend less energy, but the
> loss in performance would be unknown.
> A case Hongyan mentioned in his uclamp sum aggregation serie [4] is that
> an infinite task with [UCLAMP_MIN=0, UCLAMP_MAX=1, util = 1000] could fit
> in a little CPU. The energy consumption would indeed be very low, but the
> performance would also be very low.
>
> With Hongyan's sum aggregation serie [5]:
> - case B would not happen as the 'normal' task would not raise the OPP of
> the whole cluster.
Cf. above
> - the 'infinite UCLAMP_MAX tasks' case would not happen as each task would
> account for 1 util
> - case A would still happen, but could be solved in any case.
>
> I know Hongyan's patchset has already been discussed, but I still don't
> understand why max aggregation is preferred over sum aggregation.
> The definition of UCLAMP_MAX values seems clearer and in effect results in
> a simpler implementation and less corner cases. In simple words:
> "When estimating the CPU frequency to set, for this task,
> account for at most X util."
> rather than:
> "When estimating the CPU frequency to set, the task with the highest
> UCLAMP_MAX of the CPU will cap the requested frequency."
>
> Note that actually I think it's a good idea to run feec() regularly
> and to take into account other parameters like nr_running. I just think
> that UCLAMP_MAX's max aggregation creates corner cases that are difficult
> to solve altogether.
>
> Thanks in advance for the time you will take answering,
> Regards,
> Pierre
>
> [1] https://lore.kernel.org/all/20240606070645.3295-1-xuewen.yan@unisoc.com/
> [2] https://www.youtube.com/watch?v=PHEBAyxeM_M
> [3] https://lore.kernel.org/all/CAKfTPtDPCPYvCi1c_Nh+Cn01ZVS7E=tAHQeNX-mArBt3BXdjYw@mail.gmail.com/
> [4] https://lore.kernel.org/all/b81a5b1c-14de-4232-bee9-ee647355dd8c@arm.com/
> [5] https://lore.kernel.org/all/cover.1706792708.git.hongyan.xia2@arm.com/#t
>
On 8/30/24 15:03, Vincent Guittot wrote:
> feec() looks for the CPU with highest spare capacity in a PD assuming that
> it will be the best CPU from a energy efficiency PoV because it will
> require the smallest increase of OPP. Although this is true generally
> speaking, this policy also filters some others CPUs which will be as
> efficients because of using the same OPP.
> In fact, we really care about the cost of the new OPP that will be
> selected to handle the waking task. In many cases, several CPUs will end
> up selecting the same OPP and as a result using the same energy cost. In
> these cases, we can use other metrics to select the best CPU for the same
> energy cost.
>
> Rework feec() to look 1st for the lowest cost in a PD and then the most
> performant CPU between CPUs.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> 1 file changed, 244 insertions(+), 222 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e67d6029b269..2273eecf6086 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> -/*
> - * compute_energy(): Use the Energy Model to estimate the energy that @pd would
> - * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
> - * contribution is ignored.
> - */
> -static inline unsigned long
> -compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> - struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> +/*Check if the CPU can handle the waking task */
> +static int check_cpu_with_task(struct task_struct *p, int cpu)
> {
> - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> - unsigned long busy_time = eenv->pd_busy_time;
> - unsigned long energy;
> + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
> + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> + unsigned long util_min = p_util_min;
> + unsigned long util_max = p_util_max;
> + unsigned long util = cpu_util(cpu, p, cpu, 0);
> + struct rq *rq = cpu_rq(cpu);
>
> - if (dst_cpu >= 0)
> - busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
> + /*
> + * Skip CPUs that cannot satisfy the capacity request.
> + * IOW, placing the task there would make the CPU
> + * overutilized. Take uclamp into account to see how
> + * much capacity we can get out of the CPU; this is
> + * aligned with sched_cpu_util().
> + */
> + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
> + unsigned long rq_util_min, rq_util_max;
> + /*
> + * Open code uclamp_rq_util_with() except for
> + * the clamp() part. I.e.: apply max aggregation
> + * only. util_fits_cpu() logic requires to
> + * operate on non clamped util but must use the
> + * max-aggregated uclamp_{min, max}.
> + */
> + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
> + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
> + util_min = max(rq_util_min, p_util_min);
> + util_max = max(rq_util_max, p_util_max);
> + }
> + return util_fits_cpu(util, util_min, util_max, cpu);
> +}
>
> - energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
I think em_cpu_energy() would need to be removed with this patch,
if there are no more references to it.
On Wed, 4 Sept 2024 at 17:07, Pierre Gondois <pierre.gondois@arm.com> wrote:
>
>
>
> On 8/30/24 15:03, Vincent Guittot wrote:
> > feec() looks for the CPU with highest spare capacity in a PD assuming that
> > it will be the best CPU from a energy efficiency PoV because it will
> > require the smallest increase of OPP. Although this is true generally
> > speaking, this policy also filters some others CPUs which will be as
> > efficients because of using the same OPP.
> > In fact, we really care about the cost of the new OPP that will be
> > selected to handle the waking task. In many cases, several CPUs will end
> > up selecting the same OPP and as a result using the same energy cost. In
> > these cases, we can use other metrics to select the best CPU for the same
> > energy cost.
> >
> > Rework feec() to look 1st for the lowest cost in a PD and then the most
> > performant CPU between CPUs.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> > kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> > 1 file changed, 244 insertions(+), 222 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index e67d6029b269..2273eecf6086 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > -/*
> > - * compute_energy(): Use the Energy Model to estimate the energy that @pd would
> > - * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
> > - * contribution is ignored.
> > - */
> > -static inline unsigned long
> > -compute_energy(struct energy_env *eenv, struct perf_domain *pd,
> > - struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
> > +/*Check if the CPU can handle the waking task */
> > +static int check_cpu_with_task(struct task_struct *p, int cpu)
> > {
> > - unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
> > - unsigned long busy_time = eenv->pd_busy_time;
> > - unsigned long energy;
> > + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
> > + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > + unsigned long util_min = p_util_min;
> > + unsigned long util_max = p_util_max;
> > + unsigned long util = cpu_util(cpu, p, cpu, 0);
> > + struct rq *rq = cpu_rq(cpu);
> >
> > - if (dst_cpu >= 0)
> > - busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
> > + /*
> > + * Skip CPUs that cannot satisfy the capacity request.
> > + * IOW, placing the task there would make the CPU
> > + * overutilized. Take uclamp into account to see how
> > + * much capacity we can get out of the CPU; this is
> > + * aligned with sched_cpu_util().
> > + */
> > + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
> > + unsigned long rq_util_min, rq_util_max;
> > + /*
> > + * Open code uclamp_rq_util_with() except for
> > + * the clamp() part. I.e.: apply max aggregation
> > + * only. util_fits_cpu() logic requires to
> > + * operate on non clamped util but must use the
> > + * max-aggregated uclamp_{min, max}.
> > + */
> > + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
> > + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
> > + util_min = max(rq_util_min, p_util_min);
> > + util_max = max(rq_util_max, p_util_max);
> > + }
> > + return util_fits_cpu(util, util_min, util_max, cpu);
> > +}
> >
> > - energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
>
> I think em_cpu_energy() would need to be removed with this patch,
> if there are no more references to it.
Yes, I will add a patch to clean up the unused function.
>
On 30/08/2024 14:03, Vincent Guittot wrote:
> feec() looks for the CPU with highest spare capacity in a PD assuming that
> it will be the best CPU from a energy efficiency PoV because it will
> require the smallest increase of OPP. Although this is true generally
> speaking, this policy also filters some others CPUs which will be as
> efficients because of using the same OPP.
> In fact, we really care about the cost of the new OPP that will be
> selected to handle the waking task. In many cases, several CPUs will end
> up selecting the same OPP and as a result using the same energy cost. In
> these cases, we can use other metrics to select the best CPU for the same
> energy cost.
>
> Rework feec() to look 1st for the lowest cost in a PD and then the most
> performant CPU between CPUs.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> 1 file changed, 244 insertions(+), 222 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e67d6029b269..2273eecf6086 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> [...]
>
> - energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
> +/* For a same cost, select the CPU that will povide best performance for the task */
> +static bool select_best_cpu(struct energy_cpu_stat *target,
> + struct energy_cpu_stat *min,
> + int prev, struct sched_domain *sd)
> +{
> + /* Select the one with the least number of running tasks */
> + if (target->nr_running < min->nr_running)
> + return true;
> + if (target->nr_running > min->nr_running)
> + return false;
>
This makes me a bit worried about systems with coarse-grained OPPs. All
my dev boards and one of my old phones have <= 3 OPPs. On my Juno board,
the lowest OPP on the big core spans across 512 utilization, half of the
full capacity. Assuming a scenario where there are 4 tasks, each with
300, 100, 100, 100 utilization, the placement should be 300 on one core
and 3 tasks with 100 on another, but the nr_running check here would
give 2 tasks (300 + 100) on one CPU and 2 tasks (100 + 100) on another
because they are still under the lowest OPP on Juno. The second CPU will
also finish faster and idle more than the first one.
To give an extreme example, assuming the system has only one OPP (such a
system is dumb to begin with, but just to make a point), before this
patch EAS would still work okay in task placement, but after this patch,
EAS would just balance on the number of tasks, regardless of utilization
of tasks on wake-up.
I wonder if there is a way to still take total utilization as a factor.
It used to be 100% of the decision making, but maybe now it is only 60%,
and the other 40% are things like number of tasks and contention.
> - trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
> + /* Favor previous CPU otherwise */
> + if (target->cpu == prev)
> + return true;
> + if (min->cpu == prev)
> + return false;
>
> - return energy;
> + /*
> + * Choose CPU with lowest contention. One might want to consider load instead of
> + * runnable but we are supposed to not be overutilized so there is enough compute
> + * capacity for everybody.
> + */
> + if ((target->runnable * min->capa * sd->imbalance_pct) >=
> + (min->runnable * target->capa * 100))
> + return false;
> +
> + return true;
> }
> [...]
On Mon, 2 Sept 2024 at 13:03, Hongyan Xia <hongyan.xia2@arm.com> wrote:
>
> On 30/08/2024 14:03, Vincent Guittot wrote:
> > feec() looks for the CPU with highest spare capacity in a PD assuming that
> > it will be the best CPU from a energy efficiency PoV because it will
> > require the smallest increase of OPP. Although this is true generally
> > speaking, this policy also filters some others CPUs which will be as
> > efficients because of using the same OPP.
> > In fact, we really care about the cost of the new OPP that will be
> > selected to handle the waking task. In many cases, several CPUs will end
> > up selecting the same OPP and as a result using the same energy cost. In
> > these cases, we can use other metrics to select the best CPU for the same
> > energy cost.
> >
> > Rework feec() to look 1st for the lowest cost in a PD and then the most
> > performant CPU between CPUs.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> > kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> > 1 file changed, 244 insertions(+), 222 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index e67d6029b269..2273eecf6086 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > [...]
> >
> > - energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
> > +/* For a same cost, select the CPU that will povide best performance for the task */
> > +static bool select_best_cpu(struct energy_cpu_stat *target,
> > + struct energy_cpu_stat *min,
> > + int prev, struct sched_domain *sd)
> > +{
> > + /* Select the one with the least number of running tasks */
> > + if (target->nr_running < min->nr_running)
> > + return true;
> > + if (target->nr_running > min->nr_running)
> > + return false;
> >
> This makes me a bit worried about systems with coarse-grained OPPs. All
> my dev boards and one of my old phones have <= 3 OPPs. On my Juno board,
> the lowest OPP on the big core spans across 512 utilization, half of the
> full capacity. Assuming a scenario where there are 4 tasks, each with
> 300, 100, 100, 100 utilization, the placement should be 300 on one core
> and 3 tasks with 100 on another, but the nr_running check here would
> give 2 tasks (300 + 100) on one CPU and 2 tasks (100 + 100) on another
> because they are still under the lowest OPP on Juno. The second CPU will
> also finish faster and idle more than the first one.
By balancing the number of tasks on each CPU, I try to minimize the
scheduling latency. In your case above, tasks will wait for no more
than a slice before running, whereas they might have to wait up to 2
slices if I put all the (100 utilization) tasks on the same CPU.
>
> To give an extreme example, assuming the system has only one OPP (such a
> system is dumb to begin with, but just to make a point), before this
> patch EAS would still work okay in task placement, but after this patch,
Not sure what you mean by "would still work okay". Do you have an
example in mind that would not work correctly?
> EAS would just balance on the number of tasks, regardless of utilization
> of tasks on wake-up.
You have to keep in mind that utilization is already taken into
account to check whether the task fits the CPU and when selecting the
OPP (which is a no-op in the case of a single OPP). So we know that
there is enough capacity for the waking task.
>
> I wonder if there is a way to still take total utilization as a factor.
utilization is still used to check that the task utilization fits with
current cpu utilization and then to select the OPP. At this step we
know that there is enough capacity for everybody
> It used to be 100% of the decision making, but maybe now it is only 60%,
> and the other 40% are things like number of tasks and contention.
>
> > - trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
> > + /* Favor previous CPU otherwise */
> > + if (target->cpu == prev)
> > + return true;
> > + if (min->cpu == prev)
> > + return false;
> >
> > - return energy;
> > + /*
> > + * Choose CPU with lowest contention. One might want to consider load instead of
> > + * runnable but we are supposed to not be overutilized so there is enough compute
> > + * capacity for everybody.
> > + */
> > + if ((target->runnable * min->capa * sd->imbalance_pct) >=
> > + (min->runnable * target->capa * 100))
> > + return false;
> > +
> > + return true;
> > }
> > [...]
>
On 06/09/2024 08:08, Vincent Guittot wrote:
> On Mon, 2 Sept 2024 at 13:03, Hongyan Xia <hongyan.xia2@arm.com> wrote:
>>
>> On 30/08/2024 14:03, Vincent Guittot wrote:
>>> feec() looks for the CPU with highest spare capacity in a PD assuming that
>>> it will be the best CPU from a energy efficiency PoV because it will
>>> require the smallest increase of OPP. Although this is true generally
>>> speaking, this policy also filters some others CPUs which will be as
>>> efficients because of using the same OPP.
>>> In fact, we really care about the cost of the new OPP that will be
>>> selected to handle the waking task. In many cases, several CPUs will end
>>> up selecting the same OPP and as a result using the same energy cost. In
>>> these cases, we can use other metrics to select the best CPU for the same
>>> energy cost.
>>>
>>> Rework feec() to look 1st for the lowest cost in a PD and then the most
>>> performant CPU between CPUs.
>>>
>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> ---
>>> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
>>> 1 file changed, 244 insertions(+), 222 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index e67d6029b269..2273eecf6086 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> [...]
>>>
>>> - energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
>>> +/* For a same cost, select the CPU that will povide best performance for the task */
>>> +static bool select_best_cpu(struct energy_cpu_stat *target,
>>> + struct energy_cpu_stat *min,
>>> + int prev, struct sched_domain *sd)
>>> +{
>>> + /* Select the one with the least number of running tasks */
>>> + if (target->nr_running < min->nr_running)
>>> + return true;
>>> + if (target->nr_running > min->nr_running)
>>> + return false;
>>>
>> This makes me a bit worried about systems with coarse-grained OPPs. All
>> my dev boards and one of my old phones have <= 3 OPPs. On my Juno board,
>> the lowest OPP on the big core spans across 512 utilization, half of the
>> full capacity. Assuming a scenario where there are 4 tasks, each with
>> 300, 100, 100, 100 utilization, the placement should be 300 on one core
>> and 3 tasks with 100 on another, but the nr_running check here would
>> give 2 tasks (300 + 100) on one CPU and 2 tasks (100 + 100) on another
>> because they are still under the lowest OPP on Juno. The second CPU will
>> also finish faster and idle more than the first one.
>
> By balancing the number of tasks on each cpu, I try to minimize the
> scheduling latency. In your case above, tasks will wait for no more
> than a slice before running whereas it might have to wait up to 2
> slices if I put all the (100 utilization) tasks on the same CPU.
If viewed in another angle, we are now asking the 300 task (which
potentially has a heavier workload to finish) to compete with a 100
task, and now one core finishes faster and the other takes longer time,
making the overall execution time longer.
>>
>> To give an extreme example, assuming the system has only one OPP (such a
>> system is dumb to begin with, but just to make a point), before this
>> patch EAS would still work okay in task placement, but after this patch,
>
> Not sure what you mean by would still work okay. Do you have an
> example in mind that would not work correctly ?
With only one OPP, this patch will balance task placement purely on the
number of tasks without considering utilization, and I don't think
that's entirely acceptable (I actually need to deal with such a device
with only one OPP in real life, although that's the fault of that
device). Before, we are still balancing on total utilization, which
results in the lowest execution time.
>
>> EAS would just balance on the number of tasks, regardless of utilization
>> of tasks on wake-up.
>
> You have to keep in mind that utilization is already taken into
> account to check if the task fits the CPU and by selecting the OPP
> (which is a nope in case of one OPP). So we know that there is enough
> capacity for the waking task
Still, taking my Juno board as an example where the 1st OPP is at
utilization 512. Assuming no 25% margin, four tasks with utilization
200, 200, 50, 50 and two CPUs, I would strongly favor 200 + 50 on one
CPU and same on the other, than 200 + 200 on one, 50 + 50 on the other.
However, with this patch, these two scheduling decisions are the same,
as long as both are under the 512 OPP.
Of course, this becomes less of a problem with fine-grained OPPs. On my
Pixel 6 with 18 OPPs on one CPU, I don't have such concerns.
>>
>> I wonder if there is a way to still take total utilization as a factor.
>
> utilization is still used to check that the task utilization fits with
> current cpu utilization and then to select the OPP. At this step we
> know that there is enough capacity for everybody
>
>> It used to be 100% of the decision making, but maybe now it is only 60%,
>> and the other 40% are things like number of tasks and contention.
>>
>>> - trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
>>> + /* Favor previous CPU otherwise */
>>> + if (target->cpu == prev)
>>> + return true;
>>> + if (min->cpu == prev)
>>> + return false;
>>>
>>> - return energy;
>>> + /*
>>> + * Choose CPU with lowest contention. One might want to consider load instead of
>>> + * runnable but we are supposed to not be overutilized so there is enough compute
>>> + * capacity for everybody.
>>> + */
>>> + if ((target->runnable * min->capa * sd->imbalance_pct) >=
>>> + (min->runnable * target->capa * 100))
>>> + return false;
>>> +
>>> + return true;
>>> }
>>> [...]
>>
On Fri, 6 Sept 2024 at 17:32, Hongyan Xia <hongyan.xia2@arm.com> wrote:
>
> On 06/09/2024 08:08, Vincent Guittot wrote:
> > On Mon, 2 Sept 2024 at 13:03, Hongyan Xia <hongyan.xia2@arm.com> wrote:
> >>
> >> On 30/08/2024 14:03, Vincent Guittot wrote:
> >>> feec() looks for the CPU with highest spare capacity in a PD assuming that
> >>> it will be the best CPU from a energy efficiency PoV because it will
> >>> require the smallest increase of OPP. Although this is true generally
> >>> speaking, this policy also filters some others CPUs which will be as
> >>> efficients because of using the same OPP.
> >>> In fact, we really care about the cost of the new OPP that will be
> >>> selected to handle the waking task. In many cases, several CPUs will end
> >>> up selecting the same OPP and as a result using the same energy cost. In
> >>> these cases, we can use other metrics to select the best CPU for the same
> >>> energy cost.
> >>>
> >>> Rework feec() to look 1st for the lowest cost in a PD and then the most
> >>> performant CPU between CPUs.
> >>>
> >>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> >>> ---
> >>> kernel/sched/fair.c | 466 +++++++++++++++++++++++---------------------
> >>> 1 file changed, 244 insertions(+), 222 deletions(-)
> >>>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index e67d6029b269..2273eecf6086 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> [...]
> >>>
> >>> - energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
> >>> +/* For a same cost, select the CPU that will povide best performance for the task */
> >>> +static bool select_best_cpu(struct energy_cpu_stat *target,
> >>> + struct energy_cpu_stat *min,
> >>> + int prev, struct sched_domain *sd)
> >>> +{
> >>> + /* Select the one with the least number of running tasks */
> >>> + if (target->nr_running < min->nr_running)
> >>> + return true;
> >>> + if (target->nr_running > min->nr_running)
> >>> + return false;
> >>>
> >> This makes me a bit worried about systems with coarse-grained OPPs. All
> >> my dev boards and one of my old phones have <= 3 OPPs. On my Juno board,
> >> the lowest OPP on the big core spans across 512 utilization, half of the
> >> full capacity. Assuming a scenario where there are 4 tasks, each with
> >> 300, 100, 100, 100 utilization, the placement should be 300 on one core
> >> and 3 tasks with 100 on another, but the nr_running check here would
> >> give 2 tasks (300 + 100) on one CPU and 2 tasks (100 + 100) on another
> >> because they are still under the lowest OPP on Juno. The second CPU will
> >> also finish faster and idle more than the first one.
> >
> > By balancing the number of tasks on each cpu, I try to minimize the
> > scheduling latency. In your case above, tasks will wait for no more
> > than a slice before running whereas it might have to wait up to 2
> > slices if I put all the (100 utilization) tasks on the same CPU.
>
> If viewed in another angle, we are now asking the 300 task (which
> potentially has a heavier workload to finish) to compete with a 100
> task, and now one core finishes faster and the other takes longer time,
> making the overall execution time longer.
The main problem with utilization is that it also reflects the recent
past and can screw up task placement as well, as I presented at
OSPM. Imagine that a long-running "400 task" just went back to sleep
before we place the "300 task" and the 3 "100 tasks"; then you can end
up putting 3 tasks on one core as well.
The goal here is to optimize scheduling latency, which is a problem
that has never really been taken into account so far: several tasks
being stacked on the same CPU increases the scheduling latency. A
next step after this patchset will be to take into account the sched
slice, in addition to the number of tasks, to optimize the scheduling
latency of some tasks. The fact that a CPU will run longer should be
taken into account in the energy model when we compute the energy
cost, which is not the case for now because of the complexity of
knowing when CPUs will really be idle and which idle state will be
selected.
>
> >>
> >> To give an extreme example, assuming the system has only one OPP (such a
> >> system is dumb to begin with, but just to make a point), before this
> >> patch EAS would still work okay in task placement, but after this patch,
> >
> > Not sure what you mean by would still work okay. Do you have an
> > example in mind that would not work correctly ?
>
> With only one OPP, this patch will balance task placement purely on the
> number of tasks without considering utilization, and I don't think
> that's entirely acceptable (I actually need to deal with such a device
> with only one OPP in real life, although that's the fault of that
> device). Before, we are still balancing on total utilization, which
> results in the lowest execution time.
>
> >
> >> EAS would just balance on the number of tasks, regardless of utilization
> >> of tasks on wake-up.
> >
> > You have to keep in mind that utilization is already taken into
> > account to check if the task fits the CPU and by selecting the OPP
> > (which is a nope in case of one OPP). So we know that there is enough
> > capacity for the waking task
>
> Still, taking my Juno board as an example where the 1st OPP is at
> utilization 512. Assuming no 25% margin, four tasks with utilization
> 200, 200, 50, 50 and two CPUs, I would strongly favor 200 + 50 on one
> CPU and same on the other, than 200 + 200 on one, 50 + 50 on the other.
> However, with this patch, these two scheduling decisions are the same,
> as long as both are under the 512 OPP.
The runnable avg test should handle this: when there is the same number
of tasks on both CPUs, we select the one with the lowest contention, so
one 200 task should end up on each CPU.
>
> Of course, this becomes less of a problem with fine-grained OPPs. On my
> Pixel 6 with 18 OPPs on one CPU, I don't have such concerns.
>
> >>
> >> I wonder if there is a way to still take total utilization as a factor.
> >
> > utilization is still used to check that the task utilization fits with
> > current cpu utilization and then to select the OPP. At this step we
> > know that there is enough capacity for everybody
> >
> >> It used to be 100% of the decision making, but maybe now it is only 60%,
> >> and the other 40% are things like number of tasks and contention.
> >>
> >>> - trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
> >>> + /* Favor previous CPU otherwise */
> >>> + if (target->cpu == prev)
> >>> + return true;
> >>> + if (min->cpu == prev)
> >>> + return false;
> >>>
> >>> - return energy;
> >>> + /*
> >>> + * Choose CPU with lowest contention. One might want to consider load instead of
> >>> + * runnable but we are supposed to not be overutilized so there is enough compute
> >>> + * capacity for everybody.
> >>> + */
> >>> + if ((target->runnable * min->capa * sd->imbalance_pct) >=
> >>> + (min->runnable * target->capa * 100))
> >>> + return false;
> >>> +
> >>> + return true;
> >>> }
> >>> [...]
> >>
Hi Vincent,
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on peterz-queue/sched/core linus/master v6.11-rc6 next-20240830]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vincent-Guittot/sched-fair-Filter-false-overloaded_group-case-for-EAS/20240830-210826
base: tip/sched/core
patch link: https://lore.kernel.org/r/20240830130309.2141697-4-vincent.guittot%40linaro.org
patch subject: [PATCH 3/5] sched/fair: Rework feec() to use cost instead of spare capacity
config: s390-allnoconfig (https://download.01.org/0day-ci/archive/20240902/202409021606.CwIU0HB8-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 6f682c26b04f0b349c4c473756cb8625b4f37c6d)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240902/202409021606.CwIU0HB8-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409021606.CwIU0HB8-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from kernel/sched/fair.c:23:
In file included from include/linux/energy_model.h:5:
In file included from include/linux/device.h:32:
In file included from include/linux/device/driver.h:21:
In file included from include/linux/module.h:19:
In file included from include/linux/elf.h:6:
In file included from arch/s390/include/asm/elf.h:181:
In file included from arch/s390/include/asm/mmu_context.h:11:
In file included from arch/s390/include/asm/pgalloc.h:18:
In file included from include/linux/mm.h:2228:
include/linux/vmstat.h:514:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
514 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
In file included from kernel/sched/fair.c:38:
In file included from include/linux/sched/isolation.h:7:
In file included from include/linux/tick.h:8:
In file included from include/linux/clockchips.h:14:
In file included from include/linux/clocksource.h:22:
In file included from arch/s390/include/asm/io.h:93:
include/asm-generic/io.h:548:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
548 | val = __raw_readb(PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:561:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
561 | val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:37:59: note: expanded from macro '__le16_to_cpu'
37 | #define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
| ^
include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
102 | #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
| ^
In file included from kernel/sched/fair.c:38:
In file included from include/linux/sched/isolation.h:7:
In file included from include/linux/tick.h:8:
In file included from include/linux/clockchips.h:14:
In file included from include/linux/clocksource.h:22:
In file included from arch/s390/include/asm/io.h:93:
include/asm-generic/io.h:574:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
574 | val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/big_endian.h:35:59: note: expanded from macro '__le32_to_cpu'
35 | #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
| ^
include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
115 | #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
| ^
In file included from kernel/sched/fair.c:38:
In file included from include/linux/sched/isolation.h:7:
In file included from include/linux/tick.h:8:
In file included from include/linux/clockchips.h:14:
In file included from include/linux/clocksource.h:22:
In file included from arch/s390/include/asm/io.h:93:
include/asm-generic/io.h:585:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
585 | __raw_writeb(value, PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:595:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
595 | __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:605:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
605 | __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:693:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
693 | readsb(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:701:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
701 | readsw(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:709:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
709 | readsl(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:718:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
718 | writesb(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:727:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
727 | writesw(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:736:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
736 | writesl(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
>> kernel/sched/fair.c:8183:6: error: call to undeclared function 'em_pd_get_efficient_state'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
8183 | i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
| ^
>> kernel/sched/fair.c:8190:6: error: call to undeclared function 'em_pd_get_previous_state'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
8190 | i = em_pd_get_previous_state(em_table->state, pd->nr_perf_states,
| ^
13 warnings and 2 errors generated.
vim +/em_pd_get_efficient_state +8183 kernel/sched/fair.c
8168
8169 /* Find the cost of a performance domain for the estimated utilization */
8170 static inline void find_pd_cost(struct em_perf_domain *pd,
8171 unsigned long max_util,
8172 struct energy_cpu_stat *stat)
8173 {
8174 struct em_perf_table *em_table;
8175 struct em_perf_state *ps;
8176 int i;
8177
8178 /*
8179 * Find the lowest performance state of the Energy Model above the
8180 * requested performance.
8181 */
8182 em_table = rcu_dereference(pd->em_table);
> 8183 i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
8184 max_util, pd->flags);
8185 ps = &em_table->state[i];
8186
8187 /* Save the cost and performance range of the OPP */
8188 stat->max_perf = ps->performance;
8189 stat->cost = ps->cost;
> 8190 i = em_pd_get_previous_state(em_table->state, pd->nr_perf_states,
8191 i, pd->flags);
8192 if (i < 0)
8193 stat->min_perf = 0;
8194 else {
8195 ps = &em_table->state[i];
8196 stat->min_perf = ps->performance;
8197 }
8198 }
8199
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2025 Red Hat, Inc.