From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0A6BDC433F5 for ; Tue, 8 Mar 2022 18:20:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1349503AbiCHSVS (ORCPT ); Tue, 8 Mar 2022 13:21:18 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42332 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S243836AbiCHSVN (ORCPT ); Tue, 8 Mar 2022 13:21:13 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 53B1F43EC3 for ; Tue, 8 Mar 2022 10:20:16 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 1F23C1650; Tue, 8 Mar 2022 10:20:16 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id BDFEE3FA45; Tue, 8 Mar 2022 10:20:14 -0800 (PST) From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, Vincent Donnefort Subject: [PATCH v3 1/7] sched/fair: Provide u64 read for 32-bits arch helper Date: Tue, 8 Mar 2022 18:19:51 +0000 Message-Id: <20220308181957.280354-2-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: <20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Introducing macro helpers u64_u32_{store,load}() to factorize 
lockless accesses to u64 variables for 32-bits architectures. Users are for now cfs_rq.min_vruntime and sched_avg.last_update_time. To accommodate the latter where the copy lies outside of the structure (cfs_rq.last_update_time_copy instead of sched_avg.last_update_time_copy), use the _copy() version of those helpers. Those new helpers encapsulate smp_rmb() and smp_wmb() synchronization and therefore, have a small penalty in set_task_rq_fair() and init_cfs_rq(). Signed-off-by: Vincent Donnefort diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..99ea9540ece4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -568,11 +568,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } =20 /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime =3D max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy =3D cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, + max_vruntime(cfs_rq->min_vruntime, vruntime)); } =20 static inline bool __entity_less(struct rb_node *a, const struct rb_node *= b) @@ -3246,6 +3243,11 @@ static inline void cfs_rq_util_change(struct cfs_rq = *cfs_rq, int flags) } =20 #ifdef CONFIG_SMP +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return u64_u32_load_copy(cfs_rq->avg.last_update_time, + cfs_rq->last_update_time_copy); +} #ifdef CONFIG_FAIR_GROUP_SCHED /* * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list @@ -3356,27 +3358,9 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; =20 -#ifndef CONFIG_64BIT - { - u64 p_last_update_time_copy; - u64 n_last_update_time_copy; - - do { - p_last_update_time_copy =3D prev->load_last_update_time_copy; - n_last_update_time_copy =3D next->load_last_update_time_copy; - - smp_rmb(); + p_last_update_time =3D cfs_rq_last_update_time(prev); + n_last_update_time =3D cfs_rq_last_update_time(next); =20 - 
p_last_update_time =3D prev->avg.last_update_time; - n_last_update_time =3D next->avg.last_update_time; - - } while (p_last_update_time !=3D p_last_update_time_copy || - n_last_update_time !=3D n_last_update_time_copy); - } -#else - p_last_update_time =3D prev->avg.last_update_time; - n_last_update_time =3D next->avg.last_update_time; -#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time =3D n_last_update_time; } @@ -3700,8 +3684,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) decayed |=3D __update_load_avg_cfs_rq(now, cfs_rq); =20 #ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->load_last_update_time_copy =3D sa->last_update_time; + u64_u32_store_copy(sa->last_update_time, + cfs_rq->last_update_time_copy, + sa->last_update_time); #endif =20 return decayed; @@ -3834,27 +3819,6 @@ static inline void update_load_avg(struct cfs_rq *cf= s_rq, struct sched_entity *s } } =20 -#ifndef CONFIG_64BIT -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - u64 last_update_time_copy; - u64 last_update_time; - - do { - last_update_time_copy =3D cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time =3D cfs_rq->avg.last_update_time; - } while (last_update_time !=3D last_update_time_copy); - - return last_update_time; -} -#else -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.last_update_time; -} -#endif - /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. 
@@ -6904,21 +6868,8 @@ static void migrate_task_rq_fair(struct task_struct = *p, int new_cpu) if (READ_ONCE(p->__state) =3D=3D TASK_WAKING) { struct sched_entity *se =3D &p->se; struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - u64 min_vruntime; =20 -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy =3D cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime =3D cfs_rq->min_vruntime; - } while (min_vruntime !=3D min_vruntime_copy); -#else - min_vruntime =3D cfs_rq->min_vruntime; -#endif - - se->vruntime -=3D min_vruntime; + se->vruntime -=3D u64_u32_load(cfs_rq->min_vruntime); } =20 if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING) { @@ -11362,10 +11313,7 @@ static void set_next_task_fair(struct rq *rq, stru= ct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline =3D RB_ROOT_CACHED; - cfs_rq->min_vruntime =3D (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy =3D cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index de53be905739..f1a445efdc63 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -528,6 +528,45 @@ struct cfs_bandwidth { }; =20 #endif /* CONFIG_CGROUP_SCHED */ =20 +/* + * u64_u32_load/u64_u32_store + * + * Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var =3D val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy =3D copy; \ + /* \ + * paired with u64_u32_store, ordering access \ + * to var and copy. 
\ + */ \ + smp_rmb(); \ + __val =3D var; \ + } while (__val !=3D __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val =3D (val); \ + var =3D __val; \ + /* \ + * paired with u64_u32_load, ordering access to var and \ + * copy. \ + */ \ + smp_wmb(); \ + copy =3D __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -568,7 +607,7 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; --=20 2.25.1 From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E905FC433F5 for ; Tue, 8 Mar 2022 18:20:26 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1349512AbiCHSVW (ORCPT ); Tue, 8 Mar 2022 13:21:22 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42360 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1348376AbiCHSVP (ORCPT ); Tue, 8 Mar 2022 13:21:15 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id D225855756 for ; Tue, 8 Mar 2022 10:20:17 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id A20A31655; Tue, 8 Mar 2022 10:20:17 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 537CB3FA45; Tue, 8 Mar 2022 10:20:16 -0800 (PST) From: Vincent Donnefort To: 
peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, Vincent Donnefort Subject: [PATCH v3 2/7] sched/fair: Decay task PELT values during migration Date: Tue, 8 Mar 2022 18:19:52 +0000 Message-Id: <20220308181957.280354-3-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: <20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Before being migrated to a new CPU, a task sees its PELT values synchronized with rq last_update_time. Once done, that same task will also have its sched_avg last_update_time reset. This means the time between the migration and the last clock update (B) will not be accounted for in util_avg and a discontinuity will appear. This issue is amplified by the PELT clock scaling. If the clock hasn't been updated while the CPU is idle, clock_pelt will not be aligned with clock_task and that time (A) will be also lost. ---------|----- A -----|-----------|------- B -----|> clock_pelt clock_task clock now This is especially problematic for asymmetric CPU capacity systems which need stable util_avg signals for task placement and energy estimation. Ideally, this problem would be solved by updating the runqueue clocks before the migration. But that would require taking the runqueue lock which is quite expensive [1]. Instead estimate the missing time and update the task util_avg with that value: A + B =3D clock_task - clock_pelt + sched_clock_cpu() - clock Neither clock_task, clock_pelt nor clock can be accessed without the runqueue lock. 
The new cfs_rq last_update_lag is therefore created and encodes those three values when the last_update_time value for that very same cfs_rq is updated. last_update_lag =3D clock - clock_task + clock_pelt And we can then write the missing time as follows: A + B =3D sched_clock_cpu() - last_update_lag The B. part of the missing time is however an estimation that doesn't take into account IRQ and Paravirt time. Now that we have an estimation for A + B, we can create an estimator for the PELT value at the time of the migration. We need for this purpose to inject last_update_time which is a combination of both clock_pelt and lost_idle_time. The latter is a time value which is completely lost from a PELT point of view and must be ignored. And finally, we can write: rq_clock_pelt_estimator() =3D last_update_time + A + B =3D last_update_time + sched_clock_cpu() - last_update_lag This estimation has a cost, mostly due to sched_clock_cpu(). Limit the usage to the case where the source CPU is idle as we know this is when the clock is having the biggest risk of being outdated. [1] https://lore.kernel.org/all/20190709115759.10451-1-chris.redpath@arm.co= m/ Signed-off-by: Vincent Donnefort diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 99ea9540ece4..1f83616a44d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3625,6 +3625,22 @@ static inline void add_tg_cfs_propagate(struct cfs_r= q *cfs_rq, long runnable_sum =20 #endif /* CONFIG_FAIR_GROUP_SCHED */ =20 +#ifdef CONFIG_NO_HZ_COMMON +static inline void update_cfs_rq_lag(struct cfs_rq *cfs_rq) +{ + struct rq *rq =3D rq_of(cfs_rq); + + u64_u32_store(cfs_rq->last_update_lag, +#ifdef CONFIG_CFS_BANDWIDTH + /* Timer stopped by throttling */ + unlikely(cfs_rq->throttle_count) ? 
U64_MAX : +#endif + rq->clock - rq->clock_task + rq->clock_pelt); +} +#else +static void update_cfs_rq_lag(struct cfs_rq *cfs_rq) {} +#endif + /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() @@ -3688,6 +3704,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->last_update_time_copy, sa->last_update_time); #endif + update_cfs_rq_lag(cfs_rq); =20 return decayed; } @@ -6852,6 +6869,44 @@ select_task_rq_fair(struct task_struct *p, int prev_= cpu, int wake_flags) =20 static void detach_entity_cfs_rq(struct sched_entity *se); =20 +#ifdef CONFIG_NO_HZ_COMMON +static inline void migrate_se_pelt_lag(struct sched_entity *se) +{ + u64 now, last_update_lag; + struct cfs_rq *cfs_rq; + struct rq *rq; + bool is_idle; + + cfs_rq =3D cfs_rq_of(se); + rq =3D rq_of(cfs_rq); + + rcu_read_lock(); + is_idle =3D is_idle_task(rcu_dereference(rq->curr)); + rcu_read_unlock(); + + /* + * The lag estimation comes with a cost we don't want to pay all the + * time. Hence, limiting to the case where the source CPU is idle and + * we know we are at the greatest risk to have an outdated clock. 
+ */ + if (!is_idle) + return; + + last_update_lag =3D u64_u32_load(cfs_rq->last_update_lag); + + /* The clock has been stopped for throttling */ + if (last_update_lag =3D=3D U64_MAX) + return; + + now =3D se->avg.last_update_time - last_update_lag + + sched_clock_cpu(cpu_of(rq)); + + __update_load_avg_blocked_se(now, se); +} +#else +static void migrate_se_pelt_lag(struct sched_entity *se) {} +#endif + /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) = and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6859,6 +6914,9 @@ static void detach_entity_cfs_rq(struct sched_entity = *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { + struct sched_entity *se =3D &p->se; + struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new @@ -6866,8 +6924,6 @@ static void migrate_task_rq_fair(struct task_struct *= p, int new_cpu) * the task on the new runqueue. */ if (READ_ONCE(p->__state) =3D=3D TASK_WAKING) { - struct sched_entity *se =3D &p->se; - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); =20 se->vruntime -=3D u64_u32_load(cfs_rq->min_vruntime); } @@ -6878,25 +6934,28 @@ static void migrate_task_rq_fair(struct task_struct= *p, int new_cpu) * rq->lock and can modify state directly. */ lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(&p->se); + detach_entity_cfs_rq(se); =20 } else { + remove_entity_load_avg(se); + /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. + * Here, the task's PELT values have been updated according to + * the current rq's clock. 
But if that clock hasn't been + * updated in a while, a substantial idle time will be missed, + * leading to an inflation after wake-up on the new rq. + * + * Estimate the missing time from the rq clock and update + * sched_avg to improve the PELT continuity after migration. */ - remove_entity_load_avg(&p->se); + migrate_se_pelt_lag(se); } =20 /* Tell new CPU we are migrated */ - p->se.avg.last_update_time =3D 0; + se->avg.last_update_time =3D 0; =20 /* We have migrated, no longer consider this task hot */ - p->se.exec_start =3D 0; + se->exec_start =3D 0; =20 update_scan_period(p, new_cpu); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1a445efdc63..982691ffe9a1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -608,6 +608,12 @@ struct cfs_rq { struct sched_avg avg; #ifndef CONFIG_64BIT u64 last_update_time_copy; +#endif +#ifdef CONFIG_NO_HZ_COMMON + u64 last_update_lag; +#ifndef CONFIG_64BIT + u64 last_update_lag_copy; +#endif #endif struct { raw_spinlock_t lock ____cacheline_aligned; --=20 2.25.1 From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id B0E4DC433F5 for ; Tue, 8 Mar 2022 18:20:37 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1349542AbiCHSVd (ORCPT ); Tue, 8 Mar 2022 13:21:33 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42392 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1349497AbiCHSVQ (ORCPT ); Tue, 8 Mar 2022 13:21:16 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 48F7B5674A for ; Tue, 8 Mar 2022 10:20:19 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) 
with ESMTP id 15466165C; Tue, 8 Mar 2022 10:20:19 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id DE4EF3FA45; Tue, 8 Mar 2022 10:20:17 -0800 (PST) From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com Subject: [PATCH v3 3/7] sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util() Date: Tue, 8 Mar 2022 18:19:53 +0000 Message-Id: <20220308181957.280354-4-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: <20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Dietmar Eggemann effective_cpu_util() already has a `int cpu' parameter which allows to retrieve the CPU capacity scale factor (or maximum CPU capacity) inside this function via an arch_scale_cpu_capacity(cpu). A lot of code calling effective_cpu_util() (or the shim sched_cpu_util()) needs the maximum CPU capacity, i.e. it will call arch_scale_cpu_capacity() already. But not having to pass it into effective_cpu_util() will make the EAS wake-up code easier, especially when the maximum CPU capacity reduced by the thermal pressure is passed through the EAS wake-up functions. Due to the asymmetric CPU capacity support of arm/arm64 architectures, arch_scale_cpu_capacity(int cpu) is a per-CPU variable read access via per_cpu(cpu_scale, cpu) on such a system. On all other architectures it is a compile-time constant (SCHED_CAPACITY_SCALE). 
Signed-off-by: Dietmar Eggemann diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index b740866b228d..0d57bcf83ae5 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -70,34 +70,19 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 po= wer_limit) =20 static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power) { - unsigned long max =3D 0, sum_util =3D 0; + unsigned long max, sum_util =3D 0; int cpu; =20 - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - - /* - * The capacity is the same for all CPUs belonging to - * the same perf domain, so a single call to - * arch_scale_cpu_capacity() is enough. However, we - * need the CPU parameter to be initialized by the - * loop, so the call ends up in this block. - * - * We can initialize 'max' with a cpumask_first() call - * before the loop but the bits computation is not - * worth given the arch_scale_cpu_capacity() just - * returns a value where the resulting assembly code - * will be optimized by the compiler. - */ - max =3D arch_scale_cpu_capacity(cpu); - sum_util +=3D sched_cpu_util(cpu, max); - } - /* - * In the improbable case where all the CPUs of the perf - * domain are offline, 'max' will be zero and will lead to an - * illegal operation with a zero division. + * The capacity is the same for all CPUs belonging to + * the same perf domain. */ - return max ? 
(power * ((sum_util << 10) / max)) >> 10 : 0; + max =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); + + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) + sum_util +=3D sched_cpu_util(cpu); + + return (power * ((sum_util << 10) / max)) >> 10; } =20 static u64 get_pd_power_uw(struct dtpm *dtpm) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_co= oling.c index 0bfb8eebd126..3f514ff3d9aa 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -137,11 +137,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_de= vice *cpufreq_cdev, static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, int cpu_idx) { - unsigned long max =3D arch_scale_cpu_capacity(cpu); - unsigned long util; + unsigned long util =3D sched_cpu_util(cpu); =20 - util =3D sched_cpu_util(cpu, max); - return (util * 100) / max; + return (util * 100) / arch_scale_cpu_capacity(cpu); } #else /* !CONFIG_SMP */ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, diff --git a/include/linux/sched.h b/include/linux/sched.h index 508b91d57470..e231fc71eb84 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2193,7 +2193,7 @@ static inline bool owner_on_cpu(struct task_struct *o= wner) } =20 /* Returns effective CPU energy utilization, as seen by the scheduler */ -unsigned long sched_cpu_util(int cpu, unsigned long max); +unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ =20 #ifdef CONFIG_RSEQ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e4ae00e52d1..a38d27abdf8d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7082,12 +7082,14 @@ struct task_struct *idle_task(int cpu) * required to meet deadlines. 
*/ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p) { - unsigned long dl_util, util, irq; + unsigned long dl_util, util, irq, max; struct rq *rq =3D cpu_rq(cpu); =20 + max =3D arch_scale_cpu_capacity(cpu); + if (!uclamp_is_used() && type =3D=3D FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; @@ -7167,10 +7169,9 @@ unsigned long effective_cpu_util(int cpu, unsigned l= ong util_cfs, return min(max, util); } =20 -unsigned long sched_cpu_util(int cpu, unsigned long max) +unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, - ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); } #endif /* CONFIG_SMP */ =20 diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedu= til.c index 26778884d9ab..9b88fc8c6ea8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -164,11 +164,10 @@ static unsigned int get_next_freq(struct sugov_policy= *sg_policy, static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq =3D cpu_rq(sg_cpu->cpu); - unsigned long max =3D arch_scale_cpu_capacity(sg_cpu->cpu); =20 - sg_cpu->max =3D max; + sg_cpu->max =3D arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl =3D cpu_bw_dl(rq); - sg_cpu->util =3D effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu= ), max, + sg_cpu->util =3D effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu= ), FREQUENCY_UTIL, NULL); } =20 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f83616a44d1..bbc44c3bc47c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6575,12 +6575,11 @@ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { struct cpumask *pd_mask =3D perf_domain_span(pd); - unsigned long cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util =3D 0, 
sum_util =3D 0; - unsigned long _cpu_cap =3D cpu_cap; + unsigned long max_util =3D 0, sum_util =3D 0, cpu_cap; int cpu; =20 - _cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(pd_mask)); + cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); + cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(pd_mask)); =20 /* * The capacity state of CPUs of the current rd can be driven by CPUs @@ -6617,10 +6616,10 @@ compute_energy(struct task_struct *p, int dst_cpu, = struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - cpu_util =3D effective_cpu_util(cpu, util_running, cpu_cap, - ENERGY_UTIL, NULL); + cpu_util =3D effective_cpu_util(cpu, util_running, ENERGY_UTIL, + NULL); =20 - sum_util +=3D min(cpu_util, _cpu_cap); + sum_util +=3D min(cpu_util, cpu_cap); =20 /* * Performance domain frequency: utilization clamping @@ -6629,12 +6628,12 @@ compute_energy(struct task_struct *p, int dst_cpu, = struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - cpu_util =3D effective_cpu_util(cpu, util_freq, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util =3D max(max_util, min(cpu_util, _cpu_cap)); + cpu_util =3D effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL, + tsk); + max_util =3D max(max_util, min(cpu_util, cpu_cap)); } =20 - return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); + return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap); } =20 /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 982691ffe9a1..6c0b91e66b12 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2998,7 +2998,7 @@ enum cpu_util_type { }; =20 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p); =20 static inline unsigned long cpu_bw_dl(struct rq *rq) --=20 2.25.1 From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 18778C433EF for ; Tue, 8 Mar 2022 18:20:30 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1349521AbiCHSVZ (ORCPT ); Tue, 8 Mar 2022 13:21:25 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42410 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1349500AbiCHSVR (ORCPT ); Tue, 8 Mar 2022 13:21:17 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id B6F3C56770 for ; Tue, 8 Mar 2022 10:20:20 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 7EC8C1682; Tue, 8 Mar 2022 10:20:20 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 5109E3FA45; Tue, 8 
Mar 2022 10:20:19 -0800 (PST) From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com Subject: [PATCH v3 4/7] sched/fair: Rename select_idle_mask to select_rq_mask Date: Tue, 8 Mar 2022 18:19:54 +0000 Message-Id: <20220308181957.280354-5-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: <20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Dietmar Eggemann Decouple the name of the per-cpu cpumask select_idle_mask from its usage in select_idle_[cpu/capacity]() of the CFS run-queue selection (select_task_rq_fair()). This is to support the reuse of this cpumask in the Energy Aware Scheduling (EAS) path (find_energy_efficient_cpu()) of the CFS run-queue selection. 
Signed-off-by: Dietmar Eggemann diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a38d27abdf8d..d0363766b4b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9293,7 +9293,7 @@ static struct kmem_cache *task_group_cache __read_mos= tly; #endif =20 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); =20 void __init sched_init(void) { @@ -9342,7 +9342,7 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) =3D (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_idle_mask, i) =3D (cpumask_var_t)kzalloc_node( + per_cpu(select_rq_mask, i) =3D (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bbc44c3bc47c..0ebfaa2fc1f4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5726,7 +5726,7 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) =20 /* Working cpumask for: load_balance, load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); =20 #ifdef CONFIG_NO_HZ_COMMON =20 @@ -6216,7 +6216,7 @@ static inline int select_idle_smt(struct task_struct = *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd,= bool has_idle_core, int target) { - struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_idle_mask); + struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu =3D -1, nr =3D INT_MAX; struct rq *this_rq =3D this_rq(); int this =3D smp_processor_id(); @@ -6302,7 +6302,7 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) int cpu, best_cpu =3D -1; struct cpumask *cpus; =20 - cpus =3D this_cpu_cpumask_var_ptr(select_idle_mask); + cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); =20 task_util =3D uclamp_task_util(p); --=20 2.25.1 From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3823DC433EF for ; Tue, 8 Mar 2022 18:20:34 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1349531AbiCHSV2 (ORCPT ); Tue, 8 Mar 2022 13:21:28 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42422 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1349504AbiCHSVT (ORCPT ); Tue, 8 Mar 2022 13:21:19 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 268E056772 for ; Tue, 8 Mar 2022 10:20:22 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 
E11B01516; Tue, 8 Mar 2022 10:20:21 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id B77993FA45; Tue, 8 Mar 2022 10:20:20 -0800 (PST) From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com Subject: [PATCH v3 5/7] sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu() Date: Tue, 8 Mar 2022 18:19:55 +0000 Message-Id: <20220308181957.280354-6-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: <20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Dietmar Eggemann The Perf Domain (PD) cpumask (struct em_perf_domain.cpus) stays invariant after Energy Model creation, i.e. it is not updated after CPU hotplug operations. That's why the PD mask is used in conjunction with the cpu_online_mask (or Sched Domain cpumask). Thereby the cpu_online_mask is fetched multiple times (in compute_energy()) during a run-queue selection for a task. cpu_online_mask may change during this time which can lead to wrong energy calculations. To be able to avoid this, use the select_rq_mask per-cpu cpumask to create a cpumask out of PD cpumask and cpu_online_mask and pass it through the function calls of the EAS run-queue selection path. The PD cpumask for max_spare_cap_cpu/compute_prev_delta selection (find_energy_efficient_cpu()) is now ANDed not only with the SD mask but also with the cpu_online_mask. This is fine since this cpumask has to be in sync with the one used for energy computation (compute_energy()). 
An exclusive cpuset setup with at least one asymmetric CPU capacity island (hence the additional AND with the SD cpumask) is the obvious exception here. Signed-off-by: Dietmar Eggemann diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0ebfaa2fc1f4..07de5c63c75f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6572,14 +6572,14 @@ static unsigned long cpu_util_next(int cpu, struct = task_struct *p, int dst_cpu) * task. */ static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus, + struct perf_domain *pd) { - struct cpumask *pd_mask =3D perf_domain_span(pd); unsigned long max_util =3D 0, sum_util =3D 0, cpu_cap; int cpu; =20 - cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); - cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(pd_mask)); + cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(cpus)); + cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(cpus)); =20 /* * The capacity state of CPUs of the current rd can be driven by CPUs @@ -6590,7 +6590,7 @@ compute_energy(struct task_struct *p, int dst_cpu, st= ruct perf_domain *pd) * If an entire pd is outside of the current rd, it will not appear in * its pd list and will not be accounted by compute_energy(). 
*/ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + for_each_cpu(cpu, cpus) { unsigned long util_freq =3D cpu_util_next(cpu, p, dst_cpu); unsigned long cpu_util, util_running =3D util_freq; struct task_struct *tsk =3D NULL; @@ -6677,6 +6677,7 @@ compute_energy(struct task_struct *p, int dst_cpu, st= ruct perf_domain *pd) */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { + struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; struct root_domain *rd =3D cpu_rq(smp_processor_id())->rd; int cpu, best_energy_cpu =3D prev_cpu, target =3D -1; @@ -6711,7 +6712,9 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) unsigned long base_energy_pd; int max_spare_cap_cpu =3D -1; =20 - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + + for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; =20 @@ -6748,12 +6751,12 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) continue; =20 /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd =3D compute_energy(p, -1, pd); + base_energy_pd =3D compute_energy(p, -1, cpus, pd); base_energy +=3D base_energy_pd; =20 /* Evaluate the energy impact of using prev_cpu. */ if (compute_prev_delta) { - prev_delta =3D compute_energy(p, prev_cpu, pd); + prev_delta =3D compute_energy(p, prev_cpu, cpus, pd); if (prev_delta < base_energy_pd) goto unlock; prev_delta -=3D base_energy_pd; @@ -6762,7 +6765,8 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) =20 /* Evaluate the energy impact of using max_spare_cap_cpu. 
*/ if (max_spare_cap_cpu >=3D 0) { - cur_delta =3D compute_energy(p, max_spare_cap_cpu, pd); + cur_delta =3D compute_energy(p, max_spare_cap_cpu, cpus, + pd); if (cur_delta < base_energy_pd) goto unlock; cur_delta -=3D base_energy_pd; --=20 2.25.1 From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 9576BC433F5 for ; Tue, 8 Mar 2022 18:20:44 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S244235AbiCHSVh (ORCPT ); Tue, 8 Mar 2022 13:21:37 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42506 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1349516AbiCHSVW (ORCPT ); Tue, 8 Mar 2022 13:21:22 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id B1C1056C02 for ; Tue, 8 Mar 2022 10:20:23 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 7BA7F1650; Tue, 8 Mar 2022 10:20:23 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 2889D3FA45; Tue, 8 Mar 2022 10:20:22 -0800 (PST) From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, Vincent Donnefort Subject: [PATCH v3 6/7] sched/fair: Remove task_util from effective utilization in feec() Date: Tue, 8 Mar 2022 18:19:56 +0000 Message-Id: <20220308181957.280354-7-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: 
<20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" The energy estimation in find_energy_efficient_cpu() (feec()) relies on the computation of the effective utilization for each CPU of a perf domain (PD). This effective utilization is then used as an estimation of the busy time for this pd. The function effective_cpu_util(), which gives this value, scales the utilization relative to IRQ pressure on the CPU to take into account that the IRQ time is hidden from the task clock. The IRQ scaling is as follows: effective_cpu_util =3D irq + (cpu_cap - irq)/cpu_cap * util Where util is the sum of CFS/RT/DL utilization, cpu_cap the capacity of the CPU and irq the IRQ avg time. If now we take as an example a task placement which doesn't raise the OPP on the candidate CPU, we can write the energy delta as: delta =3D OPPcost/cpu_cap * (effective_cpu_util(cpu_util + task_util) - effective_cpu_util(cpu_util)) =3D OPPcost/cpu_cap * (cpu_cap - irq)/cpu_cap * task_util We end up with an energy delta depending on the IRQ avg time, which is a problem: first the time spent on IRQs by a CPU has no effect on the additional energy that would be consumed by a task. Second, we don't want to favour a CPU with a higher IRQ avg time value. Nonetheless, we need to take the IRQ avg time into account. If a task placement raises the PD's frequency, it will increase the energy cost for the entire time where the CPU is busy. A solution is to only use effective_cpu_util() with the CPU contribution part. The task contribution is added separately and scaled according to prev_cpu's IRQ time. No change for the FREQUENCY_UTIL component of the energy estimation. We still want to get the actual frequency that would be selected after the task placement. 
Signed-off-by: Vincent Donnefort diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 07de5c63c75f..b48ba181c8ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6565,61 +6565,97 @@ static unsigned long cpu_util_next(int cpu, struct = task_struct *p, int dst_cpu) } =20 /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utiliz= ation - * landscape of @pd's CPUs after the task migration, and uses the Energy M= odel - * to compute what would be the energy if we decided to actually migrate t= hat - * task. + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test= the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain. + * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). + */ +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; + +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPUs where the task has + * run. + */ +static inline void eenv_task_busy_time(struct energy_env *eenv, + struct task_struct *p, int prev_cpu) +{ + unsigned long max_cap =3D arch_scale_cpu_capacity(prev_cpu); + unsigned long irq =3D cpu_util_irq(cpu_rq(prev_cpu)); + + if (unlikely(irq >=3D max_cap)) { + eenv->task_busy_time =3D max_cap; + return; + } + + eenv->task_busy_time =3D + scale_irq_capacity(task_util_est(p), irq, max_cap); +} + +/* + * Compute the perf_domain (PD) busy time for compute_energy(). 
Based on t= he + * utilization for each @pd_cpus, it however doesn't take into account + * clamping since the ratio (utilization / cpu_capacity) is already enough= to + * scale the EM reported power consumption at the (eventually clamped) + * cpu_capacity. + * + * The contribution of the task @p for which we want to estimate the + * energy cost is removed (by cpu_util_next()) and must be calculated + * separately (see eenv_task_busy_time). This ensures: + * + * - A stable PD utilization, no matter which CPU of that PD we want to = place + * the task on. + * + * - A fair comparison between CPUs as the task contribution (task_util(= )) + * will always be the same no matter which CPU utilization we rely on + * (util_avg or util_est). + * + * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't + * exceed @eenv->pd_cap. */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus, - struct perf_domain *pd) +static inline void eenv_pd_busy_time(struct energy_env *eenv, + struct cpumask *pd_cpus, + struct task_struct *p) { - unsigned long max_util =3D 0, sum_util =3D 0, cpu_cap; + unsigned long busy_time =3D 0; int cpu; =20 - cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(cpus)); - cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(cpus)); + for_each_cpu(cpu, pd_cpus) { + unsigned long util =3D cpu_util_next(cpu, p, -1); =20 - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. - * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). 
- */ - for_each_cpu(cpu, cpus) { - unsigned long util_freq =3D cpu_util_next(cpu, p, dst_cpu); - unsigned long cpu_util, util_running =3D util_freq; - struct task_struct *tsk =3D NULL; + busy_time +=3D effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + } =20 - /* - * When @p is placed on @cpu: - * - * util_running =3D max(cpu_util, cpu_util_est) + - * max(task_util, _task_util_est) - * - * while cpu_util_next is: max(cpu_util + task_util, - * cpu_util_est + _task_util_est) - */ - if (cpu =3D=3D dst_cpu) { - tsk =3D p; - util_running =3D - cpu_util_next(cpu, p, -1) + task_util_est(p); - } + eenv->pd_busy_time =3D min(eenv->pd_cap, busy_time); +} =20 - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - cpu_util =3D effective_cpu_util(cpu, util_running, ENERGY_UTIL, - NULL); +/* + * Compute the maximum utilization for compute_energy() when the task @p + * is placed on the cpu @dst_cpu. + * + * Returns the maximum utilization among @eenv->cpus. This utilization can= 't + * exceed @eenv->cpu_cap. + */ +static inline unsigned long +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, + struct task_struct *p, int dst_cpu) +{ + unsigned long max_util =3D 0; + int cpu; =20 - sum_util +=3D min(cpu_util, cpu_cap); + for_each_cpu(cpu, pd_cpus) { + struct task_struct *tsk =3D (cpu =3D=3D dst_cpu) ? p : NULL; + unsigned long util =3D cpu_util_next(cpu, p, dst_cpu); + unsigned long cpu_util; =20 /* * Performance domain frequency: utilization clamping @@ -6628,12 +6664,30 @@ compute_energy(struct task_struct *p, int dst_cpu, = struct cpumask *cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - cpu_util =3D effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL, - tsk); - max_util =3D max(max_util, min(cpu_util, cpu_cap)); + cpu_util =3D effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + max_util =3D max(max_util, cpu_util); } =20 - return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap); + return min(max_util, eenv->cpu_cap); +} + +/* + * compute_energy(): Use the Energy Model to estimate the energy that @pd = would + * consume for a given utilization landscape @eenv. If @dst_cpu < 0 the ta= sk + * contribution is removed from the energy estimation. + */ +static inline unsigned long +compute_energy(struct energy_env *eenv, struct perf_domain *pd, + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) +{ + unsigned long max_util =3D eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); + unsigned long busy_time =3D eenv->pd_busy_time; + + if (dst_cpu >=3D 0) + busy_time =3D min(eenv->pd_cap, + eenv->pd_busy_time + eenv->task_busy_time); + + return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); } =20 /* @@ -6681,9 +6735,11 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; struct root_domain *rd =3D cpu_rq(smp_processor_id())->rd; int cpu, best_energy_cpu =3D prev_cpu, target =3D -1; - unsigned long cpu_cap, util, base_energy =3D 0; + unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long base_energy =3D 0; struct sched_domain *sd; struct perf_domain *pd; + struct energy_env eenv; =20 rcu_read_lock(); pd =3D rcu_dereference(rd->pd); @@ -6706,6 +6762,8 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) if (!task_util_est(p)) goto unlock; =20 + eenv_task_busy_time(&eenv, p, prev_cpu); + for (; pd; pd =3D pd->next) { unsigned long cur_delta, spare_cap, max_spare_cap =3D 0; bool compute_prev_delta =3D false; @@ -6714,7 +6772,20 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) =20 
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); =20 - for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) { + /* Account thermal pressure for the energy estimation */ + cpu =3D cpumask_first(cpus); + cpu_thermal_cap =3D arch_scale_cpu_capacity(cpu); + cpu_thermal_cap -=3D arch_scale_thermal_pressure(cpu); + + eenv.cpu_cap =3D cpu_thermal_cap; + eenv.pd_cap =3D 0; + + for_each_cpu(cpu, cpus) { + eenv.pd_cap +=3D cpu_thermal_cap; + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; =20 @@ -6751,12 +6822,14 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) continue; =20 /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd =3D compute_energy(p, -1, cpus, pd); + eenv_pd_busy_time(&eenv, cpus, p); + base_energy_pd =3D compute_energy(&eenv, pd, cpus, p, -1); base_energy +=3D base_energy_pd; =20 /* Evaluate the energy impact of using prev_cpu. */ if (compute_prev_delta) { - prev_delta =3D compute_energy(p, prev_cpu, cpus, pd); + prev_delta =3D compute_energy(&eenv, pd, cpus, p, + prev_cpu); if (prev_delta < base_energy_pd) goto unlock; prev_delta -=3D base_energy_pd; @@ -6765,8 +6838,8 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) =20 /* Evaluate the energy impact of using max_spare_cap_cpu. 
*/ if (max_spare_cap_cpu >=3D 0) { - cur_delta =3D compute_energy(p, max_spare_cap_cpu, cpus, - pd); + cur_delta =3D compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); if (cur_delta < base_energy_pd) goto unlock; cur_delta -=3D base_energy_pd; --=20 2.25.1 From nobody Tue Jun 23 08:15:28 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id EAE74C433EF for ; Tue, 8 Mar 2022 18:20:49 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1349529AbiCHSVp (ORCPT ); Tue, 8 Mar 2022 13:21:45 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42528 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1349510AbiCHSVW (ORCPT ); Tue, 8 Mar 2022 13:21:22 -0500 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 7412056C1C for ; Tue, 8 Mar 2022 10:20:25 -0800 (PST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 076941516; Tue, 8 Mar 2022 10:20:25 -0800 (PST) Received: from localhost.localdomain (unknown [10.57.88.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id B09653FA45; Tue, 8 Mar 2022 10:20:23 -0800 (PST) From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, Vincent Donnefort Subject: [PATCH v3 7/7] sched/fair: Remove the energy margin in feec() Date: Tue, 8 Mar 2022 18:19:57 +0000 Message-Id: <20220308181957.280354-8-vincent.donnefort@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220308181957.280354-1-vincent.donnefort@arm.com> References: 
<20220308181957.280354-1-vincent.donnefort@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" find_energy_efficient_cpu() integrates a margin to protect tasks from bouncing back and forth from one CPU to another. This margin is set as being 6% of the total current energy estimated on the system. This however does not work for two reasons: 1. The energy estimation is not a good absolute value: The function compute_energy(), used in feec(), is a good estimation for task placement as it allows comparing the energy with and without a task. The computed delta will give a good overview of the cost for a certain task placement. It, however, doesn't work as an absolute estimation for the total energy of the system. First it adds the contribution to idle CPUs into the energy, second it mixes util_avg with util_est values. util_avg integrates the near history of a CPU's usage; it doesn't tell at all what the current utilization is. A system that has been quite busy in the near past will hold a very high energy and then a high margin preventing any task migration to a lower capacity CPU, wasting energy. It even creates a negative feedback loop: by holding the tasks on a less efficient CPU, the margin contributes to keeping the energy high. 2. The margin handicaps small tasks: On a system where the workload is composed mostly of small tasks (which is often the case on Android), the overall energy will be high enough to create a margin none of those tasks can cross. e.g. On a Pixel4, a small utilization of 5% on all the CPUs creates a global estimated energy of 140 joules, as per the Energy Model declaration of that same device. This means, after applying the 6% margin, that any migration must save more than 8 joules to happen. No task with a utilization lower than 40 would then be able to migrate away from the biggest CPU of the system. 
The 6% of the overall system energy was brought by the following patch: (eb92692b2544 sched/fair: Speed-up energy-aware wake-ups) It was previously 6% of the prev_cpu energy. Also, the following one made this margin value conditional on the clusters where the task fits: (8d4c97c105ca sched/fair: Only compute base_energy_pd if necessary) We could simply revert that margin change to what it was, but the original version didn't have strong grounds either and as demonstrated in (1.) the estimated energy isn't a good absolute value. Instead, remove it completely. This is indeed made possible by recent changes that improved energy estimation comparison fairness (sched/fair: Remove task_util from effective utilization in feec()) (PM: EM: Increase energy calculation precision) and task utilization stabilization (sched/fair: Decay task util_avg during migration) Without a margin, we could have feared bouncing between CPUs. But running LISA's eas_behaviour test coverage on three different platforms (Hikey960, RB-5 and DB-845) showed no issue and even fixed previously known failures. Removing the energy margin enables more energy-optimized placements for a more energy-efficient system. 
Signed-off-by: Vincent Donnefort diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b48ba181c8ec..05518a6150e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6736,7 +6736,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) struct root_domain *rd =3D cpu_rq(smp_processor_id())->rd; int cpu, best_energy_cpu =3D prev_cpu, target =3D -1; unsigned long cpu_cap, cpu_thermal_cap, util; - unsigned long base_energy =3D 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -6767,8 +6766,8 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) for (; pd; pd =3D pd->next) { unsigned long cur_delta, spare_cap, max_spare_cap =3D 0; bool compute_prev_delta =3D false; - unsigned long base_energy_pd; int max_spare_cap_cpu =3D -1; + unsigned long base_energy; =20 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); =20 @@ -6823,16 +6822,15 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) =20 /* Compute the 'base' energy of the pd, without @p */ eenv_pd_busy_time(&eenv, cpus, p); - base_energy_pd =3D compute_energy(&eenv, pd, cpus, p, -1); - base_energy +=3D base_energy_pd; + base_energy =3D compute_energy(&eenv, pd, cpus, p, -1); =20 /* Evaluate the energy impact of using prev_cpu. 
*/ if (compute_prev_delta) { prev_delta =3D compute_energy(&eenv, pd, cpus, p, prev_cpu); - if (prev_delta < base_energy_pd) + if (prev_delta < base_energy) goto unlock; - prev_delta -=3D base_energy_pd; + prev_delta -=3D base_energy; best_delta =3D min(best_delta, prev_delta); } =20 @@ -6840,9 +6838,9 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) if (max_spare_cap_cpu >=3D 0) { cur_delta =3D compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); - if (cur_delta < base_energy_pd) + if (cur_delta < base_energy) goto unlock; - cur_delta -=3D base_energy_pd; + cur_delta -=3D base_energy; if (cur_delta < best_delta) { best_delta =3D cur_delta; best_energy_cpu =3D max_spare_cap_cpu; @@ -6851,12 +6849,7 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) } rcu_read_unlock(); =20 - /* - * Pick the best CPU if prev_cpu cannot be used, or if it saves at - * least 6% of the energy used by prev_cpu. - */ - if ((prev_delta =3D=3D ULONG_MAX) || - (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) + if (best_delta < prev_delta) target =3D best_energy_cpu; =20 return target; --=20 2.25.1