From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id CE9C2136E1A for ; Mon, 24 Jun 2024 10:24:05 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224647; cv=none; b=hGLFf8XF4TZs0CHNNoNs990uUfzC9l61StSDblBSadGgtDE9/bJ2gwVV4EOsT0jTqx/TYuPwh9Yvhr/qgpYobyPX9ClG0e4HmT5T6VqP3Kkt3YgIc7a+moX5qw9BXH9/qIT0tzXxRh5eko2NgLkK88OKJRQ1lTc4Cs6PLge4MOk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224647; c=relaxed/simple; bh=zr1bNmbnSH/iHggyquhCm1tAbo6P9Nb6vXtmFOhx/8w=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=Y8XLUMU0flbyNhF058M80rSu3fYpCWu9EVZpRYHoJHYYgz7G8q8fzOGvIP3xr5Uw8oDW5YqwdoYtCTHENmVmjWXbTHcRt9nQwkjIOIJ38EBzZy5G6pwB5tBlo6/kTHzvEOzKyyZziIRuDbv8GLg5Zj6aXFA7ZqoTWlKoKrGHKzM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 07A87FEC; Mon, 24 Jun 2024 03:24:30 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 0A2FA3F766; Mon, 24 Jun 2024 03:24:02 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , 
Youssef Esmat , linux-kernel@vger.kernel.org, Hongyan Xia Subject: [PATCH 1/7] Revert "sched/uclamp: Set max_spare_cap_cpu even if max_spare_cap is 0" Date: Mon, 24 Jun 2024 11:23:50 +0100 Message-Id: <816d80b45081ead13927c86be998c21db03b5ddf.1719223916.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Hongyan Xia That commit creates further problems because 0 spare capacity can be either a real indication that the CPU is maxed out, or the CPU is UCLAMP_MAX throttled, but we end up giving all of them a chance which can result in bogus energy calculations. It also tends to schedule tasks on the same CPU and requires load balancing patches. Sum aggregation solves these problems and this patch is not needed. This reverts commit 6b00a40147653c8ea748e8f4396510f252763364. 
Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 41b58387023d..2f8ed6561a9f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8028,10 +8028,11 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) for (; pd; pd =3D pd->next) { unsigned long util_min =3D p_util_min, util_max =3D p_util_max; unsigned long cpu_cap, cpu_actual_cap, util; - long prev_spare_cap =3D -1, max_spare_cap =3D -1; + unsigned long cur_delta, max_spare_cap =3D 0; unsigned long rq_util_min, rq_util_max; - unsigned long cur_delta, base_energy; + unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; + unsigned long base_energy; int fits, max_fits =3D -1; =20 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -8093,7 +8094,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) prev_spare_cap =3D cpu_cap; prev_fits =3D fits; } else if ((fits > max_fits) || - ((fits =3D=3D max_fits) && ((long)cpu_cap > max_spare_cap))) { + ((fits =3D=3D max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -8105,7 +8106,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) } } =20 - if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap =3D=3D 0) continue; =20 eenv_pd_busy_time(&eenv, cpus, p); @@ -8113,7 +8114,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) base_energy =3D compute_energy(&eenv, pd, cpus, p, -1); =20 /* Evaluate the energy impact of using prev_cpu. 
*/ - if (prev_spare_cap > -1) { + if (prev_spare_cap > 0) { prev_delta =3D compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ --=20 2.34.1 From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 5601D137904 for ; Mon, 24 Jun 2024 10:24:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224651; cv=none; b=VIdcgi1ShecidLZ1+afP8s+xxlgBw7W/P5CAkigsveaIe/9+OwbjYHyKltITmJpu8YKAT7yjyCVN5m2jFrqA1Y550Vq7+brLzxdtFFOKID5JBfhoza0rDd8hqrFUsfFuFNxfXatxUZkP4tcEC9qaU94+JfW5IydeOjzFxCnaFjU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224651; c=relaxed/simple; bh=X2aDfNQVEg4IgBUI1QgLufrrmUjBVes4+C6giy6UdMA=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=aXgJYRpNo8vmwX+RcFtXVEHIpkwupj8yjTWBdo1cFlTDtIprTuSRpDwQ9QYP6WYfiTS9E+zFKp7OSbYHLUoghN62yQlraVDg5Fnk/iB0JPzN7GUykf4I2Yl9DEdfkQAdor9pr6zGu6A/dCbTnl2PVqaDrBmjGpheGzqz736wJvM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 66BE7113E; Mon, 24 Jun 2024 03:24:33 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 839C23F766; Mon, 24 Jun 2024 03:24:06 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven 
Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , Youssef Esmat , linux-kernel@vger.kernel.org Subject: [PATCH 2/7] sched/uclamp: Track a new util_avg_bias signal Date: Mon, 24 Jun 2024 11:23:51 +0100 Message-Id: <85519f7a06e7a59dca644baca10b78c85749c1b5.1719223916.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add a util_avg_bias signal in sched_avg, which is obtained by: util_avg_bias =3D clamp(util_avg, uclamp_min, uclamp_max) - util_avg The task utilization after considering uclamp is: util_avg_uclamp =3D util_avg + util_avg_bias We then sum up all biases on the same rq and use the total bias to bias the rq utilization. This is the core idea of uclamp sum aggregation. 
The rq utilization will be rq_util_avg_uclamp =3D rq_util_avg + total_util_avg_bias Signed-off-by: Hongyan Xia --- include/linux/sched.h | 3 ++- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 31 +++++++++++++++++++++++++++++++ kernel/sched/pelt.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 24 ++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 90691d99027e..63bcb81b20bb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -476,7 +476,8 @@ struct sched_avg { u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; - unsigned long util_avg; + unsigned int util_avg; + int util_avg_bias; unsigned int util_est; } ____cacheline_aligned; =20 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c1eb9a1afd13..d416be6e3a83 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -683,7 +683,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct c= fs_rq *cfs_rq) cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", cfs_rq->avg.runnable_avg); - SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + SEQ_printf(m, " .%-30s: %u\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est", cfs_rq->avg.util_est); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2f8ed6561a9f..23360c666829 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1089,6 +1089,7 @@ void post_init_entity_util_avg(struct task_struct *p) } =20 sa->runnable_avg =3D sa->util_avg; + sa->util_avg_bias =3D 0; } =20 #else /* !CONFIG_SMP */ @@ -4844,6 +4845,32 @@ static inline unsigned long task_util_est(struct tas= k_struct *p) return max(task_util(p), _task_util_est(p)); } =20 +#ifdef CONFIG_UCLAMP_TASK +static inline long task_util_bias(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg_bias); +} + +static inline unsigned long task_util_uclamp(struct task_struct *p) +{ + long ret =3D 
task_util(p); + + ret +=3D task_util_bias(p); + + return max(ret, 0L); +} +#else +static inline long task_util_bias(struct task_struct *p) +{ + return 0; +} + +static inline unsigned long task_util_uclamp(struct task_struct *p) +{ + return task_util(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -6807,6 +6834,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *= p, int flags) =20 /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); + util_bias_enqueue(rq, p); + /* XXX: We should skip the update above and only do it once here. */ + cpufreq_update_util(rq, 0); =20 /* * Since new tasks are assigned an initial util_avg equal to @@ -6898,6 +6928,7 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) =20 /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + util_bias_dequeue(rq, p); =20 /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fa52906a4478..11aa845d212c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -266,6 +266,39 @@ ___update_load_avg(struct sched_avg *sa, unsigned long= load) WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } =20 +#ifdef CONFIG_UCLAMP_TASK +/* avg must belong to the queue this se is on. */ +static void util_bias_update(struct task_struct *p) +{ + unsigned int util, uclamp_min, uclamp_max; + struct rq *rq; + int old, new; + + util =3D READ_ONCE(p->se.avg.util_avg); + uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); + uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); + /* + * uclamp_max at the max value means there is no uclamp_max, and should + * not have any clamping effect at all here. 
+ */ + if (uclamp_max =3D=3D SCHED_CAPACITY_SCALE) + uclamp_max =3D UINT_MAX; + old =3D READ_ONCE(p->se.avg.util_avg_bias); + new =3D (int)clamp(util, uclamp_min, uclamp_max) - (int)util; + + WRITE_ONCE(p->se.avg.util_avg_bias, new); + if (!p->se.on_rq) + return; + rq =3D task_rq(p); + WRITE_ONCE(rq->cfs.avg.util_avg_bias, + READ_ONCE(rq->cfs.avg.util_avg_bias) + new - old); +} +#else /* !CONFIG_UCLAMP_TASK */ +static void util_bias_update(struct task_struct *p) +{ +} +#endif + /* * sched_entity: * @@ -296,6 +329,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_= entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se)); + if (entity_is_task(se)) + util_bias_update(task_of(se)); trace_pelt_se_tp(se); return 1; } @@ -310,6 +345,8 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq= , struct sched_entity *se =20 ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); + if (entity_is_task(se)) + util_bias_update(task_of(se)); trace_pelt_se_tp(se); return 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 62fd8bc6fd08..5f93a6b1b563 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3178,6 +3178,22 @@ uclamp_se_set(struct uclamp_se *uc_se, unsigned int = value, bool user_defined) uc_se->user_defined =3D user_defined; } =20 +static inline void util_bias_enqueue(struct rq *rq, struct task_struct *p) +{ + int rq_val =3D READ_ONCE(rq->cfs.avg.util_avg_bias); + int p_val =3D READ_ONCE(p->se.avg.util_avg_bias); + + WRITE_ONCE(rq->cfs.avg.util_avg_bias, rq_val + p_val); +} + +static inline void util_bias_dequeue(struct rq *rq, struct task_struct *p) +{ + int rq_val =3D READ_ONCE(rq->cfs.avg.util_avg_bias); + int p_val =3D READ_ONCE(p->se.avg.util_avg_bias); + + WRITE_ONCE(rq->cfs.avg.util_avg_bias, rq_val - p_val); +} + #else /* !CONFIG_UCLAMP_TASK: */ =20 static inline unsigned long @@ -3215,6 +3231,14 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) 
return false; } =20 +static inline void util_bias_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void util_bias_dequeue(struct rq *rq, struct task_struct *p) +{ +} + #endif /* !CONFIG_UCLAMP_TASK */ =20 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ --=20 2.34.1 From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 3B42C1369A7 for ; Mon, 24 Jun 2024 10:24:12 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224654; cv=none; b=L20AlMrwzhEegsFgLBcxtlvxUHmZjZzSn9MHdniHcJy69GhsEU5lSyMYP7dCFlGZmEjyOlPRxj6lf/jR7ZW0+SGJcm5ghBca+3Pw7r4MjyFuDqQMXubENgxrUsCyxnbCTXJ9tqYW1UX4QNlQTxXe7pe4AczNsY9NKVRtn6OUjJ4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224654; c=relaxed/simple; bh=ONpgqyi1Co8BYmXFUGQYubc5DOmD5HMOV0F0PQ9jeX0=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=bW8LiWS9/ipCxfsm5kcem/8xNIUrL3JqKlqMZlJoHL4KoklicM8+39Xzs6m7FbnxpVLz4eDvZLp7UHFpv4NpepXTnV/7ZR3iVm14sEJ/FEqglQTRH0hr/i0BsaDBGxC0t1dRMV+qTzBjCKzFSeH9xkBpVykLFSYKpO2xywMbEm4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 56DEADA7; Mon, 24 Jun 2024 03:24:36 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 7732D3F766; Mon, 24 Jun 2024 03:24:09 -0700 (PDT) From: Hongyan 
Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , Youssef Esmat , linux-kernel@vger.kernel.org Subject: [PATCH 3/7] sched/uclamp: Add util_est_uclamp Date: Mon, 24 Jun 2024 11:23:52 +0100 Message-Id: <6db7769986b58c278bef743785f1ff7a348b0d88.1719223916.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The new util_est_uclamp is essentially clamp(util_est, min, max) and follows how util_est operates. Signed-off-by: Hongyan Xia --- include/linux/sched.h | 1 + kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 63bcb81b20bb..0160567314ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -479,6 +479,7 @@ struct sched_avg { unsigned int util_avg; int util_avg_bias; unsigned int util_est; + unsigned int util_est_uclamp; } ____cacheline_aligned; =20 /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23360c666829..0fa48466e02e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4859,6 +4859,16 @@ static inline unsigned long task_util_uclamp(struct = task_struct *p) =20 return max(ret, 0L); } + +static inline unsigned long _task_util_est_uclamp(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_est_uclamp); +} + +static inline unsigned long task_util_est_uclamp(struct task_struct *p) +{ + return max(task_util_uclamp(p), _task_util_est_uclamp(p)); +} #else static inline long task_util_bias(struct task_struct *p) { @@ -4869,6 +4879,16 @@ static inline unsigned long 
task_util_uclamp(struct = task_struct *p) { return task_util(p); } + +static inline unsigned long _task_util_est_uclamp(struct task_struct *p) +{ + return _task_util_est(p); +} + +static inline unsigned long task_util_est_uclamp(struct task_struct *p) +{ + return task_util_est(p); +} #endif =20 static inline void util_est_enqueue(struct cfs_rq *cfs_rq, @@ -4883,6 +4903,9 @@ static inline void util_est_enqueue(struct cfs_rq *cf= s_rq, enqueued =3D cfs_rq->avg.util_est; enqueued +=3D _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est, enqueued); + enqueued =3D cfs_rq->avg.util_est_uclamp; + enqueued +=3D _task_util_est_uclamp(p); + WRITE_ONCE(cfs_rq->avg.util_est_uclamp, enqueued); =20 trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4899,6 +4922,9 @@ static inline void util_est_dequeue(struct cfs_rq *cf= s_rq, enqueued =3D cfs_rq->avg.util_est; enqueued -=3D min_t(unsigned int, enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est, enqueued); + enqueued =3D cfs_rq->avg.util_est_uclamp; + enqueued -=3D min_t(unsigned int, enqueued, _task_util_est_uclamp(p)); + WRITE_ONCE(cfs_rq->avg.util_est_uclamp, enqueued); =20 trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4986,6 +5012,10 @@ static inline void util_est_update(struct cfs_rq *cf= s_rq, ewma -=3D last_ewma_diff; ewma >>=3D UTIL_EST_WEIGHT_SHIFT; done: + WRITE_ONCE(p->se.avg.util_est_uclamp, + clamp(ewma, + (unsigned int)uclamp_eff_value(p, UCLAMP_MIN), + (unsigned int)uclamp_eff_value(p, UCLAMP_MAX))); ewma |=3D UTIL_AVG_UNCHANGED; WRITE_ONCE(p->se.avg.util_est, ewma); =20 --=20 2.34.1 From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 6B3B61369A7; Mon, 24 Jun 2024 10:24:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224658; cv=none; 
b=Un/WuPyJSJmcPleHNHXUgsA9MJdlQbd/93Lguv8zDI90/DVdBOhLKKtQmZpb4qAp42lBjYP4T5cQh/XS3tFx3JEmvGiYipdh1AwYNUwPcwJYi6RklMrcy9NOvmscetD9PcxJGhmdNco8leJvLrf2kGnmNWRe+5lno2Io2zhKfRA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224658; c=relaxed/simple; bh=pf3cQlwDx91QO9IKX3PfiD+j0eC+XqjlE+xssVCqY7c=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=ZoNhgAm7DGMdYHOCT/L32+sVlq3KXhIPmw/9JIi9qgDOyK0mAAS8LId1+Kzgd+zVmo2pEW6SRP5OhrdimG2PgQL9PzGpsXTXpPVoqI0Z6UlA5rdmCoWpvRqC3xzLBsz6HiXqlobboPeHgNbaKG395zyMitIA6XHvZu+KsWGO3nE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 83772DA7; Mon, 24 Jun 2024 03:24:40 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 0C60E3F766; Mon, 24 Jun 2024 03:24:12 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , "Rafael J. 
Wysocki" , Viresh Kumar , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , Youssef Esmat , linux-kernel@vger.kernel.org, linux-pm@vger.kernel.org Subject: [PATCH 4/7] sched/fair: Use util biases for utilization and frequency Date: Mon, 24 Jun 2024 11:23:53 +0100 Message-Id: <4eb24fc71fb858ebe15e8e170f366b8c1eab1781.1719223916.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Use the new util_avg_bias for task and runqueue utilization. We also maintain separate util_est and util_est_uclamp signals. Now that we have the uclamp sum aggregated CFS util value, we do not need to consult uclamp buckets to know how the frequency should be clamped. We simply look at the aggregated top level rq->cfs.avg.util_avg + rq->cfs.avg.util_avg_bias and rq->cfs.avg.util_est_uclamp to know what frequency to choose and how to place tasks. 
Signed-off-by: Hongyan Xia --- kernel/sched/cpufreq_schedutil.c | 12 +- kernel/sched/fair.c | 303 +++++++++++-------------------- kernel/sched/sched.h | 20 +- kernel/sched/syscalls.c | 14 +- 4 files changed, 117 insertions(+), 232 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedu= til.c index eece6244f9d2..65fdcf4d73d1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -197,7 +197,7 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigne= d long actual, =20 static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - unsigned long min, max, util =3D cpu_util_cfs_boost(sg_cpu->cpu); + unsigned long min, max, util =3D cpu_util_cfs_boost_uclamp(sg_cpu->cpu); =20 util =3D effective_cpu_util(sg_cpu->cpu, util, &min, &max); util =3D max(util, boost); @@ -385,11 +385,8 @@ static void sugov_update_single_freq(struct update_uti= l_data *hook, u64 time, /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && + if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && !sg_policy->need_freq_update) { next_f =3D sg_policy->next_freq; =20 @@ -439,11 +436,8 @@ static void sugov_update_single_perf(struct update_uti= l_data *hook, u64 time, /* * Do not reduce the target performance level if the CPU has not been * idle recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. 
*/ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util =3D prev_util; =20 cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fa48466e02e..a25de0044af8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4869,6 +4869,15 @@ static inline unsigned long task_util_est_uclamp(str= uct task_struct *p) { return max(task_util_uclamp(p), _task_util_est_uclamp(p)); } + +static inline unsigned long root_cfs_util_uclamp(struct rq *rq) +{ + long ret =3D READ_ONCE(rq->cfs.avg.util_avg); + + ret +=3D READ_ONCE(rq->cfs.avg.util_avg_bias); + + return max(ret, 0L); +} #else static inline long task_util_bias(struct task_struct *p) { @@ -4889,6 +4898,11 @@ static inline unsigned long task_util_est_uclamp(str= uct task_struct *p) { return task_util_est(p); } + +static inline unsigned long root_cfs_util_uclamp(struct rq *rq) +{ + return READ_ONCE(rq->cfs.avg.util_avg); +} #endif =20 static inline void util_est_enqueue(struct cfs_rq *cfs_rq, @@ -5032,134 +5046,29 @@ static inline unsigned long get_actual_cpu_capacit= y(int cpu) } =20 static inline int util_fits_cpu(unsigned long util, - unsigned long uclamp_min, - unsigned long uclamp_max, + unsigned long util_uclamp, int cpu) { unsigned long capacity =3D capacity_of(cpu); - unsigned long capacity_orig; - bool fits, uclamp_max_fits; - - /* - * Check if the real util fits without any uclamp boost/cap applied. - */ - fits =3D fits_capacity(util, capacity); - - if (!uclamp_is_used()) - return fits; =20 - /* - * We must use arch_scale_cpu_capacity() for comparing against uclamp_min= and - * uclamp_max. We only care about capacity pressure (by using - * capacity_of()) for comparing against the real util. - * - * If a task is boosted to 1024 for example, we don't want a tiny - * pressure to skew the check whether it fits a CPU or not. 
- * - * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), = it - * should fit a little cpu even if there's some pressure. - * - * Only exception is for HW or cpufreq pressure since it has a direct imp= act - * on available OPP of the system. - * - * We honour it for uclamp_min only as a drop in performance level - * could result in not getting the requested minimum performance level. - * - * For uclamp_max, we can tolerate a drop in performance level as the - * goal is to cap the task. So it's okay if it's getting less. - */ - capacity_orig =3D arch_scale_cpu_capacity(cpu); - - /* - * We want to force a task to fit a cpu as implied by uclamp_max. - * But we do have some corner cases to cater for.. - * - * - * C=3Dz - * | ___ - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | - * | | | | | | | (util somewhere in this region) - * | | | | | | | - * | | | | | | | - * +---------------------------------------- - * CPU0 CPU1 CPU2 - * - * In the above example if a task is capped to a specific performance - * point, y, then when: - * - * * util =3D 80% of x then it does not fit on CPU0 and should migrate - * to CPU1 - * * util =3D 80% of y then it is forced to fit on CPU1 to honour - * uclamp_max request. - * - * which is what we're enforcing here. A task always fits if - * uclamp_max <=3D capacity_orig. But when uclamp_max > capacity_orig, - * the normal upmigration rules should withhold still. - * - * Only exception is when we are on max capacity, then we need to be - * careful not to block overutilized state. This is so because: - * - * 1. There's no concept of capping at max_capacity! We can't go - * beyond this performance level anyway. - * 2. The system is being saturated when we're operating near - * max capacity, it doesn't make sense to block overutilized. 
- */ - uclamp_max_fits =3D (capacity_orig =3D=3D SCHED_CAPACITY_SCALE) && (uclam= p_max =3D=3D SCHED_CAPACITY_SCALE); - uclamp_max_fits =3D !uclamp_max_fits && (uclamp_max <=3D capacity_orig); - fits =3D fits || uclamp_max_fits; + if (fits_capacity(util_uclamp, capacity)) + return 1; =20 - /* - * - * C=3Dz - * | ___ (region a, capped, util >=3D= uclamp_max) - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | (region b, uclamp_min <=3D u= til <=3D uclamp_max) - * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min - * | | | | | | | - * | | | | | | | (region c, boosted, util < u= clamp_min) - * +---------------------------------------- - * CPU0 CPU1 CPU2 - * - * a) If util > uclamp_max, then we're capped, we don't care about - * actual fitness value here. We only care if uclamp_max fits - * capacity without taking margin/pressure into account. - * See comment above. - * - * b) If uclamp_min <=3D util <=3D uclamp_max, then the normal - * fits_capacity() rules apply. Except we need to ensure that we - * enforce we remain within uclamp_max, see comment above. - * - * c) If util < uclamp_min, then we are boosted. Same as (b) but we - * need to take into account the boosted value fits the CPU without - * taking margin/pressure into account. - * - * Cases (a) and (b) are handled in the 'fits' variable already. We - * just need to consider an extra check for case (c) after ensuring we - * handle the case uclamp_min > uclamp_max. 
- */ - uclamp_min =3D min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && - (uclamp_min > get_actual_cpu_capacity(cpu))) + if (fits_capacity(util, capacity)) return -1; =20 - return fits; + return 0; } =20 static inline int task_fits_cpu(struct task_struct *p, int cpu) { - unsigned long uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); - unsigned long uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); unsigned long util =3D task_util_est(p); + unsigned long util_uclamp =3D task_util_est_uclamp(p); /* * Return true only if the cpu fully fits the task requirements, which * include the utilization but also the performance hints. */ - return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); + return (util_fits_cpu(util, util_uclamp, cpu) > 0); } =20 static inline void update_misfit_status(struct task_struct *p, struct rq *= rq) @@ -6737,18 +6646,19 @@ static inline void hrtick_update(struct rq *rq) #endif =20 #ifdef CONFIG_SMP +static unsigned long cpu_util_cfs_uclamp(int cpu); + static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min, rq_util_max; + unsigned long util, util_uclamp; =20 if (!sched_energy_enabled()) return false; =20 - rq_util_min =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - rq_util_max =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + util =3D cpu_util_cfs(cpu); + util_uclamp =3D uclamp_is_used() ? cpu_util_cfs_uclamp(cpu) : util; =20 - /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(util, util_uclamp, cpu); } =20 /* @@ -7534,9 +7444,13 @@ static int select_idle_cpu(struct task_struct *p, st= ruct sched_domain *sd, bool * maximize capacity. 
*/ static int -select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int t= arget) +select_idle_capacity(struct task_struct *p, + unsigned long task_util, + unsigned long task_util_uclamp, + struct sched_domain *sd, + int target) { - unsigned long task_util, util_min, util_max, best_cap =3D 0; + unsigned long best_cap =3D 0; int fits, best_fits =3D 0; int cpu, best_cpu =3D -1; struct cpumask *cpus; @@ -7544,17 +7458,13 @@ select_idle_capacity(struct task_struct *p, struct = sched_domain *sd, int target) cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); =20 - task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap =3D capacity_of(cpu); =20 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; =20 - fits =3D util_fits_cpu(task_util, util_min, util_max, cpu); + fits =3D util_fits_cpu(task_util, task_util_uclamp, cpu); =20 /* This CPU fits with all requirements */ if (fits > 0) @@ -7582,8 +7492,7 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) } =20 static inline bool asym_fits_cpu(unsigned long util, - unsigned long util_min, - unsigned long util_max, + unsigned long util_uclamp, int cpu) { if (sched_asym_cpucap_active()) @@ -7591,7 +7500,7 @@ static inline bool asym_fits_cpu(unsigned long util, * Return true only if the cpu fully fits the task requirements * which include the utilization and the performance hints. 
*/ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return (util_fits_cpu(util, util_uclamp, cpu) > 0); =20 return true; } @@ -7603,18 +7512,18 @@ static int select_idle_sibling(struct task_struct *= p, int prev, int target) { bool has_idle_core =3D false; struct sched_domain *sd; - unsigned long task_util, util_min, util_max; + unsigned long task_util, task_util_uclamp; + bool asym =3D sched_asym_cpucap_active(); int i, recent_used_cpu, prev_aff =3D -1; =20 /* * On asymmetric system, update task utilization because we will check * that the task fits with CPU's capacity. */ - if (sched_asym_cpucap_active()) { + if (asym) { sync_entity_load_avg(&p->se); task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); + task_util_uclamp =3D task_util_est_uclamp(p); } =20 /* @@ -7623,7 +7532,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) lockdep_assert_irqs_disabled(); =20 if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_cpu(task_util, util_min, util_max, target)) + asym_fits_cpu(task_util, task_util_uclamp, target)) return target; =20 /* @@ -7631,7 +7540,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) */ if (prev !=3D target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util, task_util_uclamp, prev)) { =20 if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(prev, target)) @@ -7652,7 +7561,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) in_task() && prev =3D=3D smp_processor_id() && this_rq()->nr_running <=3D 1 && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util, task_util_uclamp, prev)) { return prev; } =20 @@ -7664,7 +7573,7 @@ static int select_idle_sibling(struct task_struct *p,= int 
prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cp= u)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && - asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + asym_fits_cpu(task_util, task_util_uclamp, recent_used_cpu)) { =20 if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(recent_used_cpu, target)) @@ -7678,7 +7587,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc. */ - if (sched_asym_cpucap_active()) { + if (asym) { sd =3D rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive @@ -7689,7 +7598,8 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) * capacity path. */ if (sd) { - i =3D select_idle_capacity(p, sd, target); + i =3D select_idle_capacity(p, task_util, task_util_uclamp, + sd, target); return ((unsigned)i < nr_cpumask_bits) ? i : target; } } @@ -7832,16 +7742,67 @@ cpu_util(int cpu, struct task_struct *p, int dst_cp= u, int boost) return min(util, arch_scale_cpu_capacity(cpu)); } =20 +/* This is basically a copy-paste from cpu_util(), but instead using uclam= p values. */ +static unsigned long +cpu_util_uclamp(int cpu, struct task_struct *p, int dst_cpu, int boost) +{ + struct rq *rq =3D cpu_rq(cpu); + struct cfs_rq *cfs_rq =3D &rq->cfs; + unsigned long util =3D root_cfs_util_uclamp(rq); + + if (boost) { + unsigned long runnable =3D READ_ONCE(cfs_rq->avg.runnable_avg); + unsigned long util_raw =3D READ_ONCE(cfs_rq->avg.util_avg); + + util =3D max(util, util_raw ? 
util * runnable / util_raw : 0); + } + + if (p) { + if (task_cpu(p) =3D=3D cpu && !p->se.on_rq) { + util +=3D task_util_bias(p); + if ((long)util < 0) + util =3D 0; + } + if (task_cpu(p) =3D=3D cpu && dst_cpu !=3D cpu) + lsub_positive(&util, task_util_uclamp(p)); + else if (task_cpu(p) !=3D cpu && dst_cpu =3D=3D cpu) + util +=3D task_util_uclamp(p); + } + + if (sched_feat(UTIL_EST)) { + unsigned long util_est =3D READ_ONCE(cfs_rq->avg.util_est_uclamp); + + if (dst_cpu =3D=3D cpu) + util_est +=3D _task_util_est_uclamp(p); + else if (p && unlikely(task_on_rq_queued(p) || current =3D=3D p)) + lsub_positive(&util_est, _task_util_est_uclamp(p)); + + util =3D max(util, util_est); + } + + return min(util, arch_scale_cpu_capacity(cpu)); +} + unsigned long cpu_util_cfs(int cpu) { return cpu_util(cpu, NULL, -1, 0); } =20 -unsigned long cpu_util_cfs_boost(int cpu) +static unsigned long cpu_util_cfs_uclamp(int cpu) +{ + return cpu_util_uclamp(cpu, NULL, -1, 0); +} + +static unsigned long cpu_util_cfs_boost(int cpu) { return cpu_util(cpu, NULL, -1, 1); } =20 +unsigned long cpu_util_cfs_boost_uclamp(int cpu) +{ + return cpu_util_uclamp(cpu, NULL, -1, 1); +} + /* * cpu_util_without: compute cpu utilization without any contributions fro= m *p * @cpu: the CPU which utilization is requested @@ -7952,33 +7913,15 @@ eenv_pd_max_util(struct energy_env *eenv, struct cp= umask *pd_cpus, int cpu; =20 for_each_cpu(cpu, pd_cpus) { - struct task_struct *tsk =3D (cpu =3D=3D dst_cpu) ? p : NULL; - unsigned long util =3D cpu_util(cpu, p, dst_cpu, 1); + unsigned long util =3D cpu_util_uclamp(cpu, p, dst_cpu, 1); unsigned long eff_util, min, max; =20 /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the min - * utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. 
*/ eff_util =3D effective_cpu_util(cpu, util, &min, &max); =20 - /* Task's uclamp can modify min and max value */ - if (tsk && uclamp_is_used()) { - min =3D max(min, uclamp_eff_value(p, UCLAMP_MIN)); - - /* - * If there is no active max uclamp constraint, - * directly use task's one, otherwise keep max. - */ - if (uclamp_rq_is_idle(cpu_rq(cpu))) - max =3D uclamp_eff_value(p, UCLAMP_MAX); - else - max =3D max(max, uclamp_eff_value(p, UCLAMP_MAX)); - } - eff_util =3D sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util =3D max(max_util, eff_util); } @@ -8052,8 +7995,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) { struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; - unsigned long p_util_min =3D uclamp_is_used() ? uclamp_eff_value(p, UCLAM= P_MIN) : 0; - unsigned long p_util_max =3D uclamp_is_used() ? uclamp_eff_value(p, UCLAM= P_MAX) : 1024; struct root_domain *rd =3D this_rq()->rd; int cpu, best_energy_cpu, target =3D -1; int prev_fits =3D -1, best_fits =3D -1; @@ -8081,16 +8022,14 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) target =3D prev_cpu; =20 sync_entity_load_avg(&p->se); - if (!task_util_est(p) && p_util_min =3D=3D 0) + if (!task_util_est_uclamp(p)) goto unlock; =20 eenv_task_busy_time(&eenv, p, prev_cpu); =20 for (; pd; pd =3D pd->next) { - unsigned long util_min =3D p_util_min, util_max =3D p_util_max; - unsigned long cpu_cap, cpu_actual_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util, util_uclamp; unsigned long cur_delta, max_spare_cap =3D 0; - unsigned long rq_util_min, rq_util_max; unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; unsigned long base_energy; @@ -8109,8 +8048,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) eenv.pd_cap =3D 0; =20 for_each_cpu(cpu, cpus) { - struct rq *rq =3D cpu_rq(cpu); - eenv.pd_cap +=3D cpu_actual_cap; =20 
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -8120,36 +8057,18 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) continue; =20 util =3D cpu_util(cpu, p, cpu, 0); + util_uclamp =3D uclamp_is_used() ? + cpu_util_uclamp(cpu, p, cpu, 0) : util; cpu_cap =3D capacity_of(cpu); =20 - /* - * Skip CPUs that cannot satisfy the capacity request. - * IOW, placing the task there would make the CPU - * overutilized. Take uclamp into account to see how - * much capacity we can get out of the CPU; this is - * aligned with sched_cpu_util(). - */ - if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { - /* - * Open code uclamp_rq_util_with() except for - * the clamp() part. I.e.: apply max aggregation - * only. util_fits_cpu() logic requires to - * operate on non clamped util but must use the - * max-aggregated uclamp_{min, max}. - */ - rq_util_min =3D uclamp_rq_get(rq, UCLAMP_MIN); - rq_util_max =3D uclamp_rq_get(rq, UCLAMP_MAX); - - util_min =3D max(rq_util_min, p_util_min); - util_max =3D max(rq_util_max, p_util_max); - } - - fits =3D util_fits_cpu(util, util_min, util_max, cpu); + fits =3D util_fits_cpu(util, util_uclamp, cpu); + if (fits =3D=3D 1) + lsub_positive(&cpu_cap, util_uclamp); + else if (fits =3D=3D -1) + lsub_positive(&cpu_cap, util); if (!fits) continue; =20 - lsub_positive(&cpu_cap, util); - if (cpu =3D=3D prev_cpu) { /* Always use prev_cpu as a candidate. 
*/ prev_spare_cap =3D cpu_cap; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5f93a6b1b563..c02ab8a54d66 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3089,9 +3089,8 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } =20 - extern unsigned long cpu_util_cfs(int cpu); -extern unsigned long cpu_util_cfs_boost(int cpu); +extern unsigned long cpu_util_cfs_boost_uclamp(int cpu); =20 static inline unsigned long cpu_util_rt(struct rq *rq) { @@ -3121,21 +3120,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return rq->uclamp_flags & UCLAMP_FLAG_IDLE; } =20 -/* Is the rq being capped/throttled by uclamp_max? */ -static inline bool uclamp_rq_is_capped(struct rq *rq) -{ - unsigned long rq_util; - unsigned long max_util; - - if (!static_branch_likely(&sched_uclamp_used)) - return false; - - rq_util =3D cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); - max_util =3D READ_ONCE(rq->uclamp[UCLAMP_MAX].value); - - return max_util !=3D SCHED_CAPACITY_SCALE && rq_util >=3D max_util; -} - /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' * by default in the fast path and only gets turned on once userspace perf= orms @@ -3205,8 +3189,6 @@ uclamp_eff_value(struct task_struct *p, enum uclamp_i= d clamp_id) return SCHED_CAPACITY_SCALE; } =20 -static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } - static inline bool uclamp_is_used(void) { return false; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ae1b42775ef9..d6696d06829d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -303,13 +303,7 @@ unsigned long effective_cpu_util(int cpu, unsigned lon= g util_cfs, } =20 if (min) { - /* - * The minimum utilization returns the highest level between: - * - the computed DL bandwidth needed with the IRQ pressure which - * steals time to the deadline task. - * - The minimum performance requirement for CFS and/or RT. 
- */ - *min =3D max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + *min =3D irq + cpu_bw_dl(rq); =20 /* * When an RT task is runnable and uclamp is not used, we must @@ -328,12 +322,8 @@ unsigned long effective_cpu_util(int cpu, unsigned lon= g util_cfs, util =3D util_cfs + cpu_util_rt(rq); util +=3D cpu_util_dl(rq); =20 - /* - * The maximum hint is a soft bandwidth requirement, which can be lower - * than the actual utilization because of uclamp_max requirements. - */ if (max) - *max =3D min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + *max =3D scale; =20 if (util >=3D scale) return scale; --=20 2.34.1 From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 55B1C139CE3 for ; Mon, 24 Jun 2024 10:24:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224662; cv=none; b=XLaBswGreTmCmX2NhaSlx0+izVq+2BUeBWk6slhjBrh8g6hp6TVRlVQzbamQSZkSKfqh0x5Rdu9VrjvM+vOPNBm2yR+LB6QSJfIEAz5sxXc40juw1AN/J3QaqpHOsFXStztEaZOLjqKdWipMG6xrJPg/gvJ4SrF61lh9LctSyl8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224662; c=relaxed/simple; bh=u+gzmeDIbNB6D/4H6z4GI6G3EGk4rQYkCG4snZT5814=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=aaONQAn7DKzxQZp5nev9gwOyY1XVA29TeLpLI57ZrtYsMlkQI7MtlVqiihJgHeIKsE7NXXzYbbUgo1yox705cTLsjBwPLDy6+IZhWGCG1eN0gjX5PnF6cxRFQikihgCd3lKyrvl4P0WlZcn6eW+eduOwKwqzkq+b713NoOBq5gY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com 
(unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 741EADA7; Mon, 24 Jun 2024 03:24:44 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 7D4C43F766; Mon, 24 Jun 2024 03:24:17 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , Youssef Esmat , linux-kernel@vger.kernel.org Subject: [PATCH 5/7] sched/uclamp: Remove all uclamp bucket logic Date: Mon, 24 Jun 2024 11:23:54 +0100 Message-Id: <88087fa7ce9ee193425cee6a75bd06f3420a1467.1719223916.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Also rewrite uclamp_update_active() so that the effective uclamp values are updated every time we change task group properties, change system defaults or a request is issued from userspace. This also significantly reduces uclamp overhead because we no longer need to compute effective uclamp values and manipulate buckets every time a task is enqueued or dequeued (in uclamp_rq_{inc/dec}()). TODO: Rewrite documentation to match the new logic. 
Signed-off-by: Hongyan Xia --- include/linux/sched.h | 4 - init/Kconfig | 32 ----- kernel/sched/core.c | 287 +++------------------------------------- kernel/sched/fair.c | 4 - kernel/sched/rt.c | 4 - kernel/sched/sched.h | 95 +------------ kernel/sched/syscalls.c | 2 + 7 files changed, 20 insertions(+), 408 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0160567314ae..6ba0454d1174 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -685,9 +685,6 @@ struct sched_dl_entity { }; =20 #ifdef CONFIG_UCLAMP_TASK -/* Number of utilization clamp buckets (shorter alias) */ -#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT - /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se @@ -713,7 +710,6 @@ struct sched_dl_entity { */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); - unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; diff --git a/init/Kconfig b/init/Kconfig index 72404c1f2157..b2dc6a3baa1f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -817,38 +817,6 @@ config UCLAMP_TASK enforce or grant any specific bandwidth for tasks. =20 If in doubt, say N. - -config UCLAMP_BUCKETS_COUNT - int "Number of supported utilization clamp buckets" - range 5 20 - default 5 - depends on UCLAMP_TASK - help - Defines the number of clamp buckets to use. The range of each bucket - will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the - number of clamp buckets the finer their granularity and the higher - the precision of clamping aggregation and tracking at run-time. - - For example, with the minimum configuration value we will have 5 - clamp buckets tracking 20% utilization each. A 25% boosted tasks will - be refcounted in the [20..39]% bucket and will set the bucket clamp - effective value to 25%. 
- If a second 30% boosted task should be co-scheduled on the same CPU, - that task will be refcounted in the same bucket of the first task and - it will boost the bucket clamp effective value to 30%. - The clamp effective value of a bucket is reset to its nominal value - (20% in the example above) when there are no more tasks refcounted in - that bucket. - - An additional boost/capping margin can be added to some tasks. In the - example above the 25% task will be boosted to 30% until it exits the - CPU. If that should be considered not acceptable on certain systems, - it's always possible to reduce the margin by increasing the number of - clamp buckets to trade off used memory for run-time tracking - precision. - - If in doubt, use the default value. - endmenu =20 # diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0935f9d4bb7b..767894fc1562 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1410,54 +1410,6 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; */ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); =20 -static inline unsigned int -uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, - unsigned int clamp_value) -{ - /* - * Avoid blocked utilization pushing up the frequency when we go - * idle (which drops the max-clamp) by retaining the last known - * max-clamp. 
- */ - if (clamp_id =3D=3D UCLAMP_MAX) { - rq->uclamp_flags |=3D UCLAMP_FLAG_IDLE; - return clamp_value; - } - - return uclamp_none(UCLAMP_MIN); -} - -static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_i= d, - unsigned int clamp_value) -{ - /* Reset max-clamp retention only on idle exit */ - if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) - return; - - uclamp_rq_set(rq, clamp_id, clamp_value); -} - -static inline -unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, - unsigned int clamp_value) -{ - struct uclamp_bucket *bucket =3D rq->uclamp[clamp_id].bucket; - int bucket_id =3D UCLAMP_BUCKETS - 1; - - /* - * Since both min and max clamps are max aggregated, find the - * top most bucket with tasks in. - */ - for ( ; bucket_id >=3D 0; bucket_id--) { - if (!bucket[bucket_id].tasks) - continue; - return bucket[bucket_id].value; - } - - /* No tasks -- default clamp values */ - return uclamp_idle_value(rq, clamp_id, clamp_value); -} - static void __uclamp_update_util_min_rt_default(struct task_struct *p) { unsigned int default_util_min; @@ -1513,8 +1465,7 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp= _id clamp_id) } =20 /* - * The effective clamp bucket index of a task depends on, by increasing - * priority: + * The effective uclamp value of a task depends on, by increasing priority: * - the task specific clamp value, when explicitly requested from userspa= ce * - the task group effective clamp value, for tasks not either in the root * group or in an autogroup @@ -1535,196 +1486,23 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_= id clamp_id) =20 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id) { - struct uclamp_se uc_eff; - - /* Task currently refcounted: use back-annotated (effective) value */ - if (p->uclamp[clamp_id].active) - return (unsigned long)p->uclamp[clamp_id].value; - - uc_eff =3D uclamp_eff_get(p, clamp_id); - - return (unsigned long)uc_eff.value; -} - -/* - * When a 
task is enqueued on a rq, the clamp bucket currently defined by = the - * task's uclamp::bucket_id is refcounted on that rq. This also immediately - * updates the rq's clamp value if required. - * - * Tasks can have a task-specific value requested from user-space, track - * within each bucket the maximum value for tasks refcounted in it. - * This "local max aggregation" allows to track the exact "requested" value - * for each bucket when all its RUNNABLE tasks require the same clamp. - */ -static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - struct uclamp_rq *uc_rq =3D &rq->uclamp[clamp_id]; - struct uclamp_se *uc_se =3D &p->uclamp[clamp_id]; - struct uclamp_bucket *bucket; - - lockdep_assert_rq_held(rq); - - /* Update task effective clamp */ - p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); - - bucket =3D &uc_rq->bucket[uc_se->bucket_id]; - bucket->tasks++; - uc_se->active =3D true; - - uclamp_idle_reset(rq, clamp_id, uc_se->value); - - /* - * Local max aggregation: rq buckets always track the max - * "requested" clamp value of its RUNNABLE tasks. - */ - if (bucket->tasks =3D=3D 1 || uc_se->value > bucket->value) - bucket->value =3D uc_se->value; - - if (uc_se->value > uclamp_rq_get(rq, clamp_id)) - uclamp_rq_set(rq, clamp_id, uc_se->value); -} - -/* - * When a task is dequeued from a rq, the clamp bucket refcounted by the t= ask - * is released. If this is the last task reference counting the rq's max - * active clamp value, then the rq's clamp value is updated. - * - * Both refcounted tasks and rq's cached clamp values are expected to be - * always valid. If it's detected they are not, as defensive programming, - * enforce the expected state and warn. 
- */ -static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - struct uclamp_rq *uc_rq =3D &rq->uclamp[clamp_id]; - struct uclamp_se *uc_se =3D &p->uclamp[clamp_id]; - struct uclamp_bucket *bucket; - unsigned int bkt_clamp; - unsigned int rq_clamp; - - lockdep_assert_rq_held(rq); - - /* - * If sched_uclamp_used was enabled after task @p was enqueued, - * we could end up with unbalanced call to uclamp_rq_dec_id(). - * - * In this case the uc_se->active flag should be false since no uclamp - * accounting was performed at enqueue time and we can just return - * here. - * - * Need to be careful of the following enqueue/dequeue ordering - * problem too - * - * enqueue(taskA) - * // sched_uclamp_used gets enabled - * enqueue(taskB) - * dequeue(taskA) - * // Must not decrement bucket->tasks here - * dequeue(taskB) - * - * where we could end up with stale data in uc_se and - * bucket[uc_se->bucket_id]. - * - * The following check here eliminates the possibility of such race. - */ - if (unlikely(!uc_se->active)) - return; - - bucket =3D &uc_rq->bucket[uc_se->bucket_id]; - - SCHED_WARN_ON(!bucket->tasks); - if (likely(bucket->tasks)) - bucket->tasks--; - - uc_se->active =3D false; - - /* - * Keep "local max aggregation" simple and accept to (possibly) - * overboost some RUNNABLE tasks in the same bucket. - * The rq clamp bucket value is reset to its base value whenever - * there are no more RUNNABLE tasks refcounting it. - */ - if (likely(bucket->tasks)) - return; - - rq_clamp =3D uclamp_rq_get(rq, clamp_id); - /* - * Defensive programming: this should never happen. If it happens, - * e.g. due to future modification, warn and fix up the expected value. 
- */ - SCHED_WARN_ON(bucket->value > rq_clamp); - if (bucket->value >=3D rq_clamp) { - bkt_clamp =3D uclamp_rq_max_value(rq, clamp_id, uc_se->value); - uclamp_rq_set(rq, clamp_id, bkt_clamp); - } -} - -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) -{ - enum uclamp_id clamp_id; - - /* - * Avoid any overhead until uclamp is actually used by the userspace. - * - * The condition is constructed such that a NOP is generated when - * sched_uclamp_used is disabled. - */ - if (!static_branch_unlikely(&sched_uclamp_used)) - return; + if (!uclamp_is_used() || !p->uclamp[clamp_id].active) + return uclamp_none(clamp_id); =20 - if (unlikely(!p->sched_class->uclamp_enabled)) - return; - - for_each_clamp_id(clamp_id) - uclamp_rq_inc_id(rq, p, clamp_id); - - /* Reset clamp idle holding when there is one RUNNABLE task */ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) - rq->uclamp_flags &=3D ~UCLAMP_FLAG_IDLE; + return p->uclamp[clamp_id].value; } =20 -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +void uclamp_update_active_nolock(struct task_struct *p) { enum uclamp_id clamp_id; =20 - /* - * Avoid any overhead until uclamp is actually used by the userspace. - * - * The condition is constructed such that a NOP is generated when - * sched_uclamp_used is disabled. - */ - if (!static_branch_unlikely(&sched_uclamp_used)) - return; - - if (unlikely(!p->sched_class->uclamp_enabled)) - return; - for_each_clamp_id(clamp_id) - uclamp_rq_dec_id(rq, p, clamp_id); -} - -static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - if (!p->uclamp[clamp_id].active) - return; - - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - - /* - * Make sure to clear the idle flag if we've transiently reached 0 - * active tasks on rq. 
- */ - if (clamp_id =3D=3D UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) - rq->uclamp_flags &=3D ~UCLAMP_FLAG_IDLE; + p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); } =20 static inline void uclamp_update_active(struct task_struct *p) { - enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; =20 @@ -1738,14 +1516,7 @@ uclamp_update_active(struct task_struct *p) */ rq =3D task_rq_lock(p, &rf); =20 - /* - * Setting the clamp bucket is serialized by task_rq_lock(). - * If the task is not yet RUNNABLE and its task_struct is not - * affecting a valid clamp bucket, the next time it's enqueued, - * it will already see the updated clamp bucket value. - */ - for_each_clamp_id(clamp_id) - uclamp_rq_reinc_id(rq, p, clamp_id); + uclamp_update_active_nolock(p); =20 task_rq_unlock(rq, p, &rf); } @@ -1877,20 +1648,14 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; =20 - /* - * We don't need to hold task_rq_lock() when updating p->uclamp_* here - * as the task is still at its early fork stages. 
- */ - for_each_clamp_id(clamp_id) - p->uclamp[clamp_id].active =3D false; - - if (likely(!p->sched_reset_on_fork)) - return; - - for_each_clamp_id(clamp_id) { - uclamp_se_set(&p->uclamp_req[clamp_id], - uclamp_none(clamp_id), false); + if (unlikely(p->sched_reset_on_fork)) { + for_each_clamp_id(clamp_id) { + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } } + + uclamp_update_active(p); } =20 static void uclamp_post_fork(struct task_struct *p) @@ -1898,28 +1663,10 @@ static void uclamp_post_fork(struct task_struct *p) uclamp_update_util_min_rt_default(p); } =20 -static void __init init_uclamp_rq(struct rq *rq) -{ - enum uclamp_id clamp_id; - struct uclamp_rq *uc_rq =3D rq->uclamp; - - for_each_clamp_id(clamp_id) { - uc_rq[clamp_id] =3D (struct uclamp_rq) { - .value =3D uclamp_none(clamp_id) - }; - } - - rq->uclamp_flags =3D UCLAMP_FLAG_IDLE; -} - static void __init init_uclamp(void) { struct uclamp_se uc_max =3D {}; enum uclamp_id clamp_id; - int cpu; - - for_each_possible_cpu(cpu) - init_uclamp_rq(cpu_rq(cpu)); =20 for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1938,8 +1685,6 @@ static void __init init_uclamp(void) } =20 #else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } @@ -1979,7 +1724,6 @@ void enqueue_task(struct rq *rq, struct task_struct *= p, int flags) psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } =20 - uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); =20 if (sched_core_enabled(rq)) @@ -1999,7 +1743,6 @@ void dequeue_task(struct rq *rq, struct task_struct *= p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } =20 - uclamp_rq_dec(rq, p); 
p->sched_class->dequeue_task(rq, p, flags); } =20 @@ -8749,6 +8492,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); =20 sched_change_group(tsk, group); + uclamp_update_active_nolock(tsk); =20 if (queued) enqueue_task(rq, tsk, queue_flags); @@ -8881,7 +8625,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_= state *css) if (eff[clamp_id] =3D=3D uc_se[clamp_id].value) continue; uc_se[clamp_id].value =3D eff[clamp_id]; - uc_se[clamp_id].bucket_id =3D uclamp_bucket_id(eff[clamp_id]); clamps |=3D (0x1 << clamp_id); } if (!clamps) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a25de0044af8..3bb077df52ae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13217,10 +13217,6 @@ DEFINE_SCHED_CLASS(fair) =3D { #ifdef CONFIG_SCHED_CORE .task_is_throttled =3D task_is_throttled_fair, #endif - -#ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled =3D 1, -#endif }; =20 #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 63e49c8ffc4d..df8bfab0232a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2680,10 +2680,6 @@ DEFINE_SCHED_CLASS(rt) =3D { #ifdef CONFIG_SCHED_CORE .task_is_throttled =3D task_is_throttled_rt, #endif - -#ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled =3D 1, -#endif }; =20 #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c02ab8a54d66..e01e42d2703d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -953,46 +953,6 @@ extern void rto_push_irq_work_func(struct irq_work *wo= rk); #endif /* CONFIG_SMP */ =20 #ifdef CONFIG_UCLAMP_TASK -/* - * struct uclamp_bucket - Utilization clamp bucket - * @value: utilization clamp value for tasks on this clamp bucket - * @tasks: number of RUNNABLE tasks on this clamp bucket - * - * Keep track of how many tasks are RUNNABLE for a given utilization - * clamp value. 
- */ -struct uclamp_bucket { - unsigned long value : bits_per(SCHED_CAPACITY_SCALE); - unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); -}; - -/* - * struct uclamp_rq - rq's utilization clamp - * @value: currently active clamp values for a rq - * @bucket: utilization clamp buckets affecting a rq - * - * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. - * A clamp value is affecting a rq when there is at least one task RUNNABLE - * (or actually running) with that value. - * - * There are up to UCLAMP_CNT possible different clamp values, currently t= here - * are only two: minimum utilization and maximum utilization. - * - * All utilization clamping values are MAX aggregated, since: - * - for util_min: we want to run the CPU at least at the max of the minim= um - * utilization required by its currently RUNNABLE tasks. - * - for util_max: we want to allow the CPU to run up to the max of the - * maximum utilization allowed by its currently RUNNABLE tasks. - * - * Since on each system we expect only a limited number of different - * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track - * the metrics required to compute all the per-rq utilization clamp values. 
- */ -struct uclamp_rq { - unsigned int value; - struct uclamp_bucket bucket[UCLAMP_BUCKETS]; -}; - DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ =20 @@ -1034,10 +994,6 @@ struct rq { u64 nr_switches; =20 #ifdef CONFIG_UCLAMP_TASK - /* Utilization clamp values based on CPU's RUNNABLE tasks */ - struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; - unsigned int uclamp_flags; -#define UCLAMP_FLAG_IDLE 0x01 #endif =20 struct cfs_rq cfs; @@ -2278,11 +2234,6 @@ struct affinity_context { extern s64 update_curr_common(struct rq *rq); =20 struct sched_class { - -#ifdef CONFIG_UCLAMP_TASK - int uclamp_enabled; -#endif - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); @@ -3102,23 +3053,7 @@ static inline unsigned long cpu_util_rt(struct rq *r= q) #ifdef CONFIG_UCLAMP_TASK =20 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id); - -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) -{ - return READ_ONCE(rq->uclamp[clamp_id].value); -} - -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) -{ - WRITE_ONCE(rq->uclamp[clamp_id].value, value); -} - -static inline bool uclamp_rq_is_idle(struct rq *rq) -{ - return rq->uclamp_flags & UCLAMP_FLAG_IDLE; -} +void uclamp_update_active_nolock(struct task_struct *p); =20 /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' @@ -3146,19 +3081,10 @@ static inline unsigned int uclamp_none(enum uclamp_= id clamp_id) return SCHED_CAPACITY_SCALE; } =20 -/* Integer rounded range for each bucket */ -#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP= _BUCKETS) - -static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) -{ - return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCK= ETS - 1); -} - 
static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defin= ed) { uc_se->value =3D value; - uc_se->bucket_id =3D uclamp_bucket_id(value); uc_se->user_defined =3D user_defined; } =20 @@ -3189,26 +3115,11 @@ uclamp_eff_value(struct task_struct *p, enum uclamp= _id clamp_id) return SCHED_CAPACITY_SCALE; } =20 -static inline bool uclamp_is_used(void) -{ - return false; -} - -static inline unsigned long -uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) -{ - if (clamp_id =3D=3D UCLAMP_MIN) - return 0; - - return SCHED_CAPACITY_SCALE; -} - -static inline void -uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) +static inline void uclamp_update_active_nolock(struct task_struct *p) { } =20 -static inline bool uclamp_rq_is_idle(struct rq *rq) +static inline bool uclamp_is_used(void) { return false; } diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index d6696d06829d..b3fadd386d71 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -520,6 +520,8 @@ static void __setscheduler_uclamp(struct task_struct *p, uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } + + uclamp_update_active_nolock(p); } =20 #else /* !CONFIG_UCLAMP_TASK: */ --=20 2.34.1 From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 1234E13A240 for ; Mon, 24 Jun 2024 10:24:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224664; cv=none; b=hXTKNQXtbceIrlfb3EYXgDyf2q49z2cQL2MlozgpUhDdIGTKqmOFoKTjzBgykDMxkG3ABEOuM5bHG+enpwagiTqHgV73gTF5bgZIM0MSjU41DGua7YUKlxlmWpZH2AgQUYFoyThBljAvOO13Muf5xdjvwjn01i7sER9AJoGSW1s= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224664; c=relaxed/simple; bh=GVrexOSXchRwkLEcqMASpbL2gufR/zoy5FB93APaTvA=; 
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=H6sxTy5We6WGklYW3l+UCJez47jIOwkD7kQqXmZBHGXumF11ExGa4oGu8B6TindW76swITXor70CeaKX7xqaE4Aa2U7kg5m4kAd7P+QED35VH5hMt6/c4gABGLg6jAAg2vZ6cSKndX9emPHt/5di1Ql7rDgC3haVnFyiMGsoBBM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 625D5FEC; Mon, 24 Jun 2024 03:24:47 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 83B663F766; Mon, 24 Jun 2024 03:24:20 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , Youssef Esmat , linux-kernel@vger.kernel.org Subject: [PATCH 6/7] sched/uclamp: Simplify uclamp_eff_value() Date: Mon, 24 Jun 2024 11:23:55 +0100 Message-Id: X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The commit sched: Remove all uclamp bucket logic removes uclamp_rq_{inc/dec}() functions, so now p->uclamp contains the correct values all the time after a uclamp_update_active() call, and there's no need to toggle the boolean `active` after an update. 
As a result, this function is fairly simple now and can live as a static inline function. Signed-off-by: Hongyan Xia --- kernel/sched/core.c | 13 ++++--------- kernel/sched/sched.h | 10 +++++++++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 767894fc1562..6dc8aa730eb6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1484,20 +1484,14 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_i= d clamp_id) return uc_req; } =20 -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id) -{ - if (!uclamp_is_used() || !p->uclamp[clamp_id].active) - return uclamp_none(clamp_id); - - return p->uclamp[clamp_id].value; -} - void uclamp_update_active_nolock(struct task_struct *p) { enum uclamp_id clamp_id; =20 - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); + p->uclamp[clamp_id].active =3D 1; + } } =20 static inline void @@ -1652,6 +1646,7 @@ static void uclamp_fork(struct task_struct *p) for_each_clamp_id(clamp_id) { uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false); + p->uclamp[clamp_id].active =3D 0; } } =20 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e01e42d2703d..59e5ea421a4c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3052,7 +3052,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) =20 #ifdef CONFIG_UCLAMP_TASK =20 -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id); void uclamp_update_active_nolock(struct task_struct *p); =20 /* @@ -3081,6 +3080,15 @@ static inline unsigned int uclamp_none(enum uclamp_i= d clamp_id) return SCHED_CAPACITY_SCALE; } =20 +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +{ + if (uclamp_is_used() && p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].value; + + return uclamp_none(clamp_id); +} + static inline void 
uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defin= ed) { --=20 2.34.1 From nobody Wed Dec 17 15:51:19 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 1ABA013AA2C for ; Mon, 24 Jun 2024 10:24:25 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224667; cv=none; b=KJfbEt6RekzdfA7gYcgjEFF9inSNrepPWR0btuQScpe7897Wb9VJS47dClnqhzDOHMMN+I/aSD5wJiDTanSufBaaNvP2hOTLKQhPF3RptxFjshnirwJ9OGiWJrAg/hhwSlvOmSb3W7jnEs2mFZsflEErw/wOnRWvozV05xqUiTI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719224667; c=relaxed/simple; bh=cQws8ortMF+x3AGFx4Uqqj8FnUiGqPelfHlVhi+4xhI=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=BzCcis+qZwpV0szsvG9DoxT8QnZvOG+vfhvgK8ZseLaVrNTjKr5fHKhNAb9sNTWBSRGr/tJIxRYzuveRyQLXmDOhVcBvJMabSAK6W3RUjY2Etq1tenFVF4YDG9mggIkfn19B4lECNpCqlCSmSgAPuRNBJtMWo8BpFS63YWzQjiI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 5188FDA7; Mon, 24 Jun 2024 03:24:50 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 7122E3F766; Mon, 24 Jun 2024 03:24:23 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin 
Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , Youssef Esmat , linux-kernel@vger.kernel.org Subject: [PATCH 7/7] Propagate negative bias Date: Mon, 24 Jun 2024 11:23:56 +0100 Message-Id: <60985d07acd8a2daf4f3adf31ce4bf3be2982306.1719223916.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Negative bias is interesting, because dequeuing such a task will actually increase utilization. Solve by applying PELT decay to negative biases as well. This in fact can be implemented easily with some math tricks. Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 4 ++++ 2 files changed, 44 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bb077df52ae..d09af6abf464 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4878,6 +4878,45 @@ static inline unsigned long root_cfs_util_uclamp(str= uct rq *rq) =20 return max(ret, 0L); } + +/* + * Negative biases are tricky. If we remove them right away then dequeuing= a + * uclamp_max task has the interesting effect that dequeuing results in a = higher + * rq utilization. Solve this by applying PELT decay to the bias itself. + * + * Keeping track of a PELT-decayed negative bias is extra overhead. 
Howeve= r, we + * observe this interesting math property, where y is the decay factor and= p is + * the number of periods elapsed: + * + * util_new =3D util_old * y^p - neg_bias * y^p + * =3D (util_old - neg_bias) * y^p + * + * Therefore, we simply subtract the negative bias from util_avg the momen= t we + * dequeue, then the PELT signal itself is the total of util_avg and the d= ecayed + * negative bias, and we no longer need to track the decayed bias separate= ly. + */ +static void propagate_negative_bias(struct task_struct *p) +{ + if (task_util_bias(p) < 0 && !task_on_rq_migrating(p)) { + unsigned long neg_bias =3D -task_util_bias(p); + struct sched_entity *se =3D &p->se; + struct cfs_rq *cfs_rq; + + p->se.avg.util_avg_bias =3D 0; + + for_each_sched_entity(se) { + u32 divider, neg_sum; + + cfs_rq =3D cfs_rq_of(se); + divider =3D get_pelt_divider(&cfs_rq->avg); + neg_sum =3D neg_bias * divider; + sub_positive(&se->avg.util_avg, neg_bias); + sub_positive(&se->avg.util_sum, neg_sum); + sub_positive(&cfs_rq->avg.util_avg, neg_bias); + sub_positive(&cfs_rq->avg.util_sum, neg_sum); + } + } +} #else static inline long task_util_bias(struct task_struct *p) { @@ -6869,6 +6908,7 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); util_bias_dequeue(rq, p); + propagate_negative_bias(p); =20 /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 59e5ea421a4c..9d14ef9c717e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3140,6 +3140,10 @@ static inline void util_bias_dequeue(struct rq *rq, = struct task_struct *p) { } =20 +static inline void propagate_negative_bias(struct task_struct *p) +{ +} + #endif /* !CONFIG_UCLAMP_TASK */ =20 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ --=20 2.34.1