From nobody Sat Feb 7 20:51:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0957AE8FDDE for ; Wed, 4 Oct 2023 09:05:30 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S241883AbjJDJFa (ORCPT ); Wed, 4 Oct 2023 05:05:30 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56404 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241846AbjJDJF2 (ORCPT ); Wed, 4 Oct 2023 05:05:28 -0400 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 8342898 for ; Wed, 4 Oct 2023 02:05:24 -0700 (PDT) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id DFB4D1480; Wed, 4 Oct 2023 02:06:02 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id DB7803F59C; Wed, 4 Oct 2023 02:05:22 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, Hongyan Xia Subject: [RFC PATCH 1/6] sched/uclamp: Track uclamped util_avg in sched_avg Date: Wed, 4 Oct 2023 10:04:49 +0100 Message-Id: <5564fc23d5e6425d069c36b4cef48edbe77fe64d.1696345700.git.Hongyan.Xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Hongyan Xia Track a uclamped version of util_avg in sched_avg, which clamps util_avg within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be the sum of all util_avg_uclamp of entities on this cfs_rq. So, each time the util_avg_uclamp of an entity gets updated, we also track the delta and update the cfs_rq. We can't put the update of cfs_rq util_avg_uclamp separately in propagate_entity_load_avg(), because util_avg_uclamp of se and cfs_rq are not tracked separately, unlike util_avg. As a result, util_avg_uclamp of the se and the cfs_rq the se is on must be updated at the same time. Signed-off-by: Hongyan Xia --- include/linux/sched.h | 9 ++++++++- kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/sched/pelt.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 177b3f3676ef..825d7b86b006 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -499,7 +499,14 @@ struct sched_avg { u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; - unsigned long util_avg; + unsigned int util_avg; +#ifdef CONFIG_UCLAMP_TASK + /* + * XXX: util_avg shrunk to accommodate util_avg_uclamp. + * What are the consequences? 
+ */ + unsigned int util_avg_uclamp; +#endif struct util_est util_est; } ____cacheline_aligned; =20 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b7445cd5af9..33e5a6e751c0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1077,6 +1077,9 @@ void post_init_entity_util_avg(struct task_struct *p) } =20 sa->runnable_avg =3D sa->util_avg; +#ifdef CONFIG_UCLAMP_TASK + sa->util_avg_uclamp =3D sa->util_avg; +#endif } =20 #else /* !CONFIG_SMP */ @@ -5068,6 +5071,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_en= tity *se, int flags) update_stats_enqueue_fair(cfs_rq, se, flags); if (!curr) __enqueue_entity(cfs_rq, se); + enqueue_util_avg_uclamp(cfs_rq, se); se->on_rq =3D 1; =20 if (cfs_rq->nr_running =3D=3D 1) { @@ -5138,6 +5142,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_en= tity *se, int flags) update_entity_lag(cfs_rq, se); if (se !=3D cfs_rq->curr) __dequeue_entity(cfs_rq, se); + dequeue_util_avg_uclamp(cfs_rq, se); se->on_rq =3D 0; account_entity_dequeue(cfs_rq, se); =20 @@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu) } #endif =20 +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity = *se); + +static void update_se_chain(struct task_struct *p) +{ +#ifdef CONFIG_UCLAMP_TASK + struct sched_entity *se =3D &p->se; + struct rq *rq =3D task_rq(p); + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + + ___update_util_avg_uclamp(&cfs_rq->avg, se); + } +#endif +} /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -6511,6 +6531,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct = *p, int flags) goto enqueue_throttle; } =20 + /* + * Re-evaluate the se hierarchy now that on_rq is true. This is + * important to enforce uclamp the moment a task with uclamp is + * enqueued, rather than waiting a timer tick for uclamp to kick in. + * + * XXX: This duplicates some of the work already done in the above for + * loops. + */ + update_se_chain(p); + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); =20 @@ -6612,6 +6642,15 @@ static void dequeue_task_fair(struct rq *rq, struct = task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); + +#ifdef CONFIG_UCLAMP_TASK + if (rq->cfs.h_nr_running =3D=3D 0) { + WARN_ONCE(rq->cfs.avg.util_avg_uclamp, + "0 tasks on CFS of CPU %d, but util_avg_uclamp is %u\n", + rq->cpu, rq->cfs.avg.util_avg_uclamp); + WRITE_ONCE(rq->cfs.avg.util_avg_uclamp, 0); + } +#endif } =20 #ifdef CONFIG_SMP diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 0f310768260c..c656e4dcb1d1 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -266,6 +266,48 @@ ___update_load_avg(struct sched_avg *sa, unsigned long= load) WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } =20 +#ifdef CONFIG_UCLAMP_TASK +/* avg must belong to the queue this se is on. 
*/ +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity = *se) +{ + unsigned int util; + int delta; + + if (entity_is_task(se)) { + unsigned int uclamp_min, uclamp_max; + + if (!se->on_rq) + return; + + util =3D READ_ONCE(se->avg.util_avg); + uclamp_min =3D uclamp_eff_value(task_of(se), UCLAMP_MIN); + uclamp_max =3D uclamp_eff_value(task_of(se), UCLAMP_MAX); + util =3D clamp(util, uclamp_min, uclamp_max); + } else { + util =3D READ_ONCE(group_cfs_rq(se)->avg.util_avg_uclamp); + + if (!se->on_rq) { + WRITE_ONCE(se->avg.util_avg_uclamp, util); + return; + } + } + + delta =3D util - READ_ONCE(se->avg.util_avg_uclamp); + if (delta =3D=3D 0) + return; + + WRITE_ONCE(se->avg.util_avg_uclamp, util); + util =3D READ_ONCE(avg->util_avg_uclamp); + util +=3D delta; + WRITE_ONCE(avg->util_avg_uclamp, util); +} +#else /* !CONFIG_UCLAMP_TASK */ +static void +___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se) +{ +} +#endif + /* * sched_entity: * @@ -309,6 +351,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq= , struct sched_entity *se cfs_rq->curr =3D=3D se)) { =20 ___update_load_avg(&se->avg, se_weight(se)); + ___update_util_avg_uclamp(&cfs_rq->avg, se); cfs_se_util_change(&se->avg); trace_pelt_se_tp(se); return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3a01b7a2bf66..2eefcdb0c3b0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3133,6 +3133,33 @@ static inline bool uclamp_is_used(void) { return static_branch_likely(&sched_uclamp_used); } + +static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + unsigned int cfs_val =3D READ_ONCE(cfs_rq->avg.util_avg_uclamp); + unsigned int se_val =3D READ_ONCE(se->avg.util_avg_uclamp); + + WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, cfs_val + se_val); +} + +static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + unsigned int cfs_val =3D READ_ONCE(cfs_rq->avg.util_avg_uclamp); + unsigned int se_val =3D READ_ONCE(se->avg.util_avg_uclamp), new_val; + + if (cfs_val > se_val) + new_val =3D cfs_val - se_val; + else { + WARN_ONCE(cfs_val < se_val, + "CPU %d. 
cfs_rq %p, cfs_val %u is even less than se_val %u before subtr= action\n", + rq_of(cfs_rq)->cpu, cfs_rq, cfs_val, se_val); + new_val =3D 0; + } + + WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, new_val); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) @@ -3175,6 +3202,16 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) { return false; } + +static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ +} + +static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ +} #endif /* CONFIG_UCLAMP_TASK */ =20 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ --=20 2.34.1 From nobody Sat Feb 7 20:51:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 8AA4BE8FDDB for ; Wed, 4 Oct 2023 09:05:40 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S241913AbjJDJFl (ORCPT ); Wed, 4 Oct 2023 05:05:41 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56426 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241884AbjJDJFb (ORCPT ); Wed, 4 Oct 2023 05:05:31 -0400 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id B6CEBA6 for ; Wed, 4 Oct 2023 02:05:27 -0700 (PDT) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 1ADBE150C; Wed, 4 Oct 2023 02:06:06 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 14E823F59C; Wed, 4 Oct 2023 02:05:25 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, Hongyan Xia Subject: [RFC PATCH 2/6] sched/uclamp: Simulate PELT decay in util_avg_uclamp Date: Wed, 4 Oct 2023 10:04:50 +0100 Message-Id: X-Mailer: git-send-email 2.34.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Hongyan Xia Because util_avg_uclamp is not directly managed by PELT, it lacks the nice property of slowly decaying to a lower value, resulting in performance degredation due to premature frequency drops. Add functions to decay root cfs utilization and tasks that are not on the rq. This way, we get the benefits of PELT while still maintaining uclamp. The rules are simple: 1. When task is se->on_rq, enforce its util_avg_uclamp within uclamp range. 2. When task is !se->on_rq, PELT decay its util_avg_uclamp. 3. When the root CFS util drops, PELT decay to the target frequency instead of immediately dropping to a lower target frequency. TODO: Can we somehow integrate this uclamp sum aggregation directly into util_avg, so that we don't need to introduce a new util_avg_uclamp signal and don't need to simulate PELT decay? 
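As an illustration of rule 3, here is a minimal user-space sketch (not part of the patch) of the "decay towards a target" arithmetic that ___update... sorry, that ___decay_util_avg_uclamp_towards() below implements with PELT fixed-point math. The 32-period half-life matches PELT; the floating-point form, the helper name and the sample numbers are simplifying assumptions for readability only.

/*
 * Illustrative user-space sketch, not kernel code.  When the target
 * (cfs.avg.util_avg_uclamp) drops, the stale value decays towards it
 * geometrically instead of jumping down immediately.
 */
#include <math.h>
#include <stdio.h>

static unsigned int decay_towards(unsigned int old, unsigned int target,
				  unsigned int periods)
{
	/* y^32 == 0.5, i.e. the PELT half-life of 32 x 1024us periods */
	double y = pow(0.5, 1.0 / 32.0);

	if (old <= target)
		return target;	/* rising values are adopted immediately */

	/* decay only the difference, as the comment in pelt.c explains */
	return target + (unsigned int)((old - target) * pow(y, periods));
}

int main(void)
{
	/* A=100 stays runnable; B=200 and C=300 have just been dequeued */
	unsigned int target = 100, old = 100 + 200 + 300;
	unsigned int p;

	for (p = 0; p <= 128; p += 32)
		printf("after %3u periods: %u\n", p, decay_towards(old, target, p));

	return 0;
}

Built with a plain cc -lm, this prints 600, 350, 225, 162 and 131, i.e. the stale sum converges back to the runnable task's contribution at the usual PELT rate instead of dropping to it instantly.
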
Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 20 +++++++++ kernel/sched/pelt.c | 103 ++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 2 + 3 files changed, 119 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33e5a6e751c0..420af57d01ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4311,17 +4311,22 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { unsigned long removed_load =3D 0, removed_util =3D 0, removed_runnable = =3D 0; + unsigned int removed_root_util =3D 0; struct sched_avg *sa =3D &cfs_rq->avg; int decayed =3D 0; =20 if (cfs_rq->removed.nr) { unsigned long r; + struct rq *rq =3D rq_of(cfs_rq); u32 divider =3D get_pelt_divider(&cfs_rq->avg); =20 raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); swap(cfs_rq->removed.runnable_avg, removed_runnable); +#ifdef CONFIG_UCLAMP_TASK + swap(rq->root_cfs_util_uclamp_removed, removed_root_util); +#endif cfs_rq->removed.nr =3D 0; raw_spin_unlock(&cfs_rq->removed.lock); =20 @@ -4346,6 +4351,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_r= q) * util_avg * minimum possible divider */ sa->util_sum =3D max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDE= R); +#ifdef CONFIG_UCLAMP_TASK + r =3D removed_root_util; + sub_positive(&rq->root_cfs_util_uclamp, r); + rq->root_cfs_util_uclamp =3D + max(rq->root_cfs_util_uclamp, rq->cfs.avg.util_avg_uclamp); +#endif =20 r =3D removed_runnable; sub_positive(&sa->runnable_avg, r); @@ -4527,6 +4538,7 @@ static void sync_entity_load_avg(struct sched_entity = *se) static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + struct rq *rq =3D rq_of(cfs_rq); unsigned long flags; =20 /* @@ -4542,6 +4554,9 @@ static void remove_entity_load_avg(struct sched_entit= y *se) cfs_rq->removed.util_avg +=3D se->avg.util_avg; cfs_rq->removed.load_avg +=3D se->avg.load_avg; cfs_rq->removed.runnable_avg +=3D se->avg.runnable_avg; +#ifdef CONFIG_UCLAMP_TASK + rq->root_cfs_util_uclamp_removed +=3D se->avg.util_avg_uclamp; +#endif raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } =20 @@ -6462,6 +6477,11 @@ static void update_se_chain(struct task_struct *p) struct cfs_rq *cfs_rq =3D cfs_rq_of(se); =20 ___update_util_avg_uclamp(&cfs_rq->avg, se); + if (&rq->cfs =3D=3D cfs_rq) { + rq->root_cfs_util_uclamp =3D max(rq->root_cfs_util_uclamp, + cfs_rq->avg.util_avg_uclamp); + cfs_rq_util_change(cfs_rq, 0); + } } #endif } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index c656e4dcb1d1..83d5ac7e7ddb 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -267,6 +267,57 @@ ___update_load_avg(struct sched_avg *sa, unsigned long= load) } =20 #ifdef CONFIG_UCLAMP_TASK +static void ___decay_util_avg_uclamp_towards(u64 now, + u64 last_update_time, + u32 period_contrib, + unsigned int *old, + unsigned int new_val) +{ + unsigned int old_val =3D READ_ONCE(*old); + u64 delta, periods; + + if (old_val <=3D new_val) { + WRITE_ONCE(*old, new_val); + return; + } + + if (!last_update_time) + return; + delta =3D now - last_update_time; + if ((s64)delta < 0) + return; + delta >>=3D 10; + if (!delta) + return; + + delta +=3D period_contrib; + periods =3D delta / 1024; + if (periods) { + u64 diff =3D old_val - new_val; + + /* + * Let's assume 3 tasks, A, B and C. A is still on rq but B and + * C have just been dequeued. 
The cfs.avg.util_avg_uclamp has + * become A but root_cfs_util_uclamp just starts to decay and is + * now still A + B + C. + * + * After p periods with y being the decay factor, the new + * root_cfs_util_uclamp should become + * + * A + B * y^p + C * y^p =3D=3D A + (A + B + C - A) * y^p + * =3D=3D cfs.avg.util_avg_uclamp + + * (root_cfs_util_uclamp_at_the_start - cfs.avg.util_avg_uclamp) = * y^p + * =3D=3D cfs.avg.util_avg_uclamp + diff * y^p + * + * So, instead of summing up each individual decayed values, we + * could just decay the diff and not bother with the summation + * at all. This is why we decay the diff here. + */ + diff =3D decay_load(diff, periods); + WRITE_ONCE(*old, new_val + diff); + } +} + /* avg must belong to the queue this se is on. */ void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity = *se) { @@ -336,17 +387,33 @@ ___update_util_avg_uclamp(struct sched_avg *avg, stru= ct sched_entity *se) =20 int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { + u64 last_update_time =3D se->avg.last_update_time; + u32 period_contrib =3D se->avg.period_contrib; + int ret =3D 0; + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se)); trace_pelt_se_tp(se); - return 1; + ret =3D 1; } =20 - return 0; +#ifdef CONFIG_UCLAMP_TASK + if (entity_is_task(se)) + ___decay_util_avg_uclamp_towards(now, + last_update_time, + period_contrib, + &se->avg.util_avg_uclamp, + 0); +#endif + return ret; } =20 int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_enti= ty *se) { + u64 last_update_time =3D se->avg.last_update_time; + u32 period_contrib =3D se->avg.period_contrib; + int ret =3D 0; + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), cfs_rq->curr =3D=3D se)) { =20 @@ -354,14 +421,26 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_= rq, struct sched_entity *se ___update_util_avg_uclamp(&cfs_rq->avg, se); cfs_se_util_change(&se->avg); trace_pelt_se_tp(se); - return 1; + ret =3D 1; } =20 - return 0; +#ifdef CONFIG_UCLAMP_TASK + if (!se->on_rq && entity_is_task(se)) + ___decay_util_avg_uclamp_towards(now, + last_update_time, + period_contrib, + &se->avg.util_avg_uclamp, + 0); +#endif + return ret; } =20 int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { + u64 last_update_time =3D cfs_rq->avg.last_update_time; + u32 period_contrib =3D cfs_rq->avg.period_contrib; + int ret =3D 0; + if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), cfs_rq->h_nr_running, @@ -369,10 +448,22 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *= cfs_rq) =20 ___update_load_avg(&cfs_rq->avg, 1); trace_pelt_cfs_tp(cfs_rq); - return 1; + ret =3D 1; } =20 - return 0; +#ifdef CONFIG_UCLAMP_TASK + if (&rq_of(cfs_rq)->cfs =3D=3D cfs_rq) { + unsigned int target =3D READ_ONCE(cfs_rq->avg.util_avg_uclamp); + + ___decay_util_avg_uclamp_towards(now, + last_update_time, + period_contrib, + &rq_of(cfs_rq)->root_cfs_util_uclamp, + target); + } +#endif + + return ret; } =20 /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2eefcdb0c3b0..98fa5e79f4e9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -992,6 +992,8 @@ struct rq { /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; unsigned int uclamp_flags; + unsigned int root_cfs_util_uclamp; + unsigned int root_cfs_util_uclamp_removed; #define UCLAMP_FLAG_IDLE 0x01 #endif =20 --=20 2.34.1 From nobody Sat Feb 7 20:51:51 2026 
Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 56666E8FDDD for ; Wed, 4 Oct 2023 09:05:42 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S241904AbjJDJFm (ORCPT ); Wed, 4 Oct 2023 05:05:42 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56920 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241916AbjJDJFj (ORCPT ); Wed, 4 Oct 2023 05:05:39 -0400 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 85E24D8; Wed, 4 Oct 2023 02:05:33 -0700 (PDT) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id D96EF1515; Wed, 4 Oct 2023 02:06:11 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 6ADB83F59C; Wed, 4 Oct 2023 02:05:31 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , "Rafael J. Wysocki" , Viresh Kumar Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, Hongyan Xia , linux-pm@vger.kernel.org Subject: [RFC PATCH 3/6] sched/fair: Use CFS util_avg_uclamp for utilization and frequency Date: Wed, 4 Oct 2023 10:04:51 +0100 Message-Id: X-Mailer: git-send-email 2.34.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Hongyan Xia Switch to the new util_avg_uclamp for task and runqueue utilization. Since util_est() calls task_util(), this means util_est is now also a clamped value. Now that we have the sum aggregated CFS util value, we do not need to consult uclamp buckets to know how the frequency should be clamped. We simply look at the aggregated top level root_cfs_util_uclamp to know what frequency to choose. Because we simulate PELT decay in root_cfs_util_uclamp anyway, there's no need in cpufreq_schedutil.c to avoid premature frequency drops. Consequently, there is no need for uclamp_rq_util_with(). This function takes the un-clamped util value and sends it through various clamping filters to get the final value. However, util_avg_uclamp is propagated with clamping in mind already, so it does not need to be clamped again. TODO: There are two major caveats in this patch. 1. At the moment sum aggregation does not consider RT tasks. The avg_rt signal considers all RT tasks on this rq as a single entity, which means the utilization of individual RT tasks is not tracked separately. If we want to use sum aggregation, we might have to track utilization of RT tasks individually. 2. Busy time accounting in compute_energy() now takes the uclamp'ed value. Ideally, it should reflect reality and use the un-clamp'ed values. However, that would require maintaining both the normal and uclamp'ed values for util_est. This needs to be revisited if it causes real problems in practice. 
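To make the frequency argument concrete, below is a standalone sketch (not part of the patch) of why the already-clamped sum is enough: schedutil requests a frequency proportional to utilization with roughly 25% headroom, so feeding it a capped util_avg_uclamp caps the frequency request directly, with no extra uclamp_rq_util_with() filtering step. The helper name and the capacity/frequency numbers are made-up assumptions, and the headroom formula only approximates what schedutil does internally.

/*
 * User-space sketch only.  Real schedutil additionally clamps the result
 * to the policy's min/max frequencies.
 */
#include <stdio.h>

static unsigned long next_freq(unsigned long util, unsigned long max_cap,
			       unsigned long max_freq)
{
	/* ~25% headroom, then scale by util / max_cap */
	return (max_freq + (max_freq >> 2)) * util / max_cap;
}

int main(void)
{
	unsigned long max_cap = 1024, max_freq = 2000000;	/* kHz */

	/* unclamped util_avg near max vs. util_avg_uclamp capped at 512 */
	printf("unclamped: %lu kHz\n", next_freq(1000, max_cap, max_freq));
	printf("capped:    %lu kHz\n", next_freq(512, max_cap, max_freq));

	return 0;
}

With these numbers the capped case asks for 1250000 kHz while the unclamped one would push the policy to its maximum, which is the behaviour sum aggregation wants without consulting per-rq uclamp values at frequency-selection time.
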
Signed-off-by: Hongyan Xia --- kernel/sched/core.c | 10 +-- kernel/sched/cpufreq_schedutil.c | 19 +++--- kernel/sched/fair.c | 38 +++++------ kernel/sched/sched.h | 106 +++++++++---------------------- 4 files changed, 59 insertions(+), 114 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index efe3848978a0..32511ee63f01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7402,10 +7402,12 @@ int sched_core_idle_cpu(int cpu) * The DL bandwidth number otoh is not a measured metric but a value compu= ted * based on the task model parameters and gives the minimal utilization * required to meet deadlines. + * + * The util_cfs parameter has already taken uclamp into account (unless uc= lamp + * support is not compiled in). */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p) + enum cpu_util_type type) { unsigned long dl_util, util, irq, max; struct rq *rq =3D cpu_rq(cpu); @@ -7439,8 +7441,6 @@ unsigned long effective_cpu_util(int cpu, unsigned lo= ng util_cfs, * frequency will be gracefully reduced with the utilization decay. */ util =3D util_cfs + cpu_util_rt(rq); - if (type =3D=3D FREQUENCY_UTIL) - util =3D uclamp_rq_util_with(rq, util, p); =20 dl_util =3D cpu_util_dl(rq); =20 @@ -7493,7 +7493,7 @@ unsigned long effective_cpu_util(int cpu, unsigned lo= ng util_cfs, =20 unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL); } #endif /* CONFIG_SMP */ =20 diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedu= til.c index 4492608b7d7f..6e63952b8063 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -159,8 +159,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) struct rq *rq =3D cpu_rq(sg_cpu->cpu); =20 sg_cpu->bw_dl =3D cpu_bw_dl(rq); - sg_cpu->util =3D effective_cpu_util(sg_cpu->cpu, util, - FREQUENCY_UTIL, NULL); + sg_cpu->util =3D effective_cpu_util(sg_cpu->cpu, util, FREQUENCY_UTIL); } =20 /** @@ -282,7 +281,11 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cp= u, u64 time, * into the same scale so we can compare. */ boost =3D (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost =3D uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); + /* + * TODO: Investigate what should be done here. In sum aggregation there + * is no such thing as uclamp_max on a rq, so how do we cap the boost + * value, or do we want to cap the boost frequency here at all? + */ if (sg_cpu->util < boost) sg_cpu->util =3D boost; } @@ -346,11 +349,8 @@ static void sugov_update_single_freq(struct update_uti= l_data *hook, u64 time, /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f =3D sg_policy->next_freq; =20 /* Restore cached freq as next_freq has changed */ @@ -399,11 +399,8 @@ static void sugov_update_single_perf(struct update_uti= l_data *hook, u64 time, /* * Do not reduce the target performance level if the CPU has not been * idle recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. 
*/ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util =3D prev_util; =20 cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 420af57d01ee..31004aae5f09 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4572,10 +4572,17 @@ static inline unsigned long cfs_rq_load_avg(struct = cfs_rq *cfs_rq) =20 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); =20 +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg_uclamp); +} +#else static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); } +#endif =20 static inline unsigned long _task_util_est(struct task_struct *p) { @@ -4589,22 +4596,6 @@ static inline unsigned long task_util_est(struct tas= k_struct *p) return max(task_util(p), _task_util_est(p)); } =20 -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return clamp(task_util_est(p), uclamp_min, uclamp_max); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return task_util_est(p); -} -#endif - static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -7468,11 +7459,13 @@ static int select_idle_sibling(struct task_struct *= p, int prev, int target) static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { - struct cfs_rq *cfs_rq =3D &cpu_rq(cpu)->cfs; - unsigned long util =3D READ_ONCE(cfs_rq->avg.util_avg); + struct rq *rq =3D cpu_rq(cpu); + struct cfs_rq *cfs_rq =3D &rq->cfs; + unsigned long util =3D root_cfs_util(rq); + bool capped =3D uclamp_rq_is_capped(rq); unsigned long runnable; =20 - if (boost) { + if (boost && !capped) { runnable =3D READ_ONCE(cfs_rq->avg.runnable_avg); util =3D max(util, runnable); } @@ -7629,7 +7622,7 @@ static inline void eenv_pd_busy_time(struct energy_en= v *eenv, for_each_cpu(cpu, pd_cpus) { unsigned long util =3D cpu_util(cpu, p, -1, 0); =20 - busy_time +=3D effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + busy_time +=3D effective_cpu_util(cpu, util, ENERGY_UTIL); } =20 eenv->pd_busy_time =3D min(eenv->pd_cap, busy_time); @@ -7650,7 +7643,6 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpum= ask *pd_cpus, int cpu; =20 for_each_cpu(cpu, pd_cpus) { - struct task_struct *tsk =3D (cpu =3D=3D dst_cpu) ? p : NULL; unsigned long util =3D cpu_util(cpu, p, dst_cpu, 1); unsigned long eff_util; =20 @@ -7661,7 +7653,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpum= ask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - eff_util =3D effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + eff_util =3D effective_cpu_util(cpu, util, FREQUENCY_UTIL); max_util =3D max(max_util, eff_util); } =20 @@ -7758,7 +7750,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) target =3D prev_cpu; =20 sync_entity_load_avg(&p->se); - if (!uclamp_task_util(p, p_util_min, p_util_max)) + if (!task_util_est(p)) goto unlock; =20 eenv_task_busy_time(&eenv, p, prev_cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98fa5e79f4e9..e73aedd9a76b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2997,8 +2997,7 @@ enum cpu_util_type { }; =20 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p); + enum cpu_util_type type); =20 /* * Verify the fitness of task @p to run on @cpu taking into account the @@ -3055,85 +3054,44 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return rq->uclamp_flags & UCLAMP_FLAG_IDLE; } =20 -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp valu= es. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fa= st - * path is disabled, rendering this operation a NOP. +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace perf= orms + * an operation that requires it. * - * Use uclamp_eff_value() if you don't care about uclamp values at rq leve= l. It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. + * Returns true if userspace opted-in to use uclamp and aggregation at rq = level + * hence is active. */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +static inline bool uclamp_is_used(void) { - unsigned long min_util =3D 0; - unsigned long max_util =3D 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util =3D uclamp_eff_value(p, UCLAMP_MIN); - max_util =3D uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (uclamp_rq_is_idle(rq)) - goto out; - } - - min_util =3D max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN)= ); - max_util =3D max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX)= ); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >=3D max_util)) - return min_util; + return static_branch_likely(&sched_uclamp_used); +} =20 - return clamp(util, min_util, max_util); +static inline unsigned long root_cfs_util(struct rq *rq) +{ + return READ_ONCE(rq->root_cfs_util_uclamp); } =20 /* Is the rq being capped/throttled by uclamp_max? 
*/ static inline bool uclamp_rq_is_capped(struct rq *rq) { - unsigned long rq_util; - unsigned long max_util; + unsigned long uclamp_util, real_util; =20 - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; =20 - rq_util =3D cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); - max_util =3D READ_ONCE(rq->uclamp[UCLAMP_MAX].value); - - return max_util !=3D SCHED_CAPACITY_SCALE && rq_util >=3D max_util; -} + /* + * At the moment there's no such thing as uclamp_max for RT tasks, so + * we only see if CFS is capped. + * + * TODO: Implement uclamp sum aggregation for RT. + */ + uclamp_util =3D root_cfs_util(rq); + real_util =3D READ_ONCE(rq->cfs.avg.util_avg); =20 -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace perf= orms - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq = level - * hence is active. - */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); + /* XXX: The 80 margin here isn't backed by science. */ + return uclamp_util < SCHED_CAPACITY_SCALE && + real_util > uclamp_util + 80; } =20 static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq, @@ -3172,13 +3130,6 @@ static inline unsigned long uclamp_eff_value(struct = task_struct *p, return SCHED_CAPACITY_SCALE; } =20 -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } =20 static inline bool uclamp_is_used(void) @@ -3205,6 +3156,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return false; } =20 +static inline unsigned long root_cfs_util(struct rq *rq) +{ + return READ_ONCE(rq->cfs.avg.util_avg); +} + static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq, struct sched_entity *se) { --=20 2.34.1 From nobody Sat Feb 7 20:51:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 513EEE8FDDD for ; Wed, 4 Oct 2023 09:05:55 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S241921AbjJDJF4 (ORCPT ); Wed, 4 Oct 2023 05:05:56 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57060 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241909AbjJDJFp (ORCPT ); Wed, 4 Oct 2023 05:05:45 -0400 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 140AABB for ; Wed, 4 Oct 2023 02:05:37 -0700 (PDT) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 5715D1516; Wed, 4 Oct 2023 02:06:15 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 37CDE3F59C; Wed, 4 Oct 2023 02:05:35 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, Hongyan Xia Subject: [RFC PATCH 4/6] sched/fair: Rewrite util_fits_cpu() Date: Wed, 4 Oct 2023 10:04:52 +0100 Message-Id: X-Mailer: git-send-email 
2.34.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Hongyan Xia Currently, there's no way to distinguish the difference between 1) a CPU that is actually maxed out at its highest frequency, or 2) one that is throttled because of UCLAMP_MAX, since both present util_avg values of 1024. This is problematic because when we try to pick a CPU for a task to run, we would like to give 2) a chance, or at least prefer 2) to 1). Current upstream gives neither a chance because the spare capacity is 0 for either case. There are patches to fix this problem by considering 0 capacities [1], but this might still be inefficient because this ends up treating 1) and 2) equally, and will always pick the same one because we don't change how we iterate through all CPUs. If we end up putting many tasks on 1), then this creates a seriously unbalanced load for the two CPUs. Fix by using util_avg_uclamp for util_fits_cpu(). This way, case 1) will still keep its utilization at 1024 whereas 2) shows spare capacities if the sum of util_avg_uclamp values is still under the CPU capacity. Note that this is roughly what the sum aggregation does in the Android kernel [2] (although we clamp UCLAMP_MIN as well in this patch, which may need some discussions), which shows superior energy savings because there's more chance that a task can get scheduled on 2) instead of finding a big CPU to run on. Under sum aggregation, checking whether a task fits a CPU becomes much simpler. We simply do fits_capacity() and there does not need to be code checking all corner cases for uclamp. This means util_fits_cpu() returns to true and false instead of tri-state, simplifying a significant amount of code. [1]: https://lore.kernel.org/all/20230205224318.2035646-2-qyousef@layalina.= io/ [2]: https://android.googlesource.com/kernel/gs/+/refs/heads/android-gs-rav= iole-5.10-android12-d1/drivers/soc/google/vh/kernel/sched/fair.c#510 Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 253 ++++---------------------------------------- 1 file changed, 23 insertions(+), 230 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31004aae5f09..75a8f7d50e9c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4729,135 +4729,19 @@ static inline void util_est_update(struct cfs_rq *= cfs_rq, trace_sched_util_est_se_tp(&p->se); } =20 -static inline int util_fits_cpu(unsigned long util, - unsigned long uclamp_min, - unsigned long uclamp_max, - int cpu) +/* util must be the uclamp'ed value (i.e. from util_avg_uclamp). */ +static inline int util_fits_cpu(unsigned long util, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity =3D capacity_of(cpu); - bool fits, uclamp_max_fits; =20 - /* - * Check if the real util fits without any uclamp boost/cap applied. - */ - fits =3D fits_capacity(util, capacity); - - if (!uclamp_is_used()) - return fits; - - /* - * We must use capacity_orig_of() for comparing against uclamp_min and - * uclamp_max. We only care about capacity pressure (by using - * capacity_of()) for comparing against the real util. - * - * If a task is boosted to 1024 for example, we don't want a tiny - * pressure to skew the check whether it fits a CPU or not. - * - * Similarly if a task is capped to capacity_orig_of(little_cpu), it - * should fit a little cpu even if there's some pressure. 
- * - * Only exception is for thermal pressure since it has a direct impact - * on available OPP of the system. - * - * We honour it for uclamp_min only as a drop in performance level - * could result in not getting the requested minimum performance level. - * - * For uclamp_max, we can tolerate a drop in performance level as the - * goal is to cap the task. So it's okay if it's getting less. - */ - capacity_orig =3D capacity_orig_of(cpu); - capacity_orig_thermal =3D capacity_orig - arch_scale_thermal_pressure(cpu= ); - - /* - * We want to force a task to fit a cpu as implied by uclamp_max. - * But we do have some corner cases to cater for.. - * - * - * C=3Dz - * | ___ - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | - * | | | | | | | (util somewhere in this region) - * | | | | | | | - * | | | | | | | - * +---------------------------------------- - * cpu0 cpu1 cpu2 - * - * In the above example if a task is capped to a specific performance - * point, y, then when: - * - * * util =3D 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util =3D 80% of y then it is forced to fit on cpu1 to honour - * uclamp_max request. - * - * which is what we're enforcing here. A task always fits if - * uclamp_max <=3D capacity_orig. But when uclamp_max > capacity_orig, - * the normal upmigration rules should withhold still. - * - * Only exception is when we are on max capacity, then we need to be - * careful not to block overutilized state. This is so because: - * - * 1. There's no concept of capping at max_capacity! We can't go - * beyond this performance level anyway. - * 2. The system is being saturated when we're operating near - * max capacity, it doesn't make sense to block overutilized. - */ - uclamp_max_fits =3D (capacity_orig =3D=3D SCHED_CAPACITY_SCALE) && (uclam= p_max =3D=3D SCHED_CAPACITY_SCALE); - uclamp_max_fits =3D !uclamp_max_fits && (uclamp_max <=3D capacity_orig); - fits =3D fits || uclamp_max_fits; - - /* - * - * C=3Dz - * | ___ (region a, capped, util >=3D= uclamp_max) - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | (region b, uclamp_min <=3D u= til <=3D uclamp_max) - * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min - * | | | | | | | - * | | | | | | | (region c, boosted, util < u= clamp_min) - * +---------------------------------------- - * cpu0 cpu1 cpu2 - * - * a) If util > uclamp_max, then we're capped, we don't care about - * actual fitness value here. We only care if uclamp_max fits - * capacity without taking margin/pressure into account. - * See comment above. - * - * b) If uclamp_min <=3D util <=3D uclamp_max, then the normal - * fits_capacity() rules apply. Except we need to ensure that we - * enforce we remain within uclamp_max, see comment above. - * - * c) If util < uclamp_min, then we are boosted. Same as (b) but we - * need to take into account the boosted value fits the CPU without - * taking margin/pressure into account. - * - * Cases (a) and (b) are handled in the 'fits' variable already. We - * just need to consider an extra check for case (c) after ensuring we - * handle the case uclamp_min > uclamp_max. 
- */ - uclamp_min =3D min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) - return -1; - - return fits; + return fits_capacity(util, capacity); } =20 static inline int task_fits_cpu(struct task_struct *p, int cpu) { - unsigned long uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); - unsigned long uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); unsigned long util =3D task_util_est(p); - /* - * Return true only if the cpu fully fits the task requirements, which - * include the utilization but also the performance hints. - */ - return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); + + return util_fits_cpu(util, cpu); } =20 static inline void update_misfit_status(struct task_struct *p, struct rq *= rq) @@ -6424,11 +6308,8 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(cpu_util_cfs(cpu), cpu); } =20 static inline void update_overutilized_status(struct rq *rq) @@ -7248,8 +7129,7 @@ static int select_idle_cpu(struct task_struct *p, str= uct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int t= arget) { - unsigned long task_util, util_min, util_max, best_cap =3D 0; - int fits, best_fits =3D 0; + unsigned long task_util, best_cap =3D 0; int cpu, best_cpu =3D -1; struct cpumask *cpus; =20 @@ -7257,8 +7137,6 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); =20 task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); =20 for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap =3D capacity_of(cpu); @@ -7266,44 +7144,22 @@ select_idle_capacity(struct task_struct *p, struct = sched_domain *sd, int target) if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; =20 - fits =3D util_fits_cpu(task_util, util_min, util_max, cpu); - - /* This CPU fits with all requirements */ - if (fits > 0) + if (util_fits_cpu(task_util, cpu)) return cpu; - /* - * Only the min performance hint (i.e. uclamp_min) doesn't fit. - * Look for the CPU with best capacity. - */ - else if (fits < 0) - cpu_cap =3D capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); =20 - /* - * First, select CPU which fits better (-1 being better than 0). - * Then, select the one with best capacity at same level. - */ - if ((fits < best_fits) || - ((fits =3D=3D best_fits) && (cpu_cap > best_cap))) { + if (cpu_cap > best_cap) { best_cap =3D cpu_cap; best_cpu =3D cpu; - best_fits =3D fits; } } =20 return best_cpu; } =20 -static inline bool asym_fits_cpu(unsigned long util, - unsigned long util_min, - unsigned long util_max, - int cpu) +static inline bool asym_fits_cpu(unsigned long util, int cpu) { if (sched_asym_cpucap_active()) - /* - * Return true only if the cpu fully fits the task requirements - * which include the utilization and the performance hints. 
- */ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return util_fits_cpu(util, cpu); =20 return true; } @@ -7315,7 +7171,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) { bool has_idle_core =3D false; struct sched_domain *sd; - unsigned long task_util, util_min, util_max; + unsigned long task_util; int i, recent_used_cpu; =20 /* @@ -7325,8 +7181,6 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); } =20 /* @@ -7335,7 +7189,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) lockdep_assert_irqs_disabled(); =20 if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_cpu(task_util, util_min, util_max, target)) + asym_fits_cpu(task_util, target)) return target; =20 /* @@ -7343,7 +7197,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) */ if (prev !=3D target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) + asym_fits_cpu(task_util, prev)) return prev; =20 /* @@ -7358,7 +7212,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) in_task() && prev =3D=3D smp_processor_id() && this_rq()->nr_running <=3D 1 && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util, prev)) { return prev; } =20 @@ -7370,7 +7224,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cp= u)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && - asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + asym_fits_cpu(task_util, recent_used_cpu)) { return recent_used_cpu; } =20 @@ -7721,13 +7575,8 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) { struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; - unsigned long p_util_min =3D uclamp_is_used() ? uclamp_eff_value(p, UCLAM= P_MIN) : 0; - unsigned long p_util_max =3D uclamp_is_used() ? 
uclamp_eff_value(p, UCLAM= P_MAX) : 1024; struct root_domain *rd =3D this_rq()->rd; int cpu, best_energy_cpu, target =3D -1; - int prev_fits =3D -1, best_fits =3D -1; - unsigned long best_thermal_cap =3D 0; - unsigned long prev_thermal_cap =3D 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -7756,14 +7605,11 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) eenv_task_busy_time(&eenv, p, prev_cpu); =20 for (; pd; pd =3D pd->next) { - unsigned long util_min =3D p_util_min, util_max =3D p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap =3D 0; - unsigned long rq_util_min, rq_util_max; unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; unsigned long base_energy; - int fits, max_fits =3D -1; =20 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); =20 @@ -7779,8 +7625,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) eenv.pd_cap =3D 0; =20 for_each_cpu(cpu, cpus) { - struct rq *rq =3D cpu_rq(cpu); - eenv.pd_cap +=3D cpu_thermal_cap; =20 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -7791,31 +7635,7 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) =20 util =3D cpu_util(cpu, p, cpu, 0); cpu_cap =3D capacity_of(cpu); - - /* - * Skip CPUs that cannot satisfy the capacity request. - * IOW, placing the task there would make the CPU - * overutilized. Take uclamp into account to see how - * much capacity we can get out of the CPU; this is - * aligned with sched_cpu_util(). - */ - if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { - /* - * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation - * only. util_fits_cpu() logic requires to - * operate on non clamped util but must use the - * max-aggregated uclamp_{min, max}. - */ - rq_util_min =3D uclamp_rq_get(rq, UCLAMP_MIN); - rq_util_max =3D uclamp_rq_get(rq, UCLAMP_MAX); - - util_min =3D max(rq_util_min, p_util_min); - util_max =3D max(rq_util_max, p_util_max); - } - - fits =3D util_fits_cpu(util, util_min, util_max, cpu); - if (!fits) + if (!util_fits_cpu(util, cpu)) continue; =20 lsub_positive(&cpu_cap, util); @@ -7823,9 +7643,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) if (cpu =3D=3D prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap =3D cpu_cap; - prev_fits =3D fits; - } else if ((fits > max_fits) || - ((fits =3D=3D max_fits) && (cpu_cap > max_spare_cap))) { + } else if (cpu_cap > max_spare_cap) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7833,7 +7651,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) */ max_spare_cap =3D cpu_cap; max_spare_cap_cpu =3D cpu; - max_fits =3D fits; } } =20 @@ -7852,50 +7669,26 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -=3D base_energy; - prev_thermal_cap =3D cpu_thermal_cap; best_delta =3D min(best_delta, prev_delta); } =20 /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >=3D 0 && max_spare_cap > prev_spare_cap) { - /* Current best energy cpu fits better */ - if (max_fits < best_fits) - continue; - - /* - * Both don't fit performance hint (i.e. uclamp_min) - * but best energy cpu has better capacity. 
- */ - if ((max_fits < 0) && - (cpu_thermal_cap <=3D best_thermal_cap)) - continue; - cur_delta =3D compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock; cur_delta -=3D base_energy; - - /* - * Both fit for the task but best energy cpu has lower - * energy impact. - */ - if ((max_fits > 0) && (best_fits > 0) && - (cur_delta >=3D best_delta)) - continue; - - best_delta =3D cur_delta; - best_energy_cpu =3D max_spare_cap_cpu; - best_fits =3D max_fits; - best_thermal_cap =3D cpu_thermal_cap; + if (cur_delta < best_delta) { + best_delta =3D cur_delta; + best_energy_cpu =3D max_spare_cap_cpu; + } } } rcu_read_unlock(); =20 - if ((best_fits > prev_fits) || - ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + if (best_delta < prev_delta) target =3D best_energy_cpu; =20 return target; --=20 2.34.1 From nobody Sat Feb 7 20:51:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id EBFE4E8FDDD for ; Wed, 4 Oct 2023 09:06:11 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S241981AbjJDJGM (ORCPT ); Wed, 4 Oct 2023 05:06:12 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57096 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241942AbjJDJFx (ORCPT ); Wed, 4 Oct 2023 05:05:53 -0400 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id BD0C0DC for ; Wed, 4 Oct 2023 02:05:40 -0700 (PDT) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 15C50152B; Wed, 4 Oct 2023 02:06:19 -0700 (PDT) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 0FFA73F59C; Wed, 4 Oct 2023 02:05:38 -0700 (PDT) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, Hongyan Xia Subject: [RFC PATCH 5/6] sched/uclamp: Remove all uclamp bucket logic Date: Wed, 4 Oct 2023 10:04:53 +0100 Message-Id: <48fcea0a9bb2d2212c575032e64ab717756dc0fa.1696345700.git.Hongyan.Xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Hongyan Xia Also rewrite uclamp_update_active() so that the effective uclamp values are updated every time we change task group properties, change system defaults or a request is issued from userspace. TODO: Rewrite documentation to match the new logic. 
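To sketch what "effective value" means once the buckets are gone, here is a minimal user-space example (not part of the patch): the task's request is narrowed by its task group's range and then capped by the system default, which is roughly what uclamp_eff_get() resolves. The helper names and numbers below are illustrative assumptions, not the kernel implementation.

/*
 * Standalone illustration of effective uclamp resolution: task request,
 * restricted by the task group, then by the system-wide default.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024

static unsigned int clamp_val(unsigned int v, unsigned int lo, unsigned int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned int uclamp_eff(unsigned int task_req,
			       unsigned int tg_min, unsigned int tg_max,
			       unsigned int sys_default)
{
	unsigned int v = clamp_val(task_req, tg_min, tg_max);

	/* system default restrictions always apply */
	return v > sys_default ? sys_default : v;
}

int main(void)
{
	/* task asks for a 700 boost, its cgroup allows [0..512] */
	printf("effective UCLAMP_MIN: %u\n",
	       uclamp_eff(700, 0, 512, SCHED_CAPACITY_SCALE));

	return 0;
}

Because no per-bucket refcount has to be kept consistent with the rq, re-resolving this on every task group, system default or userspace request change (which is what the rewritten uclamp_update_active() is for) stays cheap.
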
Signed-off-by: Hongyan Xia --- include/linux/sched.h | 4 - init/Kconfig | 32 ----- kernel/sched/core.c | 295 +++--------------------------------------- kernel/sched/fair.c | 4 - kernel/sched/rt.c | 4 - kernel/sched/sched.h | 85 ------------ 6 files changed, 16 insertions(+), 408 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 825d7b86b006..5b8d5abb2bba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -685,9 +685,6 @@ struct sched_dl_entity { }; =20 #ifdef CONFIG_UCLAMP_TASK -/* Number of utilization clamp buckets (shorter alias) */ -#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT - /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se @@ -713,7 +710,6 @@ struct sched_dl_entity { */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); - unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; diff --git a/init/Kconfig b/init/Kconfig index 5e7d4885d1bf..4ec0023d2149 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -808,38 +808,6 @@ config UCLAMP_TASK enforce or grant any specific bandwidth for tasks. =20 If in doubt, say N. - -config UCLAMP_BUCKETS_COUNT - int "Number of supported utilization clamp buckets" - range 5 20 - default 5 - depends on UCLAMP_TASK - help - Defines the number of clamp buckets to use. The range of each bucket - will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the - number of clamp buckets the finer their granularity and the higher - the precision of clamping aggregation and tracking at run-time. - - For example, with the minimum configuration value we will have 5 - clamp buckets tracking 20% utilization each. A 25% boosted tasks will - be refcounted in the [20..39]% bucket and will set the bucket clamp - effective value to 25%. - If a second 30% boosted task should be co-scheduled on the same CPU, - that task will be refcounted in the same bucket of the first task and - it will boost the bucket clamp effective value to 30%. - The clamp effective value of a bucket is reset to its nominal value - (20% in the example above) when there are no more tasks refcounted in - that bucket. - - An additional boost/capping margin can be added to some tasks. In the - example above the 25% task will be boosted to 30% until it exits the - CPU. If that should be considered not acceptable on certain systems, - it's always possible to reduce the margin by increasing the number of - clamp buckets to trade off used memory for run-time tracking - precision. - - If in doubt, use the default value. 
- endmenu =20 # diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32511ee63f01..c5bf01e7df28 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1387,17 +1387,9 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; */ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); =20 -/* Integer rounded range for each bucket */ -#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP= _BUCKETS) - #define for_each_clamp_id(clamp_id) \ for ((clamp_id) =3D 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) =20 -static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) -{ - return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCK= ETS - 1); -} - static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id =3D=3D UCLAMP_MIN) @@ -1409,58 +1401,9 @@ static inline void uclamp_se_set(struct uclamp_se *u= c_se, unsigned int value, bool user_defined) { uc_se->value =3D value; - uc_se->bucket_id =3D uclamp_bucket_id(value); uc_se->user_defined =3D user_defined; } =20 -static inline unsigned int -uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, - unsigned int clamp_value) -{ - /* - * Avoid blocked utilization pushing up the frequency when we go - * idle (which drops the max-clamp) by retaining the last known - * max-clamp. - */ - if (clamp_id =3D=3D UCLAMP_MAX) { - rq->uclamp_flags |=3D UCLAMP_FLAG_IDLE; - return clamp_value; - } - - return uclamp_none(UCLAMP_MIN); -} - -static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_i= d, - unsigned int clamp_value) -{ - /* Reset max-clamp retention only on idle exit */ - if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) - return; - - uclamp_rq_set(rq, clamp_id, clamp_value); -} - -static inline -unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, - unsigned int clamp_value) -{ - struct uclamp_bucket *bucket =3D rq->uclamp[clamp_id].bucket; - int bucket_id =3D UCLAMP_BUCKETS - 1; - - /* - * Since both min and max clamps are max aggregated, find the - * top most bucket with tasks in. - */ - for ( ; bucket_id >=3D 0; bucket_id--) { - if (!bucket[bucket_id].tasks) - continue; - return bucket[bucket_id].value; - } - - /* No tasks -- default clamp values */ - return uclamp_idle_value(rq, clamp_id, clamp_value); -} - static void __uclamp_update_util_min_rt_default(struct task_struct *p) { unsigned int default_util_min; @@ -1542,196 +1485,24 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_= id clamp_id) =20 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id) { - struct uclamp_se uc_eff; - - /* Task currently refcounted: use back-annotated (effective) value */ - if (p->uclamp[clamp_id].active) - return (unsigned long)p->uclamp[clamp_id].value; - - uc_eff =3D uclamp_eff_get(p, clamp_id); - - return (unsigned long)uc_eff.value; -} - -/* - * When a task is enqueued on a rq, the clamp bucket currently defined by = the - * task's uclamp::bucket_id is refcounted on that rq. This also immediately - * updates the rq's clamp value if required. - * - * Tasks can have a task-specific value requested from user-space, track - * within each bucket the maximum value for tasks refcounted in it. - * This "local max aggregation" allows to track the exact "requested" value - * for each bucket when all its RUNNABLE tasks require the same clamp. 
- */ -static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - struct uclamp_rq *uc_rq =3D &rq->uclamp[clamp_id]; - struct uclamp_se *uc_se =3D &p->uclamp[clamp_id]; - struct uclamp_bucket *bucket; - - lockdep_assert_rq_held(rq); + if (!uclamp_is_used() || !p->uclamp[clamp_id].active) + return uclamp_none(clamp_id); =20 - /* Update task effective clamp */ - p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); - - bucket =3D &uc_rq->bucket[uc_se->bucket_id]; - bucket->tasks++; - uc_se->active =3D true; - - uclamp_idle_reset(rq, clamp_id, uc_se->value); - - /* - * Local max aggregation: rq buckets always track the max - * "requested" clamp value of its RUNNABLE tasks. - */ - if (bucket->tasks =3D=3D 1 || uc_se->value > bucket->value) - bucket->value =3D uc_se->value; - - if (uc_se->value > uclamp_rq_get(rq, clamp_id)) - uclamp_rq_set(rq, clamp_id, uc_se->value); + return p->uclamp[clamp_id].value; } =20 -/* - * When a task is dequeued from a rq, the clamp bucket refcounted by the t= ask - * is released. If this is the last task reference counting the rq's max - * active clamp value, then the rq's clamp value is updated. - * - * Both refcounted tasks and rq's cached clamp values are expected to be - * always valid. If it's detected they are not, as defensive programming, - * enforce the expected state and warn. - */ -static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - struct uclamp_rq *uc_rq =3D &rq->uclamp[clamp_id]; - struct uclamp_se *uc_se =3D &p->uclamp[clamp_id]; - struct uclamp_bucket *bucket; - unsigned int bkt_clamp; - unsigned int rq_clamp; - - lockdep_assert_rq_held(rq); - - /* - * If sched_uclamp_used was enabled after task @p was enqueued, - * we could end up with unbalanced call to uclamp_rq_dec_id(). - * - * In this case the uc_se->active flag should be false since no uclamp - * accounting was performed at enqueue time and we can just return - * here. - * - * Need to be careful of the following enqueue/dequeue ordering - * problem too - * - * enqueue(taskA) - * // sched_uclamp_used gets enabled - * enqueue(taskB) - * dequeue(taskA) - * // Must not decrement bucket->tasks here - * dequeue(taskB) - * - * where we could end up with stale data in uc_se and - * bucket[uc_se->bucket_id]. - * - * The following check here eliminates the possibility of such race. - */ - if (unlikely(!uc_se->active)) - return; - - bucket =3D &uc_rq->bucket[uc_se->bucket_id]; - - SCHED_WARN_ON(!bucket->tasks); - if (likely(bucket->tasks)) - bucket->tasks--; - - uc_se->active =3D false; - - /* - * Keep "local max aggregation" simple and accept to (possibly) - * overboost some RUNNABLE tasks in the same bucket. - * The rq clamp bucket value is reset to its base value whenever - * there are no more RUNNABLE tasks refcounting it. - */ - if (likely(bucket->tasks)) - return; - - rq_clamp =3D uclamp_rq_get(rq, clamp_id); - /* - * Defensive programming: this should never happen. If it happens, - * e.g. due to future modification, warn and fixup the expected value. - */ - SCHED_WARN_ON(bucket->value > rq_clamp); - if (bucket->value >=3D rq_clamp) { - bkt_clamp =3D uclamp_rq_max_value(rq, clamp_id, uc_se->value); - uclamp_rq_set(rq, clamp_id, bkt_clamp); - } -} - -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) -{ - enum uclamp_id clamp_id; - - /* - * Avoid any overhead until uclamp is actually used by the userspace. 
- * - * The condition is constructed such that a NOP is generated when - * sched_uclamp_used is disabled. - */ - if (!static_branch_unlikely(&sched_uclamp_used)) - return; - - if (unlikely(!p->sched_class->uclamp_enabled)) - return; - - for_each_clamp_id(clamp_id) - uclamp_rq_inc_id(rq, p, clamp_id); - - /* Reset clamp idle holding when there is one RUNNABLE task */ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) - rq->uclamp_flags &=3D ~UCLAMP_FLAG_IDLE; -} - -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +static inline void +uclamp_update_active_nolock(struct task_struct *p) { enum uclamp_id clamp_id; =20 - /* - * Avoid any overhead until uclamp is actually used by the userspace. - * - * The condition is constructed such that a NOP is generated when - * sched_uclamp_used is disabled. - */ - if (!static_branch_unlikely(&sched_uclamp_used)) - return; - - if (unlikely(!p->sched_class->uclamp_enabled)) - return; - for_each_clamp_id(clamp_id) - uclamp_rq_dec_id(rq, p, clamp_id); -} - -static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - if (!p->uclamp[clamp_id].active) - return; - - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - - /* - * Make sure to clear the idle flag if we've transiently reached 0 - * active tasks on rq. - */ - if (clamp_id =3D=3D UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) - rq->uclamp_flags &=3D ~UCLAMP_FLAG_IDLE; + p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); } =20 static inline void uclamp_update_active(struct task_struct *p) { - enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; =20 @@ -1745,14 +1516,7 @@ uclamp_update_active(struct task_struct *p) */ rq =3D task_rq_lock(p, &rf); =20 - /* - * Setting the clamp bucket is serialized by task_rq_lock(). - * If the task is not yet RUNNABLE and its task_struct is not - * affecting a valid clamp bucket, the next time it's enqueued, - * it will already see the updated clamp bucket value. - */ - for_each_clamp_id(clamp_id) - uclamp_rq_reinc_id(rq, p, clamp_id); + uclamp_update_active_nolock(p); =20 task_rq_unlock(rq, p, &rf); } @@ -1983,26 +1747,22 @@ static void __setscheduler_uclamp(struct task_struc= t *p, uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } + + uclamp_update_active_nolock(p); } =20 static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; =20 - /* - * We don't need to hold task_rq_lock() when updating p->uclamp_* here - * as the task is still at its early fork stages. 
- */ - for_each_clamp_id(clamp_id) - p->uclamp[clamp_id].active =3D false; - - if (likely(!p->sched_reset_on_fork)) - return; - - for_each_clamp_id(clamp_id) { - uclamp_se_set(&p->uclamp_req[clamp_id], - uclamp_none(clamp_id), false); + if (unlikely(p->sched_reset_on_fork)) { + for_each_clamp_id(clamp_id) { + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } } + + uclamp_update_active(p); } =20 static void uclamp_post_fork(struct task_struct *p) @@ -2010,28 +1770,10 @@ static void uclamp_post_fork(struct task_struct *p) uclamp_update_util_min_rt_default(p); } =20 -static void __init init_uclamp_rq(struct rq *rq) -{ - enum uclamp_id clamp_id; - struct uclamp_rq *uc_rq =3D rq->uclamp; - - for_each_clamp_id(clamp_id) { - uc_rq[clamp_id] =3D (struct uclamp_rq) { - .value =3D uclamp_none(clamp_id) - }; - } - - rq->uclamp_flags =3D UCLAMP_FLAG_IDLE; -} - static void __init init_uclamp(void) { struct uclamp_se uc_max =3D {}; enum uclamp_id clamp_id; - int cpu; - - for_each_possible_cpu(cpu) - init_uclamp_rq(cpu_rq(cpu)); =20 for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -2050,8 +1792,6 @@ static void __init init_uclamp(void) } =20 #else /* CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) { @@ -2098,7 +1838,6 @@ static inline void enqueue_task(struct rq *rq, struct= task_struct *p, int flags) psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } =20 - uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); =20 if (sched_core_enabled(rq)) @@ -2118,7 +1857,6 @@ static inline void dequeue_task(struct rq *rq, struct= task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } =20 - uclamp_rq_dec(rq, p); p->sched_class->dequeue_task(rq, p, flags); } =20 @@ -10659,7 +10397,6 @@ static void cpu_util_update_eff(struct cgroup_subsy= s_state *css) if (eff[clamp_id] =3D=3D uc_se[clamp_id].value) continue; uc_se[clamp_id].value =3D eff[clamp_id]; - uc_se[clamp_id].bucket_id =3D uclamp_bucket_id(eff[clamp_id]); clamps |=3D (0x1 << clamp_id); } if (!clamps) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75a8f7d50e9c..bfe01f534a21 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12708,10 +12708,6 @@ DEFINE_SCHED_CLASS(fair) =3D { #ifdef CONFIG_SCHED_CORE .task_is_throttled =3D task_is_throttled_fair, #endif - -#ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled =3D 1, -#endif }; =20 #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0597ba0f85ff..68f257150c16 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2732,10 +2732,6 @@ DEFINE_SCHED_CLASS(rt) =3D { #ifdef CONFIG_SCHED_CORE .task_is_throttled =3D task_is_throttled_rt, #endif - -#ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled =3D 1, -#endif }; =20 #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e73aedd9a76b..30dee8eb2ed9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -903,46 +903,6 @@ extern void rto_push_irq_work_func(struct irq_work *wo= rk); #endif /* CONFIG_SMP */ =20 #ifdef CONFIG_UCLAMP_TASK -/* - * struct uclamp_bucket - Utilization clamp bucket - * @value: utilization clamp value for tasks on this clamp bucket - * @tasks: number of RUNNABLE tasks on this clamp bucket - * - * Keep track of how many tasks are RUNNABLE for a 
given utilization - * clamp value. - */ -struct uclamp_bucket { - unsigned long value : bits_per(SCHED_CAPACITY_SCALE); - unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); -}; - -/* - * struct uclamp_rq - rq's utilization clamp - * @value: currently active clamp values for a rq - * @bucket: utilization clamp buckets affecting a rq - * - * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. - * A clamp value is affecting a rq when there is at least one task RUNNABLE - * (or actually running) with that value. - * - * There are up to UCLAMP_CNT possible different clamp values, currently t= here - * are only two: minimum utilization and maximum utilization. - * - * All utilization clamping values are MAX aggregated, since: - * - for util_min: we want to run the CPU at least at the max of the minim= um - * utilization required by its currently RUNNABLE tasks. - * - for util_max: we want to allow the CPU to run up to the max of the - * maximum utilization allowed by its currently RUNNABLE tasks. - * - * Since on each system we expect only a limited number of different - * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track - * the metrics required to compute all the per-rq utilization clamp values. - */ -struct uclamp_rq { - unsigned int value; - struct uclamp_bucket bucket[UCLAMP_BUCKETS]; -}; - DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ =20 @@ -989,12 +949,8 @@ struct rq { u64 nr_switches; =20 #ifdef CONFIG_UCLAMP_TASK - /* Utilization clamp values based on CPU's RUNNABLE tasks */ - struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; - unsigned int uclamp_flags; unsigned int root_cfs_util_uclamp; unsigned int root_cfs_util_uclamp_removed; -#define UCLAMP_FLAG_IDLE 0x01 #endif =20 struct cfs_rq cfs; @@ -2229,11 +2185,6 @@ struct affinity_context { }; =20 struct sched_class { - -#ifdef CONFIG_UCLAMP_TASK - int uclamp_enabled; -#endif - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); @@ -3037,23 +2988,6 @@ static inline unsigned long cpu_util_rt(struct rq *r= q) #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id); =20 -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) -{ - return READ_ONCE(rq->uclamp[clamp_id].value); -} - -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) -{ - WRITE_ONCE(rq->uclamp[clamp_id].value, value); -} - -static inline bool uclamp_rq_is_idle(struct rq *rq) -{ - return rq->uclamp_flags & UCLAMP_FLAG_IDLE; -} - /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' * by default in the fast path and only gets turned on once userspace perf= orms @@ -3137,25 +3071,6 @@ static inline bool uclamp_is_used(void) return false; } =20 -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) -{ - if (clamp_id =3D=3D UCLAMP_MIN) - return 0; - - return SCHED_CAPACITY_SCALE; -} - -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) -{ -} - -static inline bool uclamp_rq_is_idle(struct rq *rq) -{ - return false; -} - static inline unsigned long root_cfs_util(struct rq *rq) { return READ_ONCE(rq->cfs.avg.util_avg); --=20 2.34.1 From nobody Sat Feb 7 20:51:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on 
aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5918FE8FDDE for ; Wed, 4 Oct 2023 09:06:15 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S241989AbjJDJGO (ORCPT ); Wed, 4 Oct 2023 05:06:14 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56900 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241094AbjJDJFx (ORCPT ); Wed, 4 Oct 2023 05:05:53 -0400
Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id BE0CBF0 for ; Wed, 4 Oct 2023 02:05:44 -0700 (PDT)
Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id E5187153B; Wed, 4 Oct 2023 02:06:22 -0700 (PDT)
Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id E08FB3F59C; Wed, 4 Oct 2023 02:05:42 -0700 (PDT)
From: Hongyan Xia
To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli
Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, Hongyan Xia
Subject: [RFC PATCH 6/6] sched/uclamp: Simplify uclamp_eff_value()
Date: Wed, 4 Oct 2023 10:04:54 +0100
Message-Id: <6006f27e2cae8c5f8b00987aa04ee29317aabcc1.1696345700.git.Hongyan.Xia2@arm.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To:
References:
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

From: Hongyan Xia

The previous commit, "sched/uclamp: Remove all uclamp bucket logic", removes the uclamp_rq_{inc,dec}() functions, so now p->uclamp contains the correct values all the time after an uclamp_update_active() call, and there's no need to toggle the boolean `active` after an update. As a result, uclamp_eff_value() is fairly simple now and can live as a static inline function.
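For illustration only, the simplified helper can be modelled in plain userspace C as below. This is a hedged sketch, not kernel code: task_model and eff_value() are made-up stand-ins for task_struct and uclamp_eff_value(), and uclamp_is_used() is reduced to a plain boolean. It only shows the two cases the new static inline in the diff below covers: return the cached effective value when it has been computed, otherwise fall back to the uclamp_none() defaults.

/* Userspace model (sketch) of the simplified uclamp_eff_value(). */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
enum uclamp_id { UCLAMP_MIN, UCLAMP_MAX, UCLAMP_CNT };

struct uclamp_se {
	unsigned int value;
	unsigned int active;
};

struct task_model {
	struct uclamp_se uclamp[UCLAMP_CNT];
};

/* Stand-in for uclamp_is_used(); in the kernel this is a static key. */
static bool uclamp_used = true;

static unsigned long eff_value(const struct task_model *p, int clamp_id)
{
	/* Cached effective value, if uclamp is in use and has been computed. */
	if (uclamp_used && p->uclamp[clamp_id].active)
		return p->uclamp[clamp_id].value;

	/* Otherwise fall back to the "no clamp" values, as uclamp_none() does. */
	return clamp_id == UCLAMP_MIN ? 0 : SCHED_CAPACITY_SCALE;
}

int main(void)
{
	struct task_model p = {
		.uclamp = {
			[UCLAMP_MIN] = { .value = 200, .active = 1 },
			[UCLAMP_MAX] = { .value = 800, .active = 1 },
		},
	};

	printf("min=%lu max=%lu\n",
	       eff_value(&p, UCLAMP_MIN), eff_value(&p, UCLAMP_MAX));

	/* Before the effective values are computed, the defaults win. */
	p.uclamp[UCLAMP_MAX].active = 0;
	printf("min=%lu max=%lu\n",
	       eff_value(&p, UCLAMP_MIN), eff_value(&p, UCLAMP_MAX));

	return 0;
}
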
Signed-off-by: Hongyan Xia --- kernel/sched/core.c | 13 ++++--------- kernel/sched/sched.h | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c5bf01e7df28..737921a9dd91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1483,21 +1483,15 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_i= d clamp_id) return uc_req; } =20 -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id) -{ - if (!uclamp_is_used() || !p->uclamp[clamp_id].active) - return uclamp_none(clamp_id); - - return p->uclamp[clamp_id].value; -} - static inline void uclamp_update_active_nolock(struct task_struct *p) { enum uclamp_id clamp_id; =20 - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); + p->uclamp[clamp_id].active =3D 1; + } } =20 static inline void @@ -1759,6 +1753,7 @@ static void uclamp_fork(struct task_struct *p) for_each_clamp_id(clamp_id) { uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false); + p->uclamp[clamp_id].active =3D 0; } } =20 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 30dee8eb2ed9..896626afbedc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2986,8 +2986,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) #endif =20 #ifdef CONFIG_UCLAMP_TASK -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id); - /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' * by default in the fast path and only gets turned on once userspace perf= orms @@ -3001,6 +2999,18 @@ static inline bool uclamp_is_used(void) return static_branch_likely(&sched_uclamp_used); } =20 +static inline unsigned long uclamp_eff_value(struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (uclamp_is_used() && p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].value; + + if (clamp_id =3D=3D UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; +} + static inline unsigned long root_cfs_util(struct rq *rq) { return READ_ONCE(rq->root_cfs_util_uclamp); --=20 2.34.1