From: Hongyan Xia
To: Ingo Molnar, Peter Zijlstra, Vincent Guittot, Dietmar Eggemann,
    Juri Lelli, Steven Rostedt, Ben Segall, Mel Gorman,
    Daniel Bristot de Oliveira, Valentin Schneider
Cc: Qais Yousef, Morten Rasmussen, Lukasz Luba, Christian Loehle,
    linux-kernel@vger.kernel.org, David Dai, Saravana Kannan, Hongyan Xia
Subject: [RFC PATCH v2 1/7] Revert "sched/uclamp: Set max_spare_cap_cpu even if max_spare_cap is 0"
Date: Thu, 1 Feb 2024 13:11:57 +0000

That commit creates further problems because a spare capacity of 0 can mean
either that the CPU is genuinely maxed out or that it is only throttled by
UCLAMP_MAX, yet we end up giving all such CPUs a chance, which can result in
bogus energy calculations. It also tends to pile tasks onto the same CPU and
requires additional load-balancing patches. Sum aggregation solves these
problems and the reverted patch is no longer needed.

This reverts commit 6b00a40147653c8ea748e8f4396510f252763364.
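[Editor's note: a minimal illustration of the ambiguity described above, not
part of the patch. Under max-aggregated uclamp, the PELT util_avg of a
UCLAMP_MAX-capped task still ramps toward 1024 because the task runs whenever
it can, so a throttled CPU and a genuinely saturated CPU both report zero
spare capacity. The spare_cap() helper below is hypothetical and only mirrors
the capacity-minus-utilization arithmetic.]

/*
 * Editor's illustration, not kernel code: why "spare capacity == 0" is
 * ambiguous. A saturated CPU and a UCLAMP_MAX-throttled CPU whose task's
 * util_avg has ramped to capacity both show no spare room.
 */
#include <stdio.h>

#define CPU_CAPACITY 1024UL

/* Hypothetical helper: capacity minus utilization, floored at 0. */
static unsigned long spare_cap(unsigned long util)
{
	return util >= CPU_CAPACITY ? 0 : CPU_CAPACITY - util;
}

int main(void)
{
	unsigned long saturated_util = 1024;	/* CPU really is maxed out	*/
	unsigned long capped_util    = 1024;	/* busy task, UCLAMP_MAX = 512	*/

	/* Both print 0: max aggregation cannot tell the two cases apart. */
	printf("saturated: %lu\n", spare_cap(saturated_util));
	printf("capped:    %lu\n", spare_cap(capped_util));
	return 0;
}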
Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b803030c3a03..d5cc87db4845 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7978,10 +7978,11 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) for (; pd; pd =3D pd->next) { unsigned long util_min =3D p_util_min, util_max =3D p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; - long prev_spare_cap =3D -1, max_spare_cap =3D -1; + unsigned long cur_delta, max_spare_cap =3D 0; unsigned long rq_util_min, rq_util_max; - unsigned long cur_delta, base_energy; + unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; + unsigned long base_energy; int fits, max_fits =3D -1; =20 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -8044,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) prev_spare_cap =3D cpu_cap; prev_fits =3D fits; } else if ((fits > max_fits) || - ((fits =3D=3D max_fits) && ((long)cpu_cap > max_spare_cap))) { + ((fits =3D=3D max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -8056,7 +8057,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) } } =20 - if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap =3D=3D 0) continue; =20 eenv_pd_busy_time(&eenv, cpus, p); @@ -8064,7 +8065,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) base_energy =3D compute_energy(&eenv, pd, cpus, p, -1); =20 /* Evaluate the energy impact of using prev_cpu. */ - if (prev_spare_cap > -1) { + if (prev_spare_cap > 0) { prev_delta =3D compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ --=20 2.34.1 From nobody Sat Feb 7 13:43:40 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id CFDD75338E for ; Thu, 1 Feb 2024 13:12:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793141; cv=none; b=dVyAFyDM0Vvwa9pxfpoNBehs3sF9Hs1W/bLlxDbGcCBjJGA/JFU5rcxPIMSb/KVwijlomIzZwJL5YEGqkdc1KO0kQKqLGW2rAXzo4pxYl6bv1kEARP75Ux5YLmlS+ESeXY3AwUujg9f/tnZ6DOtN8qUNrqfEI5nwF61h5/7X8TQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793141; c=relaxed/simple; bh=8GQcDzINuKnM88L0X3+SWU8VavTC8VBBqvmgixz9hgw=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=U+6adyVFjTFjPDJrbUd/eUf6I6ASsUO3enuvgnWJKtfGWPn737JomBZB30EBDanI2BMnoj38ZsZNjcJs0vPzu4p3u9lY6r1+9NTbHQBqf0LeP0Sp2axFre3gqlMuFHrlX+em9ondIidq435YVR9d/EWzKhOkCF9UFfj6qAHKvlg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id D99C7175D; Thu, 1 Feb 2024 05:13:00 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with 
ESMTPSA id 19CCD3F762; Thu, 1 Feb 2024 05:12:15 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, David Dai , Saravana Kannan Subject: [RFC PATCH v2 2/7] sched/uclamp: Track uclamped util_avg in sched_avg Date: Thu, 1 Feb 2024 13:11:58 +0000 Message-Id: <92b6ffbffa4dd9ac5d27809bb14528183a54c3a3.1706792708.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Track a uclamped version of util_avg in sched_avg, which clamps util_avg within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is updated. At the root CFS rq level, just like util_est, rq->cfs.avg.util_avg_uclamp must always be the sum of all util_avg_uclamp of CFS tasks on this rq. So, each time the util_avg_uclamp of a task gets updated, we also track the delta and update the root cfs_rq. When a CFS task gets enqueued or dequeued, the rq->cfs.avg.util_avg_uclamp also needs to add or subtract the util_avg_uclamp of this task. Signed-off-by: Hongyan Xia --- include/linux/sched.h | 3 +++ kernel/sched/fair.c | 21 +++++++++++++++++++ kernel/sched/pelt.c | 48 +++++++++++++++++++++++++++++++++++-------- kernel/sched/pelt.h | 5 +++-- kernel/sched/sched.h | 27 ++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 03bfe9ab2951..f28eeff169ff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -470,6 +470,9 @@ struct sched_avg { unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; +#ifdef CONFIG_UCLAMP_TASK + unsigned int util_avg_uclamp; +#endif } ____cacheline_aligned; =20 /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5cc87db4845..4f535c96463b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1089,6 +1089,9 @@ void post_init_entity_util_avg(struct task_struct *p) } =20 sa->runnable_avg =3D sa->util_avg; +#ifdef CONFIG_UCLAMP_TASK + sa->util_avg_uclamp =3D sa->util_avg; +#endif } =20 #else /* !CONFIG_SMP */ @@ -6763,6 +6766,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct = *p, int flags) =20 /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); +#ifdef CONFIG_UCLAMP_TASK + util_uclamp_enqueue(&rq->cfs.avg, p); + update_util_uclamp(0, 0, 0, &rq->cfs.avg, p); + /* TODO: Better skip the frequency update in the for loop above. 
*/ + cpufreq_update_util(rq, 0); +#endif =20 /* * Since new tasks are assigned an initial util_avg equal to @@ -6854,6 +6863,9 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) =20 /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); +#ifdef CONFIG_UCLAMP_TASK + util_uclamp_dequeue(&rq->cfs.avg, p); +#endif =20 /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -6862,6 +6874,15 @@ static void dequeue_task_fair(struct rq *rq, struct = task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); + +#ifdef CONFIG_UCLAMP_TASK + if (rq->cfs.h_nr_running =3D=3D 0) { + WARN_ONCE(rq->cfs.avg.util_avg_uclamp, + "0 tasks on CFS of CPU %d, but util_avg_uclamp is %u\n", + rq->cpu, rq->cfs.avg.util_avg_uclamp); + WRITE_ONCE(rq->cfs.avg.util_avg_uclamp, 0); + } +#endif } =20 #ifdef CONFIG_SMP diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 63b6cf898220..eca45a863f9f 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -266,6 +266,39 @@ ___update_load_avg(struct sched_avg *sa, unsigned long= load) WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } =20 +#ifdef CONFIG_UCLAMP_TASK +/* avg must belong to the queue this se is on. */ +void update_util_uclamp(struct sched_avg *avg, struct task_struct *p) +{ + unsigned int util, uclamp_min, uclamp_max; + int delta; + + if (!p->se.on_rq) + return; + + if (!avg) + return; + + util =3D READ_ONCE(p->se.avg.util_avg); + uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); + uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); + util =3D clamp(util, uclamp_min, uclamp_max); + + delta =3D util - READ_ONCE(p->se.avg.util_avg_uclamp); + if (delta =3D=3D 0) + return; + + WRITE_ONCE(p->se.avg.util_avg_uclamp, util); + util =3D READ_ONCE(avg->util_avg_uclamp); + util +=3D delta; + WRITE_ONCE(avg->util_avg_uclamp, util); +} +#else /* !CONFIG_UCLAMP_TASK */ +void update_util_uclamp(struct sched_avg *avg, struct task_struct *p) +{ +} +#endif + /* * sched_entity: * @@ -292,29 +325,28 @@ ___update_load_avg(struct sched_avg *sa, unsigned lon= g load) * load_avg =3D \Sum se->avg.load_avg */ =20 -int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) +void __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se)); + if (entity_is_task(se)) + update_util_uclamp(NULL, task_of(se)); trace_pelt_se_tp(se); - return 1; } - - return 0; } =20 -int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_enti= ty *se) +void __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_ent= ity *se) { if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), cfs_rq->curr =3D=3D se)) { =20 ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); + if (entity_is_task(se)) + update_util_uclamp(&rq_of(cfs_rq)->cfs.avg, + task_of(se)); trace_pelt_se_tp(se); - return 1; } - - return 0; } =20 int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9e1083465fbc..6862f79e0fcd 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,8 +1,9 @@ #ifdef CONFIG_SMP #include "sched-pelt.h" =20 -int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); -int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_enti= ty *se); +void update_util_uclamp(struct sched_avg *avg, struct task_struct *p); 
+void __update_load_avg_blocked_se(u64 now, struct sched_entity *se); +void __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_ent= ity *se); int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e58a54bda77d..35036246824b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3085,6 +3085,33 @@ static inline bool uclamp_is_used(void) { return static_branch_likely(&sched_uclamp_used); } + +static inline void util_uclamp_enqueue(struct sched_avg *avg, + struct task_struct *p) +{ + unsigned int avg_val =3D READ_ONCE(avg->util_avg_uclamp); + unsigned int p_val =3D READ_ONCE(p->se.avg.util_avg_uclamp); + + WRITE_ONCE(avg->util_avg_uclamp, avg_val + p_val); +} + +static inline void util_uclamp_dequeue(struct sched_avg *avg, + struct task_struct *p) +{ + unsigned int avg_val =3D READ_ONCE(avg->util_avg_uclamp); + unsigned int p_val =3D READ_ONCE(p->se.avg.util_avg_uclamp), new_val; + + if (avg_val > p_val) + new_val =3D avg_val - p_val; + else { + WARN_ONCE(avg_val < p_val, + "avg_val underflow. avg_val %u is even less than p_val %u before subtra= ction\n", + avg_val, p_val); + new_val =3D 0; + } + + WRITE_ONCE(avg->util_avg_uclamp, new_val); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) --=20 2.34.1 From nobody Sat Feb 7 13:43:40 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 065CB5B68D for ; Thu, 1 Feb 2024 13:12:21 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793145; cv=none; b=GaIrfsPsfil384E+jKhrhlocEgLhcm7WQTjYyATEvzTwPHaWTi96vT326KaYi/Cf21ntNvnfMVD6HjYPTq0R1K+C0YX4wyUdF64Wxw7P5g7q7o4ArrhSla/8vwwXe/lBm4WBXW3J4slGnGL+GPDknJMkwq6/FX13a4eyz5e3MSo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793145; c=relaxed/simple; bh=MF8KG6hTtTJAn69SRdLKWSXk+01l8dQMQa0VhYm+Aq0=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=QtV9RW0yZDshgoKzx8QT8brFjmjFZV3HvriB39KrBABdYSbGf5LHiZYN0/Q/ngKYKAVu0sbHCdSQ0STFca2z8EAYP3TVQQMgoN3TJKs/hU2fVQA53uB8Hvhwyl+n5p8k7iT+aVwkKRBFBObyXfe4F6bcdHVvtpwvOY71MNeadqQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 0B7E01762; Thu, 1 Feb 2024 05:13:04 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 3FDE43F762; Thu, 1 Feb 2024 05:12:19 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, David Dai , Saravana Kannan 
Subject: [RFC PATCH v2 3/7] sched/uclamp: Introduce root_cfs_util_uclamp for rq
Date: Thu, 1 Feb 2024 13:11:59 +0000
Message-Id: <68fbd0c0bb7e2ef7a80e7359512672a235a963b1.1706792708.git.hongyan.xia2@arm.com>

The problem with rq->cfs.avg.util_avg_uclamp is that it only tracks the sum
of the contributions of CFS tasks that are on the rq. However, CFS tasks that
have just been dequeued from the rq->cfs still have decaying contributions to
the rq utilization because of PELT. Introduce root_cfs_util_uclamp to capture
the total utilization of CFS tasks both on and off this rq.

Theoretically, keeping track of the sum of all tasks on a CPU (either on or
off the rq) requires us to periodically sample the decaying PELT utilization
of all off-rq tasks and then sum them up, which introduces substantial extra
code and overhead. However, we can avoid that overhead, as shown in this
example:

Let's assume 3 tasks, A, B and C. A is still on rq->cfs but B and C have just
been dequeued. The cfs.avg.util_avg_uclamp has dropped from A + B + C to just
A, but the instantaneous utilization has only just started to decay and is
still A + B + C.

Let's denote root_cfs_util_uclamp_old as the instantaneous total utilization
right before B and C are dequeued. After p periods, with y being the decay
factor, the new root_cfs_util_uclamp becomes:

  root_cfs_util_uclamp
    = A + B * y^p + C * y^p
    = A + (A + B + C - A) * y^p
    = cfs.avg.util_avg_uclamp
      + (root_cfs_util_uclamp_old - cfs.avg.util_avg_uclamp) * y^p
    = cfs.avg.util_avg_uclamp + diff * y^p

So, whenever we want to calculate the new root_cfs_util_uclamp (including
both on- and off-rq CFS tasks of a CPU), we can just decay the diff between
root_cfs_util_uclamp and cfs.avg.util_avg_uclamp, and add the decayed diff to
cfs.avg.util_avg_uclamp to obtain the new root_cfs_util_uclamp, without
bothering to periodically sample off-rq CFS tasks and sum them up. This
significantly reduces the overhead needed to maintain this signal and makes
sure we now also include the decaying contributions of CFS tasks that have
been dequeued.

NOTE: In no way do we change how PELT and util_avg work. The original PELT
signal is kept as-is and is used when needed. The new signals,
util_avg_uclamp and root_cfs_util_uclamp, are additional hints to the
scheduler and are not meant to replace the original PELT signals.
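[Editor's note: a minimal user-space sketch of the "decay the diff" shortcut
above, not part of the patch. It assumes whole 1024us PELT periods and uses a
stand-in decay() in place of the kernel's decay_load(); the in-kernel
counterpart of this idea is the ___update_util_uclamp_towards() helper added
in the diff below.]

/*
 * Editor's sketch, not kernel code. decay() stands in for decay_load():
 * PELT halves a signal every 32 periods of 1024us, i.e. y^32 == 0.5.
 */
#include <stdio.h>
#include <math.h>

static double decay(double val, unsigned int periods)
{
	return val * pow(0.5, periods / 32.0);
}

/*
 * New root estimate after 'periods' periods, given the current sum of
 * on-rq contributions (util_avg_uclamp) and the root value sampled when
 * the now-dequeued tasks were still counted (root_old).
 */
static double new_root_cfs_util_uclamp(double util_avg_uclamp,
				       double root_old,
				       unsigned int periods)
{
	double diff = root_old - util_avg_uclamp;

	return util_avg_uclamp + decay(diff, periods);
}

int main(void)
{
	double A = 300.0, B = 200.0, C = 100.0;	/* A on rq; B and C dequeued */

	/* After 32 periods the off-rq part (B + C) has halved. */
	printf("%.1f\n", new_root_cfs_util_uclamp(A, A + B + C, 32));
	return 0;
}

Under these assumptions the example prints 450.0, i.e. A plus half of
(B + C), which matches applying the y^p factor for p = 32 to the diff only.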
Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 7 +++ kernel/sched/pelt.c | 106 +++++++++++++++++++++++++++++++++++++++---- kernel/sched/pelt.h | 3 +- kernel/sched/sched.h | 16 +++++++ 4 files changed, 123 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f535c96463b..36357cfaf48d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6710,6 +6710,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *= p, int flags) struct sched_entity *se =3D &p->se; int idle_h_nr_running =3D task_has_idle_policy(p); int task_new =3D !(flags & ENQUEUE_WAKEUP); + bool __maybe_unused migrated =3D p->se.avg.last_update_time =3D=3D 0; =20 /* * The code below (indirectly) updates schedutil which looks at @@ -6769,6 +6770,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct = *p, int flags) #ifdef CONFIG_UCLAMP_TASK util_uclamp_enqueue(&rq->cfs.avg, p); update_util_uclamp(0, 0, 0, &rq->cfs.avg, p); + if (migrated) + rq->root_cfs_util_uclamp +=3D p->se.avg.util_avg_uclamp; + rq->root_cfs_util_uclamp =3D max(rq->root_cfs_util_uclamp, + rq->cfs.avg.util_avg_uclamp); /* TODO: Better skip the frequency update in the for loop above. */ cpufreq_update_util(rq, 0); #endif @@ -8252,6 +8257,7 @@ static void migrate_task_rq_fair(struct task_struct *= p, int new_cpu) migrate_se_pelt_lag(se); } =20 + remove_root_cfs_util_uclamp(p); /* Tell new CPU we are migrated */ se->avg.last_update_time =3D 0; =20 @@ -8261,6 +8267,7 @@ static void migrate_task_rq_fair(struct task_struct *= p, int new_cpu) static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); + remove_root_cfs_util_uclamp(p); } =20 static int diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index eca45a863f9f..9ba208ac26db 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -267,14 +267,78 @@ ___update_load_avg(struct sched_avg *sa, unsigned lon= g load) } =20 #ifdef CONFIG_UCLAMP_TASK +static int ___update_util_uclamp_towards(u64 now, + u64 last_update_time, + u32 period_contrib, + unsigned int *old, + unsigned int new_val) +{ + unsigned int old_val =3D READ_ONCE(*old); + u64 delta, periods; + + if (old_val <=3D new_val) { + WRITE_ONCE(*old, new_val); + return old_val < new_val; + } + + if (!last_update_time) + return 0; + delta =3D now - last_update_time; + if ((s64)delta < 0) + return 0; + delta >>=3D 10; + if (!delta) + return 0; + + delta +=3D period_contrib; + periods =3D delta / 1024; + if (periods) { + u64 diff =3D old_val - new_val; + + /* + * Let's assume 3 tasks, A, B and C. A is still on rq but B and + * C have just been dequeued. The cfs.avg.util_avg_uclamp has + * become A but root_cfs_util_uclamp just starts to decay and is + * now still A + B + C. + * + * After p periods with y being the decay factor, the new + * root_cfs_util_uclamp should become + * + * A + B * y^p + C * y^p =3D=3D A + (A + B + C - A) * y^p + * =3D=3D cfs.avg.util_avg_uclamp + + * (root_cfs_util_uclamp_at_the_start - cfs.avg.util_avg_uclamp) = * y^p + * =3D=3D cfs.avg.util_avg_uclamp + diff * y^p + * + * So, instead of summing up each individual decayed values, we + * could just decay the diff and not bother with the summation + * at all. This is why we decay the diff here. + */ + diff =3D decay_load(diff, periods); + WRITE_ONCE(*old, new_val + diff); + return old_val !=3D *old; + } + + return 0; +} + /* avg must belong to the queue this se is on. 
*/ -void update_util_uclamp(struct sched_avg *avg, struct task_struct *p) +void update_util_uclamp(u64 now, + u64 last_update_time, + u32 period_contrib, + struct sched_avg *avg, + struct task_struct *p) { unsigned int util, uclamp_min, uclamp_max; int delta; =20 - if (!p->se.on_rq) + if (!p->se.on_rq) { + ___update_util_uclamp_towards(now, + last_update_time, + period_contrib, + &p->se.avg.util_avg_uclamp, + 0); return; + } =20 if (!avg) return; @@ -294,7 +358,11 @@ void update_util_uclamp(struct sched_avg *avg, struct = task_struct *p) WRITE_ONCE(avg->util_avg_uclamp, util); } #else /* !CONFIG_UCLAMP_TASK */ -void update_util_uclamp(struct sched_avg *avg, struct task_struct *p) +void update_util_uclamp(u64 now, + u64 last_update_time, + u32 period_contrib, + struct sched_avg *avg, + struct task_struct *p) { } #endif @@ -327,23 +395,32 @@ void update_util_uclamp(struct sched_avg *avg, struct= task_struct *p) =20 void __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { + u64 last_update_time =3D se->avg.last_update_time; + u32 period_contrib =3D se->avg.period_contrib; + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se)); if (entity_is_task(se)) - update_util_uclamp(NULL, task_of(se)); + update_util_uclamp(now, last_update_time, + period_contrib, NULL, task_of(se)); trace_pelt_se_tp(se); } } =20 void __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_ent= ity *se) { + u64 last_update_time =3D se->avg.last_update_time; + u32 period_contrib =3D se->avg.period_contrib; + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), cfs_rq->curr =3D=3D se)) { =20 ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); if (entity_is_task(se)) - update_util_uclamp(&rq_of(cfs_rq)->cfs.avg, + update_util_uclamp(now, last_update_time, + period_contrib, + &rq_of(cfs_rq)->cfs.avg, task_of(se)); trace_pelt_se_tp(se); } @@ -351,17 +428,30 @@ void __update_load_avg_se(u64 now, struct cfs_rq *cfs= _rq, struct sched_entity *s =20 int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { + u64 __maybe_unused last_update_time =3D cfs_rq->avg.last_update_time; + u32 __maybe_unused period_contrib =3D cfs_rq->avg.period_contrib; + int ret =3D 0; + if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), cfs_rq->h_nr_running, cfs_rq->curr !=3D NULL)) { =20 ___update_load_avg(&cfs_rq->avg, 1); - trace_pelt_cfs_tp(cfs_rq); - return 1; + ret =3D 1; } =20 - return 0; +#ifdef CONFIG_UCLAMP_TASK + if (&rq_of(cfs_rq)->cfs =3D=3D cfs_rq) + ret =3D ___update_util_uclamp_towards(now, + last_update_time, period_contrib, + &rq_of(cfs_rq)->root_cfs_util_uclamp, + READ_ONCE(cfs_rq->avg.util_avg_uclamp)); +#endif + if (ret) + trace_pelt_cfs_tp(cfs_rq); + + return ret; } =20 /* diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 6862f79e0fcd..a2852d5e862d 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,7 +1,8 @@ #ifdef CONFIG_SMP #include "sched-pelt.h" =20 -void update_util_uclamp(struct sched_avg *avg, struct task_struct *p); +void update_util_uclamp(u64 now, u64 last_update_time, u32 period_contrib, + struct sched_avg *avg, struct task_struct *p); void __update_load_avg_blocked_se(u64 now, struct sched_entity *se); void __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_ent= ity *se); int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 35036246824b..ce80b87b549b 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -998,6 +998,7 @@ struct rq { /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; unsigned int uclamp_flags; + unsigned int root_cfs_util_uclamp; #define UCLAMP_FLAG_IDLE 0x01 #endif =20 @@ -3112,6 +3113,17 @@ static inline void util_uclamp_dequeue(struct sched_= avg *avg, =20 WRITE_ONCE(avg->util_avg_uclamp, new_val); } + +static inline void remove_root_cfs_util_uclamp(struct task_struct *p) +{ + struct rq *rq =3D task_rq(p); + unsigned int root_util =3D READ_ONCE(rq->root_cfs_util_uclamp); + unsigned int p_util =3D READ_ONCE(p->se.avg.util_avg_uclamp), new_util; + + new_util =3D (root_util > p_util) ? root_util - p_util : 0; + new_util =3D max(new_util, READ_ONCE(rq->cfs.avg.util_avg_uclamp)); + WRITE_ONCE(rq->root_cfs_util_uclamp, new_util); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) @@ -3147,6 +3159,10 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) { return false; } + +static inline void remove_root_cfs_util_uclamp(struct task_struct *p) +{ +} #endif /* CONFIG_UCLAMP_TASK */ =20 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ --=20 2.34.1 From nobody Sat Feb 7 13:43:40 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 868D35336B; Thu, 1 Feb 2024 13:12:25 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793147; cv=none; b=M0SOYbiWhHh+LdEk68kOg3MhL08qO3tXD9SXjlAGfraSZ8kU79A6QmrfLQBko3QQ73rZeeqL5QN1bjXNHMSd+P1lgEGRwb+tuyeHUJ7oAiyvCikz/Nush/4cCnf7XRZMXcLezn1nV7bsRy0FR58jqGYg81Y6ELtY30OIzMSWmu4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793147; c=relaxed/simple; bh=g9to9mHQTMEKme4LM6eXw5e3J3La88oBjiXs5Njofgs=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=DOg2YSLvlsgDvR02CqZAgQn6vEVa3OWscYyRR9xs5cJyf2hhdTfUf0sO/ZsvShpSNoy9E/1EFgieag6A+FDMRbVZoiopVskdsY4v4Za58IMIISyQY+g2DEgHVS55BxR/UlFyIPsiefZFzXChVdJlljOK8Yi9Z2CqdZLQz6S1pBY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 7641A1763; Thu, 1 Feb 2024 05:13:07 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 5A3A43F762; Thu, 1 Feb 2024 05:12:22 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , "Rafael J. 
Wysocki" , Viresh Kumar Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, David Dai , Saravana Kannan , linux-pm@vger.kernel.org Subject: [RFC PATCH v2 4/7] sched/fair: Use CFS util_avg_uclamp for utilization and frequency Date: Thu, 1 Feb 2024 13:12:00 +0000 Message-Id: <4f755ae12895bbc74a74bac56bf2ef0f30413a32.1706792708.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Switch to the new util_avg_uclamp for task and runqueue utilization. Since task_util_est() calls task_util() which now uses util_avg_uclamp, this means util_est is now also a clamped value. Now that we have the sum aggregated CFS util value, we do not need to consult uclamp buckets to know how the frequency should be clamped. We simply look at the aggregated top level root_cfs_util_uclamp to know what frequency to choose. TODO: Sum aggregation for RT tasks. I have already implemented RT sum aggregation, which is only 49 lines of code, but I don't want RT to distract this series which is mainly CFS-focused. RT will be sent in a separate mini series. Signed-off-by: Hongyan Xia --- kernel/sched/core.c | 17 ++++---------- kernel/sched/cpufreq_schedutil.c | 10 ++------ kernel/sched/fair.c | 39 ++++++++++++++++---------------- kernel/sched/sched.h | 21 +++++++++++++---- 4 files changed, 42 insertions(+), 45 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index db4be4921e7f..0bedc05c883f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7465,6 +7465,9 @@ int sched_core_idle_cpu(int cpu) * The DL bandwidth number otoh is not a measured metric but a value compu= ted * based on the task model parameters and gives the minimal utilization * required to meet deadlines. + * + * The util_cfs parameter has already taken uclamp into account (unless uc= lamp + * support is not compiled in). */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, @@ -7490,13 +7493,7 @@ unsigned long effective_cpu_util(int cpu, unsigned l= ong util_cfs, } =20 if (min) { - /* - * The minimum utilization returns the highest level between: - * - the computed DL bandwidth needed with the IRQ pressure which - * steals time to the deadline task. - * - The minimum performance requirement for CFS and/or RT. - */ - *min =3D max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + *min =3D irq + cpu_bw_dl(rq); =20 /* * When an RT task is runnable and uclamp is not used, we must @@ -7515,12 +7512,8 @@ unsigned long effective_cpu_util(int cpu, unsigned l= ong util_cfs, util =3D util_cfs + cpu_util_rt(rq); util +=3D cpu_util_dl(rq); =20 - /* - * The maximum hint is a soft bandwidth requirement, which can be lower - * than the actual utilization because of uclamp_max requirements. 
- */ if (max) - *max =3D min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + *max =3D scale; =20 if (util >=3D scale) return scale; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedu= til.c index 95c3c097083e..48a4e4a685d0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -381,11 +381,8 @@ static void sugov_update_single_freq(struct update_uti= l_data *hook, u64 time, /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && + if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && !sg_policy->need_freq_update) { next_f =3D sg_policy->next_freq; =20 @@ -435,11 +432,8 @@ static void sugov_update_single_perf(struct update_uti= l_data *hook, u64 time, /* * Do not reduce the target performance level if the CPU has not been * idle recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util =3D prev_util; =20 cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36357cfaf48d..b92739e1c52f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4821,10 +4821,17 @@ static inline unsigned long cfs_rq_load_avg(struct = cfs_rq *cfs_rq) =20 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); =20 +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg_uclamp); +} +#else static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); } +#endif =20 static inline unsigned long task_runnable(struct task_struct *p) { @@ -4932,8 +4939,13 @@ static inline void util_est_update(struct cfs_rq *cf= s_rq, * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. */ - if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) + if ((READ_ONCE(p->se.avg.util_avg) + UTIL_EST_MARGIN) < + task_runnable(p)) { + ewma =3D clamp(ewma, + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); goto done; + } =20 =20 /* @@ -7685,11 +7697,13 @@ static int select_idle_sibling(struct task_struct *= p, int prev, int target) static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { - struct cfs_rq *cfs_rq =3D &cpu_rq(cpu)->cfs; - unsigned long util =3D READ_ONCE(cfs_rq->avg.util_avg); + struct rq *rq =3D cpu_rq(cpu); + struct cfs_rq *cfs_rq =3D &rq->cfs; + unsigned long util =3D root_cfs_util(rq); + bool capped =3D uclamp_rq_is_capped(rq); unsigned long runnable; =20 - if (boost) { + if (boost && !capped) { runnable =3D READ_ONCE(cfs_rq->avg.runnable_avg); util =3D max(util, runnable); } @@ -7867,7 +7881,6 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpum= ask *pd_cpus, int cpu; =20 for_each_cpu(cpu, pd_cpus) { - struct task_struct *tsk =3D (cpu =3D=3D dst_cpu) ? 
p : NULL; unsigned long util =3D cpu_util(cpu, p, dst_cpu, 1); unsigned long eff_util, min, max; =20 @@ -7880,20 +7893,6 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpu= mask *pd_cpus, */ eff_util =3D effective_cpu_util(cpu, util, &min, &max); =20 - /* Task's uclamp can modify min and max value */ - if (tsk && uclamp_is_used()) { - min =3D max(min, uclamp_eff_value(p, UCLAMP_MIN)); - - /* - * If there is no active max uclamp constraint, - * directly use task's one, otherwise keep max. - */ - if (uclamp_rq_is_idle(cpu_rq(cpu))) - max =3D uclamp_eff_value(p, UCLAMP_MAX); - else - max =3D max(max, uclamp_eff_value(p, UCLAMP_MAX)); - } - eff_util =3D sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util =3D max(max_util, eff_util); } @@ -7996,7 +7995,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) target =3D prev_cpu; =20 sync_entity_load_avg(&p->se); - if (!task_util_est(p) && p_util_min =3D=3D 0) + if (!task_util_est(p)) goto unlock; =20 eenv_task_busy_time(&eenv, p, prev_cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce80b87b549b..3ee28822f48f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3062,16 +3062,17 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) /* Is the rq being capped/throttled by uclamp_max? */ static inline bool uclamp_rq_is_capped(struct rq *rq) { - unsigned long rq_util; - unsigned long max_util; + unsigned long rq_uclamp_util, rq_real_util; =20 if (!static_branch_likely(&sched_uclamp_used)) return false; =20 - rq_util =3D cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); - max_util =3D READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + rq_uclamp_util =3D cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + rq_real_util =3D READ_ONCE(rq->cfs.avg.util_avg) + + READ_ONCE(rq->avg_rt.util_avg); =20 - return max_util !=3D SCHED_CAPACITY_SCALE && rq_util >=3D max_util; + return rq_uclamp_util < SCHED_CAPACITY_SCALE && + rq_real_util > rq_uclamp_util; } =20 /* @@ -3087,6 +3088,11 @@ static inline bool uclamp_is_used(void) return static_branch_likely(&sched_uclamp_used); } =20 +static inline unsigned long root_cfs_util(struct rq *rq) +{ + return READ_ONCE(rq->root_cfs_util_uclamp); +} + static inline void util_uclamp_enqueue(struct sched_avg *avg, struct task_struct *p) { @@ -3160,6 +3166,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return false; } =20 +static inline unsigned long root_cfs_util(struct rq *rq) +{ + return READ_ONCE(rq->cfs.avg.util_avg); +} + static inline void remove_root_cfs_util_uclamp(struct task_struct *p) { } --=20 2.34.1 From nobody Sat Feb 7 13:43:40 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 735F25CDFD for ; Thu, 1 Feb 2024 13:12:28 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793151; cv=none; b=lWp06rbFPC6TcqgiDvTLhTPPnY/mfIz3fFJw/oY6H50MbHkuW4pZAlstHsgokpF0QkDOTR6lL0DnrDt6U6iF091e4wCJtGWjYc0mBjOZrZm61PmycG3itbeHpZH4H9TzJRDxePOr7YLNqkLw2NgOCtprXqaA4TiwkbYVum43HAQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793151; c=relaxed/simple; bh=t7Hg4wTa+RM0YHgJQOc3/BebdVYjqWwE6KBAvsLFOVs=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=urVen31Z1aQcVwfX29KQvmlnjyFBPG+ScEPuRYsXOn0uLysVrT/sfayYMRXgXkkj786i2oIpjNt2shYY1/yRaS1OcQW0dp1nG3QI0Rtnp9ydrFteuqxg2BdZEDZGz4yZKQOOoaIQ7HeB5OZI7Rjyeget8cRZ+P0p3aTdhkywI3Y= 
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 88A6F1764; Thu, 1 Feb 2024 05:13:10 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id A19333F762; Thu, 1 Feb 2024 05:12:25 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, David Dai , Saravana Kannan Subject: [RFC PATCH v2 5/7] sched/fair: Massively simplify util_fits_cpu() Date: Thu, 1 Feb 2024 13:12:01 +0000 Message-Id: X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Currently, there's no way to distinguish the difference between 1) a CPU that is actually maxed out at its highest frequency, or 2) one that is throttled because of UCLAMP_MAX, since both present util_avg values of 1024. This is problematic because when we try to pick a CPU for a task to run, we would like to give 2) a chance, or at least prefer 2) to 1). Current upstream now gives all 0 spare capacity CPUs a chance to consider queuing more tasks because there's a chance that 0 spare capacity is due to UCLAMP_MAX. However, this creates further problems because energy calculations are now bogus when spare capacity is already 0, and tasks tend to pile up on one CPU. Fix by using util_avg_uclamp for util_fits_cpu(). This way, case 1) will still keep its utilization at 1024 whereas 2) shows spare capacities if the sum of util_avg_uclamp values is still under the CPU capacity. Under sum aggregation, checking whether a task fits a CPU becomes much simpler. We simply do fits_capacity() and there does not need to be all kinds of code checking all corner cases for uclamp. This means util_fits_cpu() returns to true and false instead of tri-state, simplifying a huge amount of code. [1]: https://lore.kernel.org/all/20230205224318.2035646-2-qyousef@layalina.= io/ Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 253 ++++---------------------------------------- 1 file changed, 23 insertions(+), 230 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b92739e1c52f..49997f1f58fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4974,135 +4974,19 @@ static inline void util_est_update(struct cfs_rq *= cfs_rq, trace_sched_util_est_se_tp(&p->se); } =20 -static inline int util_fits_cpu(unsigned long util, - unsigned long uclamp_min, - unsigned long uclamp_max, - int cpu) +/* util must be the uclamp'ed value (i.e. from util_avg_uclamp). 
*/ +static inline int util_fits_cpu(unsigned long util, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity =3D capacity_of(cpu); - bool fits, uclamp_max_fits; =20 - /* - * Check if the real util fits without any uclamp boost/cap applied. - */ - fits =3D fits_capacity(util, capacity); - - if (!uclamp_is_used()) - return fits; - - /* - * We must use arch_scale_cpu_capacity() for comparing against uclamp_min= and - * uclamp_max. We only care about capacity pressure (by using - * capacity_of()) for comparing against the real util. - * - * If a task is boosted to 1024 for example, we don't want a tiny - * pressure to skew the check whether it fits a CPU or not. - * - * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), = it - * should fit a little cpu even if there's some pressure. - * - * Only exception is for thermal pressure since it has a direct impact - * on available OPP of the system. - * - * We honour it for uclamp_min only as a drop in performance level - * could result in not getting the requested minimum performance level. - * - * For uclamp_max, we can tolerate a drop in performance level as the - * goal is to cap the task. So it's okay if it's getting less. - */ - capacity_orig =3D arch_scale_cpu_capacity(cpu); - capacity_orig_thermal =3D capacity_orig - arch_scale_thermal_pressure(cpu= ); - - /* - * We want to force a task to fit a cpu as implied by uclamp_max. - * But we do have some corner cases to cater for.. - * - * - * C=3Dz - * | ___ - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | - * | | | | | | | (util somewhere in this region) - * | | | | | | | - * | | | | | | | - * +---------------------------------------- - * cpu0 cpu1 cpu2 - * - * In the above example if a task is capped to a specific performance - * point, y, then when: - * - * * util =3D 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util =3D 80% of y then it is forced to fit on cpu1 to honour - * uclamp_max request. - * - * which is what we're enforcing here. A task always fits if - * uclamp_max <=3D capacity_orig. But when uclamp_max > capacity_orig, - * the normal upmigration rules should withhold still. - * - * Only exception is when we are on max capacity, then we need to be - * careful not to block overutilized state. This is so because: - * - * 1. There's no concept of capping at max_capacity! We can't go - * beyond this performance level anyway. - * 2. The system is being saturated when we're operating near - * max capacity, it doesn't make sense to block overutilized. - */ - uclamp_max_fits =3D (capacity_orig =3D=3D SCHED_CAPACITY_SCALE) && (uclam= p_max =3D=3D SCHED_CAPACITY_SCALE); - uclamp_max_fits =3D !uclamp_max_fits && (uclamp_max <=3D capacity_orig); - fits =3D fits || uclamp_max_fits; - - /* - * - * C=3Dz - * | ___ (region a, capped, util >=3D= uclamp_max) - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | (region b, uclamp_min <=3D u= til <=3D uclamp_max) - * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min - * | | | | | | | - * | | | | | | | (region c, boosted, util < u= clamp_min) - * +---------------------------------------- - * cpu0 cpu1 cpu2 - * - * a) If util > uclamp_max, then we're capped, we don't care about - * actual fitness value here. We only care if uclamp_max fits - * capacity without taking margin/pressure into account. - * See comment above. 
- * - * b) If uclamp_min <=3D util <=3D uclamp_max, then the normal - * fits_capacity() rules apply. Except we need to ensure that we - * enforce we remain within uclamp_max, see comment above. - * - * c) If util < uclamp_min, then we are boosted. Same as (b) but we - * need to take into account the boosted value fits the CPU without - * taking margin/pressure into account. - * - * Cases (a) and (b) are handled in the 'fits' variable already. We - * just need to consider an extra check for case (c) after ensuring we - * handle the case uclamp_min > uclamp_max. - */ - uclamp_min =3D min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) - return -1; - - return fits; + return fits_capacity(util, capacity); } =20 static inline int task_fits_cpu(struct task_struct *p, int cpu) { - unsigned long uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); - unsigned long uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); unsigned long util =3D task_util_est(p); - /* - * Return true only if the cpu fully fits the task requirements, which - * include the utilization but also the performance hints. - */ - return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); + + return util_fits_cpu(util, cpu); } =20 static inline void update_misfit_status(struct task_struct *p, struct rq *= rq) @@ -6678,11 +6562,8 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(cpu_util_cfs(cpu), cpu); } =20 static inline void update_overutilized_status(struct rq *rq) @@ -7463,8 +7344,7 @@ static int select_idle_cpu(struct task_struct *p, str= uct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int t= arget) { - unsigned long task_util, util_min, util_max, best_cap =3D 0; - int fits, best_fits =3D 0; + unsigned long task_util, best_cap =3D 0; int cpu, best_cpu =3D -1; struct cpumask *cpus; =20 @@ -7472,8 +7352,6 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); =20 task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); =20 for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap =3D capacity_of(cpu); @@ -7481,44 +7359,22 @@ select_idle_capacity(struct task_struct *p, struct = sched_domain *sd, int target) if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; =20 - fits =3D util_fits_cpu(task_util, util_min, util_max, cpu); - - /* This CPU fits with all requirements */ - if (fits > 0) + if (util_fits_cpu(task_util, cpu)) return cpu; - /* - * Only the min performance hint (i.e. uclamp_min) doesn't fit. - * Look for the CPU with best capacity. - */ - else if (fits < 0) - cpu_cap =3D arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)= ); =20 - /* - * First, select CPU which fits better (-1 being better than 0). - * Then, select the one with best capacity at same level. 
- */ - if ((fits < best_fits) || - ((fits =3D=3D best_fits) && (cpu_cap > best_cap))) { + if (cpu_cap > best_cap) { best_cap =3D cpu_cap; best_cpu =3D cpu; - best_fits =3D fits; } } =20 return best_cpu; } =20 -static inline bool asym_fits_cpu(unsigned long util, - unsigned long util_min, - unsigned long util_max, - int cpu) +static inline bool asym_fits_cpu(unsigned long util, int cpu) { if (sched_asym_cpucap_active()) - /* - * Return true only if the cpu fully fits the task requirements - * which include the utilization and the performance hints. - */ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return util_fits_cpu(util, cpu); =20 return true; } @@ -7530,7 +7386,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) { bool has_idle_core =3D false; struct sched_domain *sd; - unsigned long task_util, util_min, util_max; + unsigned long task_util; int i, recent_used_cpu, prev_aff =3D -1; =20 /* @@ -7540,8 +7396,6 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); } =20 /* @@ -7550,7 +7404,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) lockdep_assert_irqs_disabled(); =20 if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_cpu(task_util, util_min, util_max, target)) + asym_fits_cpu(task_util, target)) return target; =20 /* @@ -7558,7 +7412,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) */ if (prev !=3D target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util, prev)) { =20 if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(prev, target)) @@ -7579,7 +7433,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) in_task() && prev =3D=3D smp_processor_id() && this_rq()->nr_running <=3D 1 && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util, prev)) { return prev; } =20 @@ -7591,7 +7445,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cp= u)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && - asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + asym_fits_cpu(task_util, recent_used_cpu)) { =20 if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(recent_used_cpu, target)) @@ -7966,13 +7820,8 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) { struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; - unsigned long p_util_min =3D uclamp_is_used() ? uclamp_eff_value(p, UCLAM= P_MIN) : 0; - unsigned long p_util_max =3D uclamp_is_used() ? 
uclamp_eff_value(p, UCLAM= P_MAX) : 1024; struct root_domain *rd =3D this_rq()->rd; int cpu, best_energy_cpu, target =3D -1; - int prev_fits =3D -1, best_fits =3D -1; - unsigned long best_thermal_cap =3D 0; - unsigned long prev_thermal_cap =3D 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -8001,14 +7850,11 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) eenv_task_busy_time(&eenv, p, prev_cpu); =20 for (; pd; pd =3D pd->next) { - unsigned long util_min =3D p_util_min, util_max =3D p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap =3D 0; - unsigned long rq_util_min, rq_util_max; unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; unsigned long base_energy; - int fits, max_fits =3D -1; =20 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); =20 @@ -8024,8 +7870,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) eenv.pd_cap =3D 0; =20 for_each_cpu(cpu, cpus) { - struct rq *rq =3D cpu_rq(cpu); - eenv.pd_cap +=3D cpu_thermal_cap; =20 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -8036,31 +7880,7 @@ static int find_energy_efficient_cpu(struct task_str= uct *p, int prev_cpu) =20 util =3D cpu_util(cpu, p, cpu, 0); cpu_cap =3D capacity_of(cpu); - - /* - * Skip CPUs that cannot satisfy the capacity request. - * IOW, placing the task there would make the CPU - * overutilized. Take uclamp into account to see how - * much capacity we can get out of the CPU; this is - * aligned with sched_cpu_util(). - */ - if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { - /* - * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation - * only. util_fits_cpu() logic requires to - * operate on non clamped util but must use the - * max-aggregated uclamp_{min, max}. - */ - rq_util_min =3D uclamp_rq_get(rq, UCLAMP_MIN); - rq_util_max =3D uclamp_rq_get(rq, UCLAMP_MAX); - - util_min =3D max(rq_util_min, p_util_min); - util_max =3D max(rq_util_max, p_util_max); - } - - fits =3D util_fits_cpu(util, util_min, util_max, cpu); - if (!fits) + if (!util_fits_cpu(util, cpu)) continue; =20 lsub_positive(&cpu_cap, util); @@ -8068,9 +7888,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) if (cpu =3D=3D prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap =3D cpu_cap; - prev_fits =3D fits; - } else if ((fits > max_fits) || - ((fits =3D=3D max_fits) && (cpu_cap > max_spare_cap))) { + } else if (cpu_cap > max_spare_cap) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -8078,7 +7896,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) */ max_spare_cap =3D cpu_cap; max_spare_cap_cpu =3D cpu; - max_fits =3D fits; } } =20 @@ -8097,50 +7914,26 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -=3D base_energy; - prev_thermal_cap =3D cpu_thermal_cap; best_delta =3D min(best_delta, prev_delta); } =20 /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >=3D 0 && max_spare_cap > prev_spare_cap) { - /* Current best energy cpu fits better */ - if (max_fits < best_fits) - continue; - - /* - * Both don't fit performance hint (i.e. uclamp_min) - * but best energy cpu has better capacity. 
- */ - if ((max_fits < 0) && - (cpu_thermal_cap <=3D best_thermal_cap)) - continue; - cur_delta =3D compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock; cur_delta -=3D base_energy; - - /* - * Both fit for the task but best energy cpu has lower - * energy impact. - */ - if ((max_fits > 0) && (best_fits > 0) && - (cur_delta >=3D best_delta)) - continue; - - best_delta =3D cur_delta; - best_energy_cpu =3D max_spare_cap_cpu; - best_fits =3D max_fits; - best_thermal_cap =3D cpu_thermal_cap; + if (cur_delta < best_delta) { + best_delta =3D cur_delta; + best_energy_cpu =3D max_spare_cap_cpu; + } } } rcu_read_unlock(); =20 - if ((best_fits > prev_fits) || - ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + if (best_delta < prev_delta) target =3D best_energy_cpu; =20 return target; --=20 2.34.1 From nobody Sat Feb 7 13:43:40 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 640725D47E for ; Thu, 1 Feb 2024 13:12:31 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793155; cv=none; b=uR7nS4kmwIRXcW/vs0hQMee8PZvEo6lKOIAfNZnk58Pd+5wCVk3v/Of/AXPjD/iJUZaCCBktQAuRmp9BxsdpgMwnCvggFRZmpHIQtVgmcpqXXGqjCEZ/z7O+GEDNi8uVu+X5qChCQK202/8ZThWC+xCJ77IcCVToNLBlRH/NuIo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1706793155; c=relaxed/simple; bh=JaenQUXRXmbunSHjMbrcYbbBgnAH6NQCxQv62fQFkRg=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=jxN8TgQn6xSjuQPypSnnvri84RMT2ld6QCg+oOzHy8ILthZ9p/YsZ3Nz2RkROqfEMvdse3gfZBiBumwB0nCxpnG9cP+l7EMwE6PB6aUsbvJ+FGG9NAf577gMkPqfURi4z5W1pbh2X1Q8qbgblO78UFT+HNpkSzd0OBliiP8+ncI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 329CA176A; Thu, 1 Feb 2024 05:13:14 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 4D0743F762; Thu, 1 Feb 2024 05:12:29 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle , linux-kernel@vger.kernel.org, David Dai , Saravana Kannan Subject: [RFC PATCH v2 6/7] sched/uclamp: Remove all uclamp bucket logic Date: Thu, 1 Feb 2024 13:12:02 +0000 Message-Id: <61ef1a11325838e8b50e76a1b6c6d93bd5f2982c.1706792708.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Also rewrite uclamp_update_active() so that the effective uclamp values are updated every time we change task group 
properties, change system defaults, or when a request is issued from
userspace.

This also significantly reduces uclamp overhead because we no longer
need to compute effective uclamp values and manipulate buckets every
time a task is enqueued or dequeued (in uclamp_rq_{inc/dec}()).

TODO: Rewrite documentation to match the new logic.

Signed-off-by: Hongyan Xia 
---
Changed in v2:
- Remove stale comments about 'uclamp buckets'.
---
 include/linux/sched.h |   4 -
 init/Kconfig          |  32 -----
 kernel/sched/core.c   | 300 +++---------------------------------------
 kernel/sched/fair.c   |   4 -
 kernel/sched/rt.c     |   4 -
 kernel/sched/sched.h  |  85 ------------
 6 files changed, 19 insertions(+), 410 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f28eeff169ff..291b6781b221 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -678,9 +678,6 @@ struct sched_dl_entity {
 };
 
 #ifdef CONFIG_UCLAMP_TASK
-/* Number of utilization clamp buckets (shorter alias) */
-#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
-
 /*
  * Utilization clamp for a scheduling entity
  * @value:		clamp value "assigned" to a se
@@ -706,7 +703,6 @@ struct sched_dl_entity {
  */
 struct uclamp_se {
 	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
-	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
 	unsigned int active		: 1;
 	unsigned int user_defined	: 1;
 };
diff --git a/init/Kconfig b/init/Kconfig
index 9ffb103fc927..1c8e11dcda17 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -808,38 +808,6 @@ config UCLAMP_TASK
 	  enforce or grant any specific bandwidth for tasks.
 
 	  If in doubt, say N.
-
-config UCLAMP_BUCKETS_COUNT
-	int "Number of supported utilization clamp buckets"
-	range 5 20
-	default 5
-	depends on UCLAMP_TASK
-	help
-	  Defines the number of clamp buckets to use. The range of each bucket
-	  will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
-	  number of clamp buckets the finer their granularity and the higher
-	  the precision of clamping aggregation and tracking at run-time.
-
-	  For example, with the minimum configuration value we will have 5
-	  clamp buckets tracking 20% utilization each. A 25% boosted tasks will
-	  be refcounted in the [20..39]% bucket and will set the bucket clamp
-	  effective value to 25%.
-	  If a second 30% boosted task should be co-scheduled on the same CPU,
-	  that task will be refcounted in the same bucket of the first task and
-	  it will boost the bucket clamp effective value to 30%.
-	  The clamp effective value of a bucket is reset to its nominal value
-	  (20% in the example above) when there are no more tasks refcounted in
-	  that bucket.
-
-	  An additional boost/capping margin can be added to some tasks. In the
-	  example above the 25% task will be boosted to 30% until it exits the
-	  CPU. If that should be considered not acceptable on certain systems,
-	  it's always possible to reduce the margin by increasing the number of
-	  clamp buckets to trade off used memory for run-time tracking
-	  precision.
-
-	  If in doubt, use the default value.
-
 endmenu
 
 #
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0bedc05c883f..a3b36adc4dcc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1408,17 +1408,9 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT];
  */
 DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
 
-/* Integer rounded range for each bucket */
-#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
-
 #define for_each_clamp_id(clamp_id) \
 	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
 
-static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
-{
-	return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
-}
-
 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
 {
 	if (clamp_id == UCLAMP_MIN)
@@ -1430,58 +1422,9 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
 				 unsigned int value, bool user_defined)
 {
 	uc_se->value = value;
-	uc_se->bucket_id = uclamp_bucket_id(value);
 	uc_se->user_defined = user_defined;
 }
 
-static inline unsigned int
-uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
-		  unsigned int clamp_value)
-{
-	/*
-	 * Avoid blocked utilization pushing up the frequency when we go
-	 * idle (which drops the max-clamp) by retaining the last known
-	 * max-clamp.
-	 */
-	if (clamp_id == UCLAMP_MAX) {
-		rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
-		return clamp_value;
-	}
-
-	return uclamp_none(UCLAMP_MIN);
-}
-
-static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
-				     unsigned int clamp_value)
-{
-	/* Reset max-clamp retention only on idle exit */
-	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
-		return;
-
-	uclamp_rq_set(rq, clamp_id, clamp_value);
-}
-
-static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
-				 unsigned int clamp_value)
-{
-	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
-	int bucket_id = UCLAMP_BUCKETS - 1;
-
-	/*
-	 * Since both min and max clamps are max aggregated, find the
-	 * top most bucket with tasks in.
-	 */
-	for ( ; bucket_id >= 0; bucket_id--) {
-		if (!bucket[bucket_id].tasks)
-			continue;
-		return bucket[bucket_id].value;
-	}
-
-	/* No tasks -- default clamp values */
-	return uclamp_idle_value(rq, clamp_id, clamp_value);
-}
-
 static void __uclamp_update_util_min_rt_default(struct task_struct *p)
 {
 	unsigned int default_util_min;
@@ -1537,8 +1480,7 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 }
 
 /*
- * The effective clamp bucket index of a task depends on, by increasing
- * priority:
+ * The effective uclamp value of a task depends on, by increasing priority:
  * - the task specific clamp value, when explicitly requested from userspace
  * - the task group effective clamp value, for tasks not either in the root
  *   group or in an autogroup
@@ -1559,196 +1501,24 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
 
 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
 {
-	struct uclamp_se uc_eff;
-
-	/* Task currently refcounted: use back-annotated (effective) value */
-	if (p->uclamp[clamp_id].active)
-		return (unsigned long)p->uclamp[clamp_id].value;
-
-	uc_eff = uclamp_eff_get(p, clamp_id);
-
-	return (unsigned long)uc_eff.value;
-}
-
-/*
- * When a task is enqueued on a rq, the clamp bucket currently defined by the
- * task's uclamp::bucket_id is refcounted on that rq. This also immediately
- * updates the rq's clamp value if required.
- *
- * Tasks can have a task-specific value requested from user-space, track
- * within each bucket the maximum value for tasks refcounted in it.
- * This "local max aggregation" allows to track the exact "requested" value
- * for each bucket when all its RUNNABLE tasks require the same clamp.
- */
-static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-				    enum uclamp_id clamp_id)
-{
-	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
-	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
-	struct uclamp_bucket *bucket;
-
-	lockdep_assert_rq_held(rq);
+	if (!uclamp_is_used() || !p->uclamp[clamp_id].active)
+		return uclamp_none(clamp_id);
 
-	/* Update task effective clamp */
-	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
-
-	bucket = &uc_rq->bucket[uc_se->bucket_id];
-	bucket->tasks++;
-	uc_se->active = true;
-
-	uclamp_idle_reset(rq, clamp_id, uc_se->value);
-
-	/*
-	 * Local max aggregation: rq buckets always track the max
-	 * "requested" clamp value of its RUNNABLE tasks.
-	 */
-	if (bucket->tasks == 1 || uc_se->value > bucket->value)
-		bucket->value = uc_se->value;
-
-	if (uc_se->value > uclamp_rq_get(rq, clamp_id))
-		uclamp_rq_set(rq, clamp_id, uc_se->value);
+	return p->uclamp[clamp_id].value;
 }
 
-/*
- * When a task is dequeued from a rq, the clamp bucket refcounted by the task
- * is released. If this is the last task reference counting the rq's max
- * active clamp value, then the rq's clamp value is updated.
- *
- * Both refcounted tasks and rq's cached clamp values are expected to be
- * always valid. If it's detected they are not, as defensive programming,
- * enforce the expected state and warn.
- */
-static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-				    enum uclamp_id clamp_id)
-{
-	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
-	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
-	struct uclamp_bucket *bucket;
-	unsigned int bkt_clamp;
-	unsigned int rq_clamp;
-
-	lockdep_assert_rq_held(rq);
-
-	/*
-	 * If sched_uclamp_used was enabled after task @p was enqueued,
-	 * we could end up with unbalanced call to uclamp_rq_dec_id().
-	 *
-	 * In this case the uc_se->active flag should be false since no uclamp
-	 * accounting was performed at enqueue time and we can just return
-	 * here.
-	 *
-	 * Need to be careful of the following enqueue/dequeue ordering
-	 * problem too
-	 *
-	 *	enqueue(taskA)
-	 *	// sched_uclamp_used gets enabled
-	 *	enqueue(taskB)
-	 *	dequeue(taskA)
-	 *	// Must not decrement bucket->tasks here
-	 *	dequeue(taskB)
-	 *
-	 * where we could end up with stale data in uc_se and
-	 * bucket[uc_se->bucket_id].
-	 *
-	 * The following check here eliminates the possibility of such race.
-	 */
-	if (unlikely(!uc_se->active))
-		return;
-
-	bucket = &uc_rq->bucket[uc_se->bucket_id];
-
-	SCHED_WARN_ON(!bucket->tasks);
-	if (likely(bucket->tasks))
-		bucket->tasks--;
-
-	uc_se->active = false;
-
-	/*
-	 * Keep "local max aggregation" simple and accept to (possibly)
-	 * overboost some RUNNABLE tasks in the same bucket.
-	 * The rq clamp bucket value is reset to its base value whenever
-	 * there are no more RUNNABLE tasks refcounting it.
-	 */
-	if (likely(bucket->tasks))
-		return;
-
-	rq_clamp = uclamp_rq_get(rq, clamp_id);
-	/*
-	 * Defensive programming: this should never happen. If it happens,
-	 * e.g. due to future modification, warn and fixup the expected value.
-	 */
-	SCHED_WARN_ON(bucket->value > rq_clamp);
-	if (bucket->value >= rq_clamp) {
-		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
-		uclamp_rq_set(rq, clamp_id, bkt_clamp);
-	}
-}
-
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
-{
-	enum uclamp_id clamp_id;
-
-	/*
-	 * Avoid any overhead until uclamp is actually used by the userspace.
-	 *
-	 * The condition is constructed such that a NOP is generated when
-	 * sched_uclamp_used is disabled.
-	 */
-	if (!static_branch_unlikely(&sched_uclamp_used))
-		return;
-
-	if (unlikely(!p->sched_class->uclamp_enabled))
-		return;
-
-	for_each_clamp_id(clamp_id)
-		uclamp_rq_inc_id(rq, p, clamp_id);
-
-	/* Reset clamp idle holding when there is one RUNNABLE task */
-	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
-		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
-}
-
-static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+static inline void
+uclamp_update_active_nolock(struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
-	/*
-	 * Avoid any overhead until uclamp is actually used by the userspace.
-	 *
-	 * The condition is constructed such that a NOP is generated when
-	 * sched_uclamp_used is disabled.
-	 */
-	if (!static_branch_unlikely(&sched_uclamp_used))
-		return;
-
-	if (unlikely(!p->sched_class->uclamp_enabled))
-		return;
-
 	for_each_clamp_id(clamp_id)
-		uclamp_rq_dec_id(rq, p, clamp_id);
-}
-
-static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
-				      enum uclamp_id clamp_id)
-{
-	if (!p->uclamp[clamp_id].active)
-		return;
-
-	uclamp_rq_dec_id(rq, p, clamp_id);
-	uclamp_rq_inc_id(rq, p, clamp_id);
-
-	/*
-	 * Make sure to clear the idle flag if we've transiently reached 0
-	 * active tasks on rq.
-	 */
-	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
-		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+		p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
 }
 
 static inline void
 uclamp_update_active(struct task_struct *p)
 {
-	enum uclamp_id clamp_id;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -1762,14 +1532,7 @@ uclamp_update_active(struct task_struct *p)
 	 */
 	rq = task_rq_lock(p, &rf);
 
-	/*
-	 * Setting the clamp bucket is serialized by task_rq_lock().
-	 * If the task is not yet RUNNABLE and its task_struct is not
-	 * affecting a valid clamp bucket, the next time it's enqueued,
-	 * it will already see the updated clamp bucket value.
-	 */
-	for_each_clamp_id(clamp_id)
-		uclamp_rq_reinc_id(rq, p, clamp_id);
+	uclamp_update_active_nolock(p);
 
 	task_rq_unlock(rq, p, &rf);
 }
@@ -1998,26 +1761,22 @@ static void __setscheduler_uclamp(struct task_struct *p,
 		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
 			      attr->sched_util_max, true);
 	}
+
+	uclamp_update_active_nolock(p);
 }
 
 static void uclamp_fork(struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
-	/*
-	 * We don't need to hold task_rq_lock() when updating p->uclamp_* here
-	 * as the task is still at its early fork stages.
-	 */
-	for_each_clamp_id(clamp_id)
-		p->uclamp[clamp_id].active = false;
-
-	if (likely(!p->sched_reset_on_fork))
-		return;
-
-	for_each_clamp_id(clamp_id) {
-		uclamp_se_set(&p->uclamp_req[clamp_id],
-			      uclamp_none(clamp_id), false);
+	if (unlikely(p->sched_reset_on_fork)) {
+		for_each_clamp_id(clamp_id) {
+			uclamp_se_set(&p->uclamp_req[clamp_id],
+				      uclamp_none(clamp_id), false);
+		}
 	}
+
+	uclamp_update_active(p);
 }
 
 static void uclamp_post_fork(struct task_struct *p)
@@ -2025,28 +1784,10 @@ static void uclamp_post_fork(struct task_struct *p)
 	uclamp_update_util_min_rt_default(p);
 }
 
-static void __init init_uclamp_rq(struct rq *rq)
-{
-	enum uclamp_id clamp_id;
-	struct uclamp_rq *uc_rq = rq->uclamp;
-
-	for_each_clamp_id(clamp_id) {
-		uc_rq[clamp_id] = (struct uclamp_rq) {
-			.value = uclamp_none(clamp_id)
-		};
-	}
-
-	rq->uclamp_flags = UCLAMP_FLAG_IDLE;
-}
-
 static void __init init_uclamp(void)
 {
 	struct uclamp_se uc_max = {};
 	enum uclamp_id clamp_id;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		init_uclamp_rq(cpu_rq(cpu));
 
 	for_each_clamp_id(clamp_id) {
 		uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -2065,8 +1806,7 @@ static void __init init_uclamp(void)
 }
 
 #else /* CONFIG_UCLAMP_TASK */
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
-static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_update_active_nolock(struct task_struct *p) { }
 static inline int uclamp_validate(struct task_struct *p,
 				  const struct sched_attr *attr)
 {
@@ -2113,7 +1853,6 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
 	}
 
-	uclamp_rq_inc(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	if (sched_core_enabled(rq))
@@ -2133,7 +1872,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
 	}
 
-	uclamp_rq_dec(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -10480,6 +10218,7 @@ void sched_move_task(struct task_struct *tsk)
 		put_prev_task(rq, tsk);
 
 	sched_change_group(tsk, group);
+	uclamp_update_active_nolock(tsk);
 
 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
@@ -10612,7 +10351,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
 		if (eff[clamp_id] == uc_se[clamp_id].value)
 			continue;
 		uc_se[clamp_id].value = eff[clamp_id];
-		uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
 		clamps |= (0x1 << clamp_id);
 	}
 	if (!clamps) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 49997f1f58fb..ac1dd5739ec6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12996,10 +12996,6 @@ DEFINE_SCHED_CLASS(fair) = {
 #ifdef CONFIG_SCHED_CORE
 	.task_is_throttled	= task_is_throttled_fair,
 #endif
-
-#ifdef CONFIG_UCLAMP_TASK
-	.uclamp_enabled		= 1,
-#endif
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3261b067b67e..86733bed0e3c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2681,10 +2681,6 @@ DEFINE_SCHED_CLASS(rt) = {
 #ifdef CONFIG_SCHED_CORE
 	.task_is_throttled	= task_is_throttled_rt,
 #endif
-
-#ifdef CONFIG_UCLAMP_TASK
-	.uclamp_enabled		= 1,
-#endif
 };
 
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3ee28822f48f..81578410984c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -913,46 +913,6 @@ extern void rto_push_irq_work_func(struct irq_work *work);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_UCLAMP_TASK
-/*
- * struct uclamp_bucket - Utilization clamp bucket
- * @value: utilization clamp value for tasks on this clamp bucket
- * @tasks: number of RUNNABLE tasks on this clamp bucket
- *
- * Keep track of how many tasks are RUNNABLE for a given utilization
- * clamp value.
- */
-struct uclamp_bucket {
-	unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
-	unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
-};
-
-/*
- * struct uclamp_rq - rq's utilization clamp
- * @value: currently active clamp values for a rq
- * @bucket: utilization clamp buckets affecting a rq
- *
- * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
- * A clamp value is affecting a rq when there is at least one task RUNNABLE
- * (or actually running) with that value.
- *
- * There are up to UCLAMP_CNT possible different clamp values, currently there
- * are only two: minimum utilization and maximum utilization.
- *
- * All utilization clamping values are MAX aggregated, since:
- * - for util_min: we want to run the CPU at least at the max of the minimum
- *   utilization required by its currently RUNNABLE tasks.
- * - for util_max: we want to allow the CPU to run up to the max of the
- *   maximum utilization allowed by its currently RUNNABLE tasks.
- *
- * Since on each system we expect only a limited number of different
- * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
- * the metrics required to compute all the per-rq utilization clamp values.
- */
-struct uclamp_rq {
-	unsigned int value;
-	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
-};
-
 DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -995,11 +955,7 @@ struct rq {
 	u64			nr_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
-	/* Utilization clamp values based on CPU's RUNNABLE tasks */
-	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
-	unsigned int		uclamp_flags;
 	unsigned int		root_cfs_util_uclamp;
-#define UCLAMP_FLAG_IDLE 0x01
 #endif
 
 	struct cfs_rq		cfs;
@@ -2247,11 +2203,6 @@ struct affinity_context {
 extern s64 update_curr_common(struct rq *rq);
 
 struct sched_class {
-
-#ifdef CONFIG_UCLAMP_TASK
-	int uclamp_enabled;
-#endif
-
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)   (struct rq *rq);
@@ -3042,23 +2993,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 #ifdef CONFIG_UCLAMP_TASK
 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
 
-static inline unsigned long uclamp_rq_get(struct rq *rq,
-					  enum uclamp_id clamp_id)
-{
-	return READ_ONCE(rq->uclamp[clamp_id].value);
-}
-
-static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
-				 unsigned int value)
-{
-	WRITE_ONCE(rq->uclamp[clamp_id].value, value);
-}
-
-static inline bool uclamp_rq_is_idle(struct rq *rq)
-{
-	return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
-}
-
 /* Is the rq being capped/throttled by uclamp_max? */
 static inline bool uclamp_rq_is_capped(struct rq *rq)
 {
@@ -3147,25 +3081,6 @@ static inline bool uclamp_is_used(void)
 	return false;
 }
 
-static inline unsigned long uclamp_rq_get(struct rq *rq,
-					  enum uclamp_id clamp_id)
-{
-	if (clamp_id == UCLAMP_MIN)
-		return 0;
-
-	return SCHED_CAPACITY_SCALE;
-}
-
-static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
-				 unsigned int value)
-{
-}
-
-static inline bool uclamp_rq_is_idle(struct rq *rq)
-{
-	return false;
-}
-
 static inline unsigned long root_cfs_util(struct rq *rq)
 {
 	return READ_ONCE(rq->cfs.avg.util_avg);
-- 
2.34.1

From nobody Sat Feb 7 13:43:40 2026
From: Hongyan Xia 
To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann ,
    Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman ,
    Daniel Bristot de Oliveira , Valentin Schneider 
Cc: Qais Yousef , Morten Rasmussen , Lukasz Luba , Christian Loehle ,
    linux-kernel@vger.kernel.org, David Dai , Saravana Kannan 
Subject: [RFC PATCH v2 7/7] sched/uclamp: Simplify uclamp_eff_value()
Date: Thu, 1 Feb 2024 13:12:03 +0000
Message-Id: <215a6377e1aef10460d1aa870fb06774680925c5.1706792708.git.hongyan.xia2@arm.com>

The commit "sched: Remove all uclamp bucket logic" removes the
uclamp_rq_{inc/dec}() functions, so now p->uclamp contains the correct
values all the time after a uclamp_update_active() call, and there's no
need to toggle the boolean `active` after an update. As a result, this
function is fairly simple now and can live as a static inline function.
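To make the decision order described above easy to see at a glance, here is a
minimal stand-alone sketch in plain C. It is an illustration only, not kernel
code: `task_model`, `uclamp_se_model` and `eff_value_model` are made-up
stand-ins for task_struct and the real uclamp_eff_value(); the authoritative
definition is the static inline added to kernel/sched/sched.h in the diff
below.

```c
/* Stand-alone model of the simplified uclamp_eff_value() decision order. */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
enum uclamp_id { UCLAMP_MIN, UCLAMP_MAX, UCLAMP_CNT };

struct uclamp_se_model { unsigned int value; bool active; };
struct task_model { struct uclamp_se_model uclamp[UCLAMP_CNT]; };

static bool uclamp_used = true;			/* stands in for the static key */

static unsigned long eff_value_model(const struct task_model *p, enum uclamp_id id)
{
	/* The cached effective value is valid whenever it is marked active... */
	if (uclamp_used && p->uclamp[id].active)
		return p->uclamp[id].value;

	/* ...otherwise fall back to the "no clamp" defaults. */
	return id == UCLAMP_MIN ? 0 : SCHED_CAPACITY_SCALE;
}

int main(void)
{
	struct task_model p = { .uclamp = { [UCLAMP_MIN] = { 256, true },
					    [UCLAMP_MAX] = { 0, false } } };

	printf("min=%lu max=%lu\n",
	       eff_value_model(&p, UCLAMP_MIN),	/* 256: active, use it */
	       eff_value_model(&p, UCLAMP_MAX));	/* 1024: inactive, default */
	return 0;
}
```
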
Signed-off-by: Hongyan Xia 
---
 kernel/sched/core.c  | 13 ++++---------
 kernel/sched/sched.h | 14 ++++++++++++--
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a3b36adc4dcc..f5f5f056525c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1499,21 +1499,15 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
 	return uc_req;
 }
 
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
-{
-	if (!uclamp_is_used() || !p->uclamp[clamp_id].active)
-		return uclamp_none(clamp_id);
-
-	return p->uclamp[clamp_id].value;
-}
-
 static inline void
 uclamp_update_active_nolock(struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
-	for_each_clamp_id(clamp_id)
+	for_each_clamp_id(clamp_id) {
 		p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+		p->uclamp[clamp_id].active = 1;
+	}
 }
 
 static inline void
@@ -1773,6 +1767,7 @@ static void uclamp_fork(struct task_struct *p)
 		for_each_clamp_id(clamp_id) {
 			uclamp_se_set(&p->uclamp_req[clamp_id],
 				      uclamp_none(clamp_id), false);
+			p->uclamp[clamp_id].active = 0;
 		}
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 81578410984c..2caefc3344bb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2991,8 +2991,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 #endif
 
 #ifdef CONFIG_UCLAMP_TASK
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
-
 /* Is the rq being capped/throttled by uclamp_max? */
 static inline bool uclamp_rq_is_capped(struct rq *rq)
 {
@@ -3022,6 +3020,18 @@ static inline bool uclamp_is_used(void)
 	return static_branch_likely(&sched_uclamp_used);
 }
 
+static inline unsigned long uclamp_eff_value(struct task_struct *p,
+					     enum uclamp_id clamp_id)
+{
+	if (uclamp_is_used() && p->uclamp[clamp_id].active)
+		return p->uclamp[clamp_id].value;
+
+	if (clamp_id == UCLAMP_MIN)
+		return 0;
+
+	return SCHED_CAPACITY_SCALE;
+}
+
 static inline unsigned long root_cfs_util(struct rq *rq)
 {
 	return READ_ONCE(rq->root_cfs_util_uclamp);
-- 
2.34.1
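Taken together, patches 6/7 and 7/7 replace per-rq bucket aggregation at
enqueue/dequeue time with a per-task cache of effective clamp values that is
refreshed only when something uclamp-related actually changes. The sketch
below is a user-space model of that flow. It assumes deliberately simplified
restriction rules (the real ones live in uclamp_eff_get() and the cgroup
code), and every name in it is made up for illustration.

```c
/*
 * Illustrative model (not kernel code): effective clamps are recomputed
 * once per uclamp-affecting event and merely read afterwards, instead of
 * being re-aggregated through per-rq buckets on every enqueue/dequeue.
 */
#include <stdio.h>

#define CAPACITY 1024
enum clamp_id { CL_MIN, CL_MAX, CL_CNT };

struct task_model {
	unsigned int req[CL_CNT];	/* what the task asked for */
	unsigned int eff[CL_CNT];	/* cached effective values */
};

static unsigned int group_limit[CL_CNT] = { 0, CAPACITY };

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

/*
 * Called once per uclamp-affecting event (in the kernel: sched_setattr(),
 * a task-group attribute write, or a change of the system-wide defaults).
 * Enqueue and dequeue no longer touch these values at all.
 */
static void recompute_effective(struct task_model *p)
{
	p->eff[CL_MIN] = min_u(p->req[CL_MIN], group_limit[CL_MAX]);
	p->eff[CL_MAX] = min_u(p->req[CL_MAX], group_limit[CL_MAX]);
}

/* Wakeup/placement paths only read the cached result. */
static unsigned int clamp_util(const struct task_model *p, unsigned int util)
{
	if (util < p->eff[CL_MIN])
		return p->eff[CL_MIN];
	if (util > p->eff[CL_MAX])
		return p->eff[CL_MAX];
	return util;
}

int main(void)
{
	struct task_model p = { .req = { 300, CAPACITY } };

	recompute_effective(&p);			/* e.g. after sched_setattr() */
	printf("clamped util: %u\n", clamp_util(&p, 100));	/* boosted to 300 */

	group_limit[CL_MAX] = 200;			/* cgroup cap changes... */
	recompute_effective(&p);			/* ...recompute once, not per enqueue */
	printf("clamped util: %u\n", clamp_util(&p, 100));	/* capped to 200 */
	return 0;
}
```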