From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 353F8205E36 for ; Tue, 4 Mar 2025 14:23:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098212; cv=none; b=jWBBZcCu9nBN5xTRPHEWWj53Vzaqt5L7X6X1IUNjKnOBrOGIsghNjFuXfFxWtc74B9ACn3gGHgBhcRKOjfIx3PXBTJmPbnns5nTLuvqywi/pg3jGZzuvqZo/TCgAOroFx22ioyBneKW/d08FDMEqf+kymY5XjuDsThxGPjUuk+k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098212; c=relaxed/simple; bh=Hq2UWhVleNV3rRPtc00pqkqQi0ddlRTPYKZ0ga1YvZQ=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=cLbqO4ompGwNPGuIiDuwgcKh5n8Gg/CB8nREj6gPjg90sNwnSrwd7i6KihgVsHgx1xJwhtmGSIZrqE5ZWpfHgDaYkI83FEKuIqjtQ8F+YiDHwGtlRGXIJtP10BmqPACyUEnv0i3kcBaeRn3Gg+wl3X8a0i82mMqBu/BU4kbmfAU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 1618B1007; Tue, 4 Mar 2025 06:23:43 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 9B74E3F66E; Tue, 4 Mar 2025 06:23:27 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org, Hongyan Xia Subject: [PATCH v2 1/8] Revert "sched/uclamp: Set max_spare_cap_cpu even if max_spare_cap is 0" Date: Tue, 4 Mar 2025 14:23:08 +0000 Message-Id: <4d7dc8f07bede735d307969ca58ed145ff2254eb.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Hongyan Xia That commit creates further problems because 0 spare capacity can either be a real indication that the CPU is maxed out, or a sign that the CPU is UCLAMP_MAX throttled, but we end up giving all of them a chance, which can result in bogus energy calculations. It also tends to schedule tasks on the same CPU and requires load balancing patches. Sum aggregation solves these problems, so the reverted commit is no longer needed. This reverts commit 6b00a40147653c8ea748e8f4396510f252763364.
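As a rough illustration of the ambiguity above (a sketch only, not kernel code; the helper name and the scenario are invented), the spare capacity that find_energy_efficient_cpu() compares is essentially the CPU capacity minus its utilization, so both of the cases described read as zero:

/*
 * Sketch: why a spare capacity of 0 is ambiguous under max aggregation.
 * feec() derives spare capacity roughly as capacity_of(cpu) minus the
 * CPU utilization seen with the task placed there.
 */
static unsigned long spare_cap_sketch(unsigned long capacity,
				      unsigned long util)
{
	if (util < capacity)
		return capacity - util;
	return 0;	/* saturated... or merely clamped */
}

/*
 * Case A: utilization has genuinely reached capacity; the CPU is maxed
 *         out and placing more work there is pointless.
 * Case B: utilization only reads that high because the running tasks
 *         are UCLAMP_MAX throttled.
 * Both cases return 0 above, yet their energy trade-offs differ, which
 * is where the bogus energy calculations come from.  With sum
 * aggregation, the negative util_avg_bias of capped tasks keeps case B
 * below capacity, so the two cases can be told apart and the reverted
 * workaround becomes unnecessary.
 */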
Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 857808da23d8..71fc86eafbd9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8417,10 +8417,11 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) for (; pd; pd =3D pd->next) { unsigned long util_min =3D p_util_min, util_max =3D p_util_max; unsigned long cpu_cap, cpu_actual_cap, util; - long prev_spare_cap =3D -1, max_spare_cap =3D -1; + unsigned long cur_delta, max_spare_cap =3D 0; unsigned long rq_util_min, rq_util_max; - unsigned long cur_delta, base_energy; + unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; + unsigned long base_energy; int fits, max_fits =3D -1; =20 cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -8482,7 +8483,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) prev_spare_cap =3D cpu_cap; prev_fits =3D fits; } else if ((fits > max_fits) || - ((fits =3D=3D max_fits) && ((long)cpu_cap > max_spare_cap))) { + ((fits =3D=3D max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -8494,7 +8495,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) } } =20 - if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap =3D=3D 0) continue; =20 eenv_pd_busy_time(&eenv, cpus, p); @@ -8502,7 +8503,7 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) base_energy =3D compute_energy(&eenv, pd, cpus, p, -1); =20 /* Evaluate the energy impact of using prev_cpu. */ - if (prev_spare_cap > -1) { + if (prev_spare_cap > 0) { prev_delta =3D compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 24FAE2066DE for ; Tue, 4 Mar 2025 14:23:32 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098215; cv=none; b=K+8z6rANozA8V+KLtvVssMVoNemALQQMv2i7QxaOW3nEweXEOb4HW+dsnUQUw3ey6b4hfqQeecCMCqwQGCMTP5ooPJYtL60p3jt0hYGRXTN5LHUYhc4rrTmFEOnbfjp+pc8QRIuwULkFqzFpQe2h1qoquNcX6Gd4UjhwtUZtHcc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098215; c=relaxed/simple; bh=qwOHIjJoK+nAIUIyVranFczSuwcOV87HZG5JpsHbf5M=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=S+cYLft7gsajIjcjrtcVUsg3ej+Sb4EiUPMY8xhtWni1uWE+fj8qBs+A/8ov0oG72rEANJZgTFY/BiPo/C9FSm6+4Wa4FH0apenTnDGtFcSrcePwHLJ/MD3CBG8phapsgZl8Luk1bZFAg5pCR4Aveq93FEu1c08/7VV/ppgaMqc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 016241C25; Tue, 4 Mar 2025 06:23:46 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) 
with ESMTPSA id A15093F66E; Tue, 4 Mar 2025 06:23:30 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org Subject: [PATCH v2 2/8] sched/uclamp: Track a new util_avg_bias signal Date: Tue, 4 Mar 2025 14:23:09 +0000 Message-Id: <24dd7c5800cfca1e7c63e4fab66338f3bdbb1aeb.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add a util_avg_bias signal in sched_avg, which is obtained by: util_avg_bias =3D clamp(util_avg, uclamp_min, uclamp_max) - util_avg The task utilization after considering uclamp is; util_avg_uclamp =3D util_avg + util_avg_bias We then sum up all biases on the same rq and use the total bias to bias the rq utilization. This is the core idea of uclamp sum aggregation. The rq utilization will be rq_util_avg_uclamp =3D rq_util_avg + total_util_avg_bias Signed-off-by: Hongyan Xia --- include/linux/sched.h | 3 ++- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++++++ kernel/sched/pelt.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 24 ++++++++++++++++++++++++ 5 files changed, 97 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d..1f3b06aa024d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -487,7 +487,8 @@ struct sched_avg { u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; - unsigned long util_avg; + unsigned int util_avg; + int util_avg_bias; unsigned int util_est; } ____cacheline_aligned; =20 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ef047add7f9e..264ee83958b5 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -853,7 +853,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct c= fs_rq *cfs_rq) cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", cfs_rq->avg.runnable_avg); - SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + SEQ_printf(m, " .%-30s: %u\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est", cfs_rq->avg.util_est); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 71fc86eafbd9..438755f55624 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1131,6 +1131,7 @@ void post_init_entity_util_avg(struct task_struct *p) } =20 sa->runnable_avg =3D sa->util_avg; + sa->util_avg_bias =3D 0; } =20 #else /* !CONFIG_SMP */ @@ -4852,6 +4853,32 @@ static inline unsigned long task_util_est(struct tas= k_struct *p) return max(task_util(p), _task_util_est(p)); } =20 +#ifdef CONFIG_UCLAMP_TASK +static inline long task_util_bias(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg_bias); +} + +static inline unsigned long task_util_uclamp(struct task_struct *p) +{ + long ret =3D task_util(p); + + ret +=3D task_util_bias(p); + + return max(ret, 0L); +} +#else +static inline long task_util_bias(struct task_struct *p) +{ + return 0; +} + +static inline unsigned long task_util_uclamp(struct task_struct *p) +{ + return task_util(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -7027,6 +7054,10 @@ 
enqueue_task_fair(struct rq *rq, struct task_struct = *p, int flags) =20 /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); + util_bias_enqueue(rq, p); + /* XXX: We should skip the update above and only do it once here. */ + if (task_util_bias(p) > 0) + cpufreq_update_util(rq, 0); =20 /* * Since new tasks are assigned an initial util_avg equal to @@ -7150,6 +7181,8 @@ static int dequeue_entities(struct rq *rq, struct sch= ed_entity *se, int flags) } =20 sub_nr_running(rq, h_nr_queued); + if (p) + util_bias_dequeue(rq, p); =20 if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deff..f38abe6f0b8b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -266,6 +266,39 @@ ___update_load_avg(struct sched_avg *sa, unsigned long= load) WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } =20 +#ifdef CONFIG_UCLAMP_TASK +/* avg must belong to the queue this se is on. */ +static void util_bias_update(struct task_struct *p) +{ + unsigned int util, uclamp_min, uclamp_max; + struct rq *rq; + int old, new; + + util =3D READ_ONCE(p->se.avg.util_avg); + uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); + uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); + /* + * uclamp_max at the max value means there is no uclamp_max, and should + * not have any clamping effect at all here. + */ + if (uclamp_max =3D=3D SCHED_CAPACITY_SCALE) + uclamp_max =3D UINT_MAX; + old =3D READ_ONCE(p->se.avg.util_avg_bias); + new =3D (int)clamp(util, uclamp_min, uclamp_max) - (int)util; + + WRITE_ONCE(p->se.avg.util_avg_bias, new); + if (!p->se.on_rq) + return; + rq =3D task_rq(p); + WRITE_ONCE(rq->cfs.avg.util_avg_bias, + READ_ONCE(rq->cfs.avg.util_avg_bias) + new - old); +} +#else /* !CONFIG_UCLAMP_TASK */ +static void util_bias_update(struct task_struct *p) +{ +} +#endif + /* * sched_entity: * @@ -296,6 +329,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_= entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se)); + if (entity_is_task(se)) + util_bias_update(task_of(se)); trace_pelt_se_tp(se); return 1; } @@ -310,6 +345,8 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq= , struct sched_entity *se =20 ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); + if (entity_is_task(se)) + util_bias_update(task_of(se)); trace_pelt_se_tp(se); return 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab16d3d0e51c..74363bc74e23 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3468,6 +3468,22 @@ uclamp_se_set(struct uclamp_se *uc_se, unsigned int = value, bool user_defined) uc_se->user_defined =3D user_defined; } =20 +static inline void util_bias_enqueue(struct rq *rq, struct task_struct *p) +{ + int rq_val =3D READ_ONCE(rq->cfs.avg.util_avg_bias); + int p_val =3D READ_ONCE(p->se.avg.util_avg_bias); + + WRITE_ONCE(rq->cfs.avg.util_avg_bias, rq_val + p_val); +} + +static inline void util_bias_dequeue(struct rq *rq, struct task_struct *p) +{ + int rq_val =3D READ_ONCE(rq->cfs.avg.util_avg_bias); + int p_val =3D READ_ONCE(p->se.avg.util_avg_bias); + + WRITE_ONCE(rq->cfs.avg.util_avg_bias, rq_val - p_val); +} + #else /* !CONFIG_UCLAMP_TASK: */ =20 static inline unsigned long @@ -3505,6 +3521,14 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return false; } =20 +static inline void util_bias_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void util_bias_dequeue(struct rq 
*rq, struct task_struct *p) +{ +} + #endif /* !CONFIG_UCLAMP_TASK */ =20 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 6F4F0206F01 for ; Tue, 4 Mar 2025 14:23:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098217; cv=none; b=eo8TKRTsH99GeywJnKxkAoIWd6owzwcgya6eJ75rNxemAZDD2iBGhKdLj/ZDeL1hQO4IL5MbhxYGQsC/oA7LfJX+o8j4fWRlYGyg+N7bb7BqH75OqD2gUQPimTzTefFXZ9d9iIHGwwujZ8HUtgH9VAqiYTYcqfZV4sJ2C1IMjS0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098217; c=relaxed/simple; bh=gF/kBpcoYSDf4gxMDlpN07XcjBl9f1gdlaUeSadBZ5o=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=Z31CCA4vQ3YgBBvWFPwFhHYWND0R1+yGKqOMGYmYfAwwOQeJM87r3ac0X9nR5x1ehLvZrkg28zs3VX3GiIoZ+J25jEJhByD+kIAI8H+qL5tx/hNJorWage+Ke88YA+jDMgVpfrFH/J4UivCG+XkPa10JZG/jbMPf29CtIOFDA30= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 6F52EFEC; Tue, 4 Mar 2025 06:23:48 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 1AF953F66E; Tue, 4 Mar 2025 06:23:33 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org Subject: [PATCH v2 3/8] sched/uclamp: Add util_est_uclamp Date: Tue, 4 Mar 2025 14:23:10 +0000 Message-Id: <723859b17ea463f91e04c87696b6d38ea2839deb.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The new util_est_uclamp is essentially clamp(util_est, min, max) and follows how util_est operates. 
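Put differently, here is a minimal sketch (not the kernel implementation; the function name is made up) of the per-task value maintained by this patch: util_est_update() writes back the EWMA-based estimate clamped into the task's effective uclamp range, and the rq-level signal is kept as a plain sum of the enqueued tasks' values, mirroring util_est.

/* Sketch of the value written back by util_est_update(). */
static unsigned int util_est_uclamp_sketch(unsigned int util_est,
					   unsigned int uclamp_min,
					   unsigned int uclamp_max)
{
	if (util_est < uclamp_min)
		return uclamp_min;
	if (util_est > uclamp_max)
		return uclamp_max;
	return util_est;
}

/*
 * On enqueue the task's value is added to cfs_rq->avg.util_est_uclamp,
 * and on dequeue it is subtracted again, exactly like util_est, so the
 * rq-level estimate is simply the sum of the clamped per-task estimates.
 */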
Signed-off-by: Hongyan Xia --- include/linux/sched.h | 1 + kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1f3b06aa024d..a4bdfa1d6be1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -490,6 +490,7 @@ struct sched_avg { unsigned int util_avg; int util_avg_bias; unsigned int util_est; + unsigned int util_est_uclamp; } ____cacheline_aligned; =20 /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 438755f55624..e9aa93f99a4e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4867,6 +4867,16 @@ static inline unsigned long task_util_uclamp(struct = task_struct *p) =20 return max(ret, 0L); } + +static inline unsigned long _task_util_est_uclamp(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_est_uclamp); +} + +static inline unsigned long task_util_est_uclamp(struct task_struct *p) +{ + return max(task_util_uclamp(p), _task_util_est_uclamp(p)); +} #else static inline long task_util_bias(struct task_struct *p) { @@ -4877,6 +4887,16 @@ static inline unsigned long task_util_uclamp(struct = task_struct *p) { return task_util(p); } + +static inline unsigned long _task_util_est_uclamp(struct task_struct *p) +{ + return _task_util_est(p); +} + +static inline unsigned long task_util_est_uclamp(struct task_struct *p) +{ + return task_util_est(p); +} #endif =20 static inline void util_est_enqueue(struct cfs_rq *cfs_rq, @@ -4891,6 +4911,9 @@ static inline void util_est_enqueue(struct cfs_rq *cf= s_rq, enqueued =3D cfs_rq->avg.util_est; enqueued +=3D _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est, enqueued); + enqueued =3D cfs_rq->avg.util_est_uclamp; + enqueued +=3D _task_util_est_uclamp(p); + WRITE_ONCE(cfs_rq->avg.util_est_uclamp, enqueued); =20 trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4907,6 +4930,9 @@ static inline void util_est_dequeue(struct cfs_rq *cf= s_rq, enqueued =3D cfs_rq->avg.util_est; enqueued -=3D min_t(unsigned int, enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est, enqueued); + enqueued =3D cfs_rq->avg.util_est_uclamp; + enqueued -=3D _task_util_est_uclamp(p); + WRITE_ONCE(cfs_rq->avg.util_est_uclamp, enqueued); =20 trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4994,6 +5020,10 @@ static inline void util_est_update(struct cfs_rq *cf= s_rq, ewma -=3D last_ewma_diff; ewma >>=3D UTIL_EST_WEIGHT_SHIFT; done: + WRITE_ONCE(p->se.avg.util_est_uclamp, + clamp(ewma, + (unsigned int)uclamp_eff_value(p, UCLAMP_MIN), + (unsigned int)uclamp_eff_value(p, UCLAMP_MAX))); ewma |=3D UTIL_AVG_UNCHANGED; WRITE_ONCE(p->se.avg.util_est, ewma); =20 --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id F0863207667; Tue, 4 Mar 2025 14:23:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098221; cv=none; b=A0hdJccolSXALkAelBNKSDD6ISEh8UFA8W+xiu3aUzihohxwXBaFu5E0U80kH0s47WOTagTgAqcsggJyOGniqAEa6fzgTuykXnlJaAb96aqO/XWqqyVuT0xfVj2V8UzTqnjs5p2/r8fEztbenD+SB0qS3tAV+vZhFw/1U4WbNvg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098221; c=relaxed/simple; bh=8e9RsqEtg4IZ2v/cKPYsvbcbI56XhmSnJzHdsycy+Lo=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; 
b=KMwNP4YWzTtG5fEuO/5VHsICNiTRYXAKngZSTi5Hm+RalbmEA/7NxuxRC9jloNlwYzatNpDRcEY56P2fRzGo63WUvYY11yTBMhtZSr2gKwiBQdHUJ7FROL1idY2Fbxg72m6c/p/Io3kIUF3GIMfUSDV2uky4NtcL9+FRbZkZ5k0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id E7CC5FEC; Tue, 4 Mar 2025 06:23:51 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 127E03F66E; Tue, 4 Mar 2025 06:23:35 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , "Rafael J. Wysocki" , Viresh Kumar , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org, linux-pm@vger.kernel.org Subject: [PATCH v2 4/8] sched/fair: Use util biases for utilization and frequency Date: Tue, 4 Mar 2025 14:23:11 +0000 Message-Id: <6bc37fbf6a81f8fdd1a7a1b43bc82b0190a948ea.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Use the new util_avg_bias for task and runqueue utilization. We also maintain separate util_est and util_est_uclamp signals. Now that we have the uclamp sum aggregated CFS util value, we do not need to consult uclamp buckets to know how the frequency should be clamped. We simply look at the aggregated top level rq->cfs.avg.util_avg + rq->cfs.avg.util_avg_bias and rq->cfs.avg.util_est_uclamp to know what frequency to choose and how to place tasks. Signed-off-by: Hongyan Xia --- kernel/sched/cpufreq_schedutil.c | 6 +- kernel/sched/fair.c | 296 +++++++++++-------------------- kernel/sched/sched.h | 19 +- 3 files changed, 101 insertions(+), 220 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedu= til.c index 1a19d69b91ed..7b1616b139f6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -200,7 +200,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu, un= signed long boost) unsigned long min, max, util =3D scx_cpuperf_target(sg_cpu->cpu); =20 if (!scx_switched_all()) - util +=3D cpu_util_cfs_boost(sg_cpu->cpu); + util +=3D cpu_util_cfs_boost_uclamp(sg_cpu->cpu); util =3D effective_cpu_util(sg_cpu->cpu, util, &min, &max); util =3D max(util, boost); sg_cpu->bw_min =3D min; @@ -340,10 +340,6 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) if (scx_switched_all()) return false; =20 - /* if capped by uclamp_max, always update to be in compliance */ - if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) - return false; - /* * Maintain the frequency if the CPU has not been idle recently, as * reduction is likely to be premature. 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9aa93f99a4e..3d91dbd19a85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4877,6 +4877,15 @@ static inline unsigned long task_util_est_uclamp(str= uct task_struct *p) { return max(task_util_uclamp(p), _task_util_est_uclamp(p)); } + +static inline unsigned long root_cfs_util_uclamp(struct rq *rq) +{ + long ret =3D READ_ONCE(rq->cfs.avg.util_avg); + + ret +=3D READ_ONCE(rq->cfs.avg.util_avg_bias); + + return max(ret, 0L); +} #else static inline long task_util_bias(struct task_struct *p) { @@ -4897,6 +4906,11 @@ static inline unsigned long task_util_est_uclamp(str= uct task_struct *p) { return task_util_est(p); } + +static inline unsigned long root_cfs_util_uclamp(struct rq *rq) +{ + return READ_ONCE(rq->cfs.avg.util_avg); +} #endif =20 static inline void util_est_enqueue(struct cfs_rq *cfs_rq, @@ -5039,135 +5053,24 @@ static inline unsigned long get_actual_cpu_capacit= y(int cpu) return capacity; } =20 -static inline int util_fits_cpu(unsigned long util, - unsigned long uclamp_min, - unsigned long uclamp_max, - int cpu) +static inline int util_fits_cpu(unsigned long util, int cpu) { unsigned long capacity =3D capacity_of(cpu); - unsigned long capacity_orig; - bool fits, uclamp_max_fits; - - /* - * Check if the real util fits without any uclamp boost/cap applied. - */ - fits =3D fits_capacity(util, capacity); - - if (!uclamp_is_used()) - return fits; - - /* - * We must use arch_scale_cpu_capacity() for comparing against uclamp_min= and - * uclamp_max. We only care about capacity pressure (by using - * capacity_of()) for comparing against the real util. - * - * If a task is boosted to 1024 for example, we don't want a tiny - * pressure to skew the check whether it fits a CPU or not. - * - * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), = it - * should fit a little cpu even if there's some pressure. - * - * Only exception is for HW or cpufreq pressure since it has a direct imp= act - * on available OPP of the system. - * - * We honour it for uclamp_min only as a drop in performance level - * could result in not getting the requested minimum performance level. - * - * For uclamp_max, we can tolerate a drop in performance level as the - * goal is to cap the task. So it's okay if it's getting less. - */ - capacity_orig =3D arch_scale_cpu_capacity(cpu); - - /* - * We want to force a task to fit a cpu as implied by uclamp_max. - * But we do have some corner cases to cater for.. - * - * - * C=3Dz - * | ___ - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | - * | | | | | | | (util somewhere in this region) - * | | | | | | | - * | | | | | | | - * +---------------------------------------- - * CPU0 CPU1 CPU2 - * - * In the above example if a task is capped to a specific performance - * point, y, then when: - * - * * util =3D 80% of x then it does not fit on CPU0 and should migrate - * to CPU1 - * * util =3D 80% of y then it is forced to fit on CPU1 to honour - * uclamp_max request. - * - * which is what we're enforcing here. A task always fits if - * uclamp_max <=3D capacity_orig. But when uclamp_max > capacity_orig, - * the normal upmigration rules should withhold still. - * - * Only exception is when we are on max capacity, then we need to be - * careful not to block overutilized state. This is so because: - * - * 1. There's no concept of capping at max_capacity! We can't go - * beyond this performance level anyway. - * 2. 
The system is being saturated when we're operating near - * max capacity, it doesn't make sense to block overutilized. - */ - uclamp_max_fits =3D (capacity_orig =3D=3D SCHED_CAPACITY_SCALE) && (uclam= p_max =3D=3D SCHED_CAPACITY_SCALE); - uclamp_max_fits =3D !uclamp_max_fits && (uclamp_max <=3D capacity_orig); - fits =3D fits || uclamp_max_fits; =20 - /* - * - * C=3Dz - * | ___ (region a, capped, util >=3D= uclamp_max) - * | C=3Dy | | - * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max - * | C=3Dx | | | | - * | ___ | | | | (region b, uclamp_min <=3D u= til <=3D uclamp_max) - * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min - * | | | | | | | - * | | | | | | | (region c, boosted, util < u= clamp_min) - * +---------------------------------------- - * CPU0 CPU1 CPU2 - * - * a) If util > uclamp_max, then we're capped, we don't care about - * actual fitness value here. We only care if uclamp_max fits - * capacity without taking margin/pressure into account. - * See comment above. - * - * b) If uclamp_min <=3D util <=3D uclamp_max, then the normal - * fits_capacity() rules apply. Except we need to ensure that we - * enforce we remain within uclamp_max, see comment above. - * - * c) If util < uclamp_min, then we are boosted. Same as (b) but we - * need to take into account the boosted value fits the CPU without - * taking margin/pressure into account. - * - * Cases (a) and (b) are handled in the 'fits' variable already. We - * just need to consider an extra check for case (c) after ensuring we - * handle the case uclamp_min > uclamp_max. - */ - uclamp_min =3D min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && - (uclamp_min > get_actual_cpu_capacity(cpu))) - return -1; + if (fits_capacity(util, capacity)) + return 1; =20 - return fits; + return 0; } =20 static inline int task_fits_cpu(struct task_struct *p, int cpu) { - unsigned long uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); - unsigned long uclamp_max =3D uclamp_eff_value(p, UCLAMP_MAX); - unsigned long util =3D task_util_est(p); + unsigned long util_uclamp =3D task_util_est_uclamp(p); /* * Return true only if the cpu fully fits the task requirements, which * include the utilization but also the performance hints. 
*/ - return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); + return (util_fits_cpu(util_uclamp, cpu) > 0); } =20 static inline void update_misfit_status(struct task_struct *p, struct rq *= rq) @@ -6886,18 +6789,18 @@ static inline void hrtick_update(struct rq *rq) #endif =20 #ifdef CONFIG_SMP +static unsigned long cpu_util_cfs_uclamp(int cpu); + static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min, rq_util_max; + unsigned long util_uclamp; =20 if (!sched_energy_enabled()) return false; =20 - rq_util_min =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - rq_util_max =3D uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + util_uclamp =3D cpu_util_cfs_uclamp(cpu); =20 - /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(util_uclamp, cpu); } =20 /* @@ -7828,7 +7731,7 @@ static int select_idle_cpu(struct task_struct *p, str= uct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int t= arget) { - unsigned long task_util, util_min, util_max, best_cap =3D 0; + unsigned long task_util_uclamp, best_cap =3D 0; int fits, best_fits =3D 0; int cpu, best_cpu =3D -1; struct cpumask *cpus; @@ -7836,9 +7739,7 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); =20 - task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); + task_util_uclamp =3D task_util_est_uclamp(p); =20 for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap =3D capacity_of(cpu); @@ -7846,7 +7747,7 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; =20 - fits =3D util_fits_cpu(task_util, util_min, util_max, cpu); + fits =3D util_fits_cpu(task_util_uclamp, cpu); =20 /* This CPU fits with all requirements */ if (fits > 0) @@ -7874,8 +7775,6 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) } =20 static inline bool asym_fits_cpu(unsigned long util, - unsigned long util_min, - unsigned long util_max, int cpu) { if (sched_asym_cpucap_active()) @@ -7883,7 +7782,7 @@ static inline bool asym_fits_cpu(unsigned long util, * Return true only if the cpu fully fits the task requirements * which include the utilization and the performance hints. 
*/ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return (util_fits_cpu(util, cpu) > 0); =20 return true; } @@ -7895,7 +7794,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) { bool has_idle_core =3D false; struct sched_domain *sd; - unsigned long task_util, util_min, util_max; + unsigned long task_util_uclamp; int i, recent_used_cpu, prev_aff =3D -1; =20 /* @@ -7904,9 +7803,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); - task_util =3D task_util_est(p); - util_min =3D uclamp_eff_value(p, UCLAMP_MIN); - util_max =3D uclamp_eff_value(p, UCLAMP_MAX); + task_util_uclamp =3D task_util_est_uclamp(p); } =20 /* @@ -7915,7 +7812,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) lockdep_assert_irqs_disabled(); =20 if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_cpu(task_util, util_min, util_max, target)) + asym_fits_cpu(task_util_uclamp, target)) return target; =20 /* @@ -7923,7 +7820,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) */ if (prev !=3D target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util_uclamp, prev)) { =20 if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(prev, target)) @@ -7944,7 +7841,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) in_task() && prev =3D=3D smp_processor_id() && this_rq()->nr_running <=3D 1 && - asym_fits_cpu(task_util, util_min, util_max, prev)) { + asym_fits_cpu(task_util_uclamp, prev)) { return prev; } =20 @@ -7956,7 +7853,7 @@ static int select_idle_sibling(struct task_struct *p,= int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cp= u)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && - asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + asym_fits_cpu(task_util_uclamp, recent_used_cpu)) { =20 if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(recent_used_cpu, target)) @@ -8124,16 +8021,67 @@ cpu_util(int cpu, struct task_struct *p, int dst_cp= u, int boost) return min(util, arch_scale_cpu_capacity(cpu)); } =20 +/* This is basically a copy-paste from cpu_util(), but instead using uclam= p values. */ +static unsigned long +cpu_util_uclamp(int cpu, struct task_struct *p, int dst_cpu, int boost) +{ + struct rq *rq =3D cpu_rq(cpu); + struct cfs_rq *cfs_rq =3D &rq->cfs; + unsigned long util =3D root_cfs_util_uclamp(rq); + + if (boost) { + unsigned long runnable =3D READ_ONCE(cfs_rq->avg.runnable_avg); + unsigned long util_raw =3D READ_ONCE(cfs_rq->avg.util_avg); + + util =3D max(util, util_raw ? 
util * runnable / util_raw : 0); + } + + if (p) { + if (task_cpu(p) =3D=3D cpu && !p->se.on_rq) { + util +=3D task_util_bias(p); + if ((long)util < 0) + util =3D 0; + } + if (task_cpu(p) =3D=3D cpu && dst_cpu !=3D cpu) + lsub_positive(&util, task_util_uclamp(p)); + else if (task_cpu(p) !=3D cpu && dst_cpu =3D=3D cpu) + util +=3D task_util_uclamp(p); + } + + if (sched_feat(UTIL_EST)) { + unsigned long util_est =3D READ_ONCE(cfs_rq->avg.util_est_uclamp); + + if (dst_cpu =3D=3D cpu) + util_est +=3D _task_util_est_uclamp(p); + else if (p && unlikely(task_on_rq_queued(p) || current =3D=3D p)) + lsub_positive(&util_est, _task_util_est_uclamp(p)); + + util =3D max(util, util_est); + } + + return min(util, arch_scale_cpu_capacity(cpu)); +} + unsigned long cpu_util_cfs(int cpu) { return cpu_util(cpu, NULL, -1, 0); } =20 -unsigned long cpu_util_cfs_boost(int cpu) +static unsigned long cpu_util_cfs_uclamp(int cpu) +{ + return cpu_util_uclamp(cpu, NULL, -1, 0); +} + +static unsigned long cpu_util_cfs_boost(int cpu) { return cpu_util(cpu, NULL, -1, 1); } =20 +unsigned long cpu_util_cfs_boost_uclamp(int cpu) +{ + return cpu_util_uclamp(cpu, NULL, -1, 1); +} + /* * cpu_util_without: compute cpu utilization without any contributions fro= m *p * @cpu: the CPU which utilization is requested @@ -8206,7 +8154,7 @@ unsigned long effective_cpu_util(int cpu, unsigned lo= ng util_cfs, * steals time to the deadline task. * - The minimum performance requirement for CFS and/or RT. */ - *min =3D max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + *min =3D irq + cpu_bw_dl(rq); =20 /* * When an RT task is runnable and uclamp is not used, we must @@ -8230,7 +8178,7 @@ unsigned long effective_cpu_util(int cpu, unsigned lo= ng util_cfs, * than the actual utilization because of uclamp_max requirements. */ if (max) - *max =3D min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + *max =3D scale; =20 if (util >=3D scale) return scale; @@ -8343,33 +8291,15 @@ eenv_pd_max_util(struct energy_env *eenv, struct cp= umask *pd_cpus, int cpu; =20 for_each_cpu(cpu, pd_cpus) { - struct task_struct *tsk =3D (cpu =3D=3D dst_cpu) ? p : NULL; - unsigned long util =3D cpu_util(cpu, p, dst_cpu, 1); + unsigned long util =3D cpu_util_uclamp(cpu, p, dst_cpu, 1); unsigned long eff_util, min, max; =20 /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the min - * utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. */ eff_util =3D effective_cpu_util(cpu, util, &min, &max); =20 - /* Task's uclamp can modify min and max value */ - if (tsk && uclamp_is_used()) { - min =3D max(min, uclamp_eff_value(p, UCLAMP_MIN)); - - /* - * If there is no active max uclamp constraint, - * directly use task's one, otherwise keep max. - */ - if (uclamp_rq_is_idle(cpu_rq(cpu))) - max =3D uclamp_eff_value(p, UCLAMP_MAX); - else - max =3D max(max, uclamp_eff_value(p, UCLAMP_MAX)); - } - eff_util =3D sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util =3D max(max_util, eff_util); } @@ -8443,8 +8373,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) { struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; - unsigned long p_util_min =3D uclamp_is_used() ? 
uclamp_eff_value(p, UCLAM= P_MIN) : 0; - unsigned long p_util_max =3D uclamp_is_used() ? uclamp_eff_value(p, UCLAM= P_MAX) : 1024; struct root_domain *rd =3D this_rq()->rd; int cpu, best_energy_cpu, target =3D -1; int prev_fits =3D -1, best_fits =3D -1; @@ -8472,16 +8400,14 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) target =3D prev_cpu; =20 sync_entity_load_avg(&p->se); - if (!task_util_est(p) && p_util_min =3D=3D 0) + if (!task_util_est_uclamp(p)) goto unlock; =20 eenv_task_busy_time(&eenv, p, prev_cpu); =20 for (; pd; pd =3D pd->next) { - unsigned long util_min =3D p_util_min, util_max =3D p_util_max; - unsigned long cpu_cap, cpu_actual_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util_uclamp; unsigned long cur_delta, max_spare_cap =3D 0; - unsigned long rq_util_min, rq_util_max; unsigned long prev_spare_cap =3D 0; int max_spare_cap_cpu =3D -1; unsigned long base_energy; @@ -8500,8 +8426,6 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) eenv.pd_cap =3D 0; =20 for_each_cpu(cpu, cpus) { - struct rq *rq =3D cpu_rq(cpu); - eenv.pd_cap +=3D cpu_actual_cap; =20 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -8510,37 +8434,15 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; =20 - util =3D cpu_util(cpu, p, cpu, 0); + util_uclamp =3D cpu_util_uclamp(cpu, p, cpu, 0); cpu_cap =3D capacity_of(cpu); =20 - /* - * Skip CPUs that cannot satisfy the capacity request. - * IOW, placing the task there would make the CPU - * overutilized. Take uclamp into account to see how - * much capacity we can get out of the CPU; this is - * aligned with sched_cpu_util(). - */ - if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { - /* - * Open code uclamp_rq_util_with() except for - * the clamp() part. I.e.: apply max aggregation - * only. util_fits_cpu() logic requires to - * operate on non clamped util but must use the - * max-aggregated uclamp_{min, max}. - */ - rq_util_min =3D uclamp_rq_get(rq, UCLAMP_MIN); - rq_util_max =3D uclamp_rq_get(rq, UCLAMP_MAX); - - util_min =3D max(rq_util_min, p_util_min); - util_max =3D max(rq_util_max, p_util_max); - } - - fits =3D util_fits_cpu(util, util_min, util_max, cpu); - if (!fits) + fits =3D util_fits_cpu(util_uclamp, cpu); + if (fits =3D=3D 1) + lsub_positive(&cpu_cap, util_uclamp); + else continue; =20 - lsub_positive(&cpu_cap, util); - if (cpu =3D=3D prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap =3D cpu_cap; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 74363bc74e23..b50e3d6e79c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3379,7 +3379,7 @@ static inline unsigned long cpu_util_dl(struct rq *rq) =20 =20 extern unsigned long cpu_util_cfs(int cpu); -extern unsigned long cpu_util_cfs_boost(int cpu); +extern unsigned long cpu_util_cfs_boost_uclamp(int cpu); =20 static inline unsigned long cpu_util_rt(struct rq *rq) { @@ -3411,21 +3411,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return rq->uclamp_flags & UCLAMP_FLAG_IDLE; } =20 -/* Is the rq being capped/throttled by uclamp_max? 
*/ -static inline bool uclamp_rq_is_capped(struct rq *rq) -{ - unsigned long rq_util; - unsigned long max_util; - - if (!static_branch_likely(&sched_uclamp_used)) - return false; - - rq_util =3D cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); - max_util =3D READ_ONCE(rq->uclamp[UCLAMP_MAX].value); - - return max_util !=3D SCHED_CAPACITY_SCALE && rq_util >=3D max_util; -} - /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' * by default in the fast path and only gets turned on once userspace perf= orms @@ -3495,8 +3480,6 @@ uclamp_eff_value(struct task_struct *p, enum uclamp_i= d clamp_id) return SCHED_CAPACITY_SCALE; } =20 -static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } - static inline bool uclamp_is_used(void) { return false; --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 514172063E9 for ; Tue, 4 Mar 2025 14:23:42 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098224; cv=none; b=K/oF8FC90YW1BttF+v9N1pfan/+FkuYFCSyz8V/9exMRBWkaGF91LYAxlVPq2i4Vv3Z+5W8EXJsumDRDX21UlRIPVMmwRX/t8kBMlrEzUh/pujtwa5JHfmVa6Jg8su6xg12ps9N1zZI6rdl+/6sbSlBopFYV3zuWmVv8k5oa6Kc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098224; c=relaxed/simple; bh=TEDlHF/i9Vmved/T6z79jyfz+oPYRc/M33scgh/U23Q=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=AS+r6G93SqSy7UwDNNLyk4PnmFigDDSULGj9UuGLYFqg84GHaMtiYNRd9t13cflYo76Pp6Iy9JHxBogUQLm+pFg9q47ENrtavsQ1K5akPvSAbxBsjpnIOWV1dJFqeKSMnOx9B+Xaw4RLogXpoxuGMcMP7kTCBtGf3ggIBLASs/o= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 91976FEC; Tue, 4 Mar 2025 06:23:55 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 2208C3F66E; Tue, 4 Mar 2025 06:23:40 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org Subject: [PATCH v2 5/8] sched/uclamp: Remove all uclamp bucket logic Date: Tue, 4 Mar 2025 14:23:12 +0000 Message-Id: <9d071784c2a0ed26d61df571c0ced1e951ecbe3b.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Also rewrite uclamp_update_active() so that the effective uclamp values are updated every time we change task group properties, change system defaults or a request is issued from userspace. 
This also significantly reduces uclamp overhead because we no longer need to compute effective uclamp values and manipulate buckets every time a task is enqueued or dequeued (in uclamp_rq_{inc/dec}()). TODO: Rewrite documentation to match the new logic. Signed-off-by: Hongyan Xia --- include/linux/sched.h | 4 - init/Kconfig | 32 ----- kernel/sched/core.c | 301 ++-------------------------------------- kernel/sched/fair.c | 4 - kernel/sched/rt.c | 4 - kernel/sched/sched.h | 95 +------------ kernel/sched/syscalls.c | 2 + 7 files changed, 20 insertions(+), 422 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a4bdfa1d6be1..012df2f612d4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -722,9 +722,6 @@ struct sched_dl_entity { }; =20 #ifdef CONFIG_UCLAMP_TASK -/* Number of utilization clamp buckets (shorter alias) */ -#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT - /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se @@ -750,7 +747,6 @@ struct sched_dl_entity { */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); - unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; diff --git a/init/Kconfig b/init/Kconfig index d0d021b3fa3b..6d6d2eaa2963 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -843,38 +843,6 @@ config UCLAMP_TASK enforce or grant any specific bandwidth for tasks. =20 If in doubt, say N. - -config UCLAMP_BUCKETS_COUNT - int "Number of supported utilization clamp buckets" - range 5 20 - default 5 - depends on UCLAMP_TASK - help - Defines the number of clamp buckets to use. The range of each bucket - will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the - number of clamp buckets the finer their granularity and the higher - the precision of clamping aggregation and tracking at run-time. - - For example, with the minimum configuration value we will have 5 - clamp buckets tracking 20% utilization each. A 25% boosted tasks will - be refcounted in the [20..39]% bucket and will set the bucket clamp - effective value to 25%. - If a second 30% boosted task should be co-scheduled on the same CPU, - that task will be refcounted in the same bucket of the first task and - it will boost the bucket clamp effective value to 30%. - The clamp effective value of a bucket is reset to its nominal value - (20% in the example above) when there are no more tasks refcounted in - that bucket. - - An additional boost/capping margin can be added to some tasks. In the - example above the 25% task will be boosted to 30% until it exits the - CPU. If that should be considered not acceptable on certain systems, - it's always possible to reduce the margin by increasing the number of - clamp buckets to trade off used memory for run-time tracking - precision. - - If in doubt, use the default value. - endmenu =20 # diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b00f884701a6..85c69ca7abaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1495,54 +1495,6 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; */ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); =20 -static inline unsigned int -uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, - unsigned int clamp_value) -{ - /* - * Avoid blocked utilization pushing up the frequency when we go - * idle (which drops the max-clamp) by retaining the last known - * max-clamp.
- */ - if (clamp_id =3D=3D UCLAMP_MAX) { - rq->uclamp_flags |=3D UCLAMP_FLAG_IDLE; - return clamp_value; - } - - return uclamp_none(UCLAMP_MIN); -} - -static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_i= d, - unsigned int clamp_value) -{ - /* Reset max-clamp retention only on idle exit */ - if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) - return; - - uclamp_rq_set(rq, clamp_id, clamp_value); -} - -static inline -unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, - unsigned int clamp_value) -{ - struct uclamp_bucket *bucket =3D rq->uclamp[clamp_id].bucket; - int bucket_id =3D UCLAMP_BUCKETS - 1; - - /* - * Since both min and max clamps are max aggregated, find the - * top most bucket with tasks in. - */ - for ( ; bucket_id >=3D 0; bucket_id--) { - if (!bucket[bucket_id].tasks) - continue; - return bucket[bucket_id].value; - } - - /* No tasks -- default clamp values */ - return uclamp_idle_value(rq, clamp_id, clamp_value); -} - static void __uclamp_update_util_min_rt_default(struct task_struct *p) { unsigned int default_util_min; @@ -1598,8 +1550,7 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp= _id clamp_id) } =20 /* - * The effective clamp bucket index of a task depends on, by increasing - * priority: + * The effective uclamp value of a task depends on, by increasing priority: * - the task specific clamp value, when explicitly requested from userspa= ce * - the task group effective clamp value, for tasks not either in the root * group or in an autogroup @@ -1620,202 +1571,23 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_= id clamp_id) =20 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id) { - struct uclamp_se uc_eff; - - /* Task currently refcounted: use back-annotated (effective) value */ - if (p->uclamp[clamp_id].active) - return (unsigned long)p->uclamp[clamp_id].value; - - uc_eff =3D uclamp_eff_get(p, clamp_id); - - return (unsigned long)uc_eff.value; -} - -/* - * When a task is enqueued on a rq, the clamp bucket currently defined by = the - * task's uclamp::bucket_id is refcounted on that rq. This also immediately - * updates the rq's clamp value if required. - * - * Tasks can have a task-specific value requested from user-space, track - * within each bucket the maximum value for tasks refcounted in it. - * This "local max aggregation" allows to track the exact "requested" value - * for each bucket when all its RUNNABLE tasks require the same clamp. - */ -static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - struct uclamp_rq *uc_rq =3D &rq->uclamp[clamp_id]; - struct uclamp_se *uc_se =3D &p->uclamp[clamp_id]; - struct uclamp_bucket *bucket; - - lockdep_assert_rq_held(rq); - - /* Update task effective clamp */ - p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); - - bucket =3D &uc_rq->bucket[uc_se->bucket_id]; - bucket->tasks++; - uc_se->active =3D true; - - uclamp_idle_reset(rq, clamp_id, uc_se->value); - - /* - * Local max aggregation: rq buckets always track the max - * "requested" clamp value of its RUNNABLE tasks. - */ - if (bucket->tasks =3D=3D 1 || uc_se->value > bucket->value) - bucket->value =3D uc_se->value; - - if (uc_se->value > uclamp_rq_get(rq, clamp_id)) - uclamp_rq_set(rq, clamp_id, uc_se->value); -} - -/* - * When a task is dequeued from a rq, the clamp bucket refcounted by the t= ask - * is released. If this is the last task reference counting the rq's max - * active clamp value, then the rq's clamp value is updated. 
- * - * Both refcounted tasks and rq's cached clamp values are expected to be - * always valid. If it's detected they are not, as defensive programming, - * enforce the expected state and warn. - */ -static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - struct uclamp_rq *uc_rq =3D &rq->uclamp[clamp_id]; - struct uclamp_se *uc_se =3D &p->uclamp[clamp_id]; - struct uclamp_bucket *bucket; - unsigned int bkt_clamp; - unsigned int rq_clamp; - - lockdep_assert_rq_held(rq); - - /* - * If sched_uclamp_used was enabled after task @p was enqueued, - * we could end up with unbalanced call to uclamp_rq_dec_id(). - * - * In this case the uc_se->active flag should be false since no uclamp - * accounting was performed at enqueue time and we can just return - * here. - * - * Need to be careful of the following enqueue/dequeue ordering - * problem too - * - * enqueue(taskA) - * // sched_uclamp_used gets enabled - * enqueue(taskB) - * dequeue(taskA) - * // Must not decrement bucket->tasks here - * dequeue(taskB) - * - * where we could end up with stale data in uc_se and - * bucket[uc_se->bucket_id]. - * - * The following check here eliminates the possibility of such race. - */ - if (unlikely(!uc_se->active)) - return; - - bucket =3D &uc_rq->bucket[uc_se->bucket_id]; - - SCHED_WARN_ON(!bucket->tasks); - if (likely(bucket->tasks)) - bucket->tasks--; - - uc_se->active =3D false; - - /* - * Keep "local max aggregation" simple and accept to (possibly) - * overboost some RUNNABLE tasks in the same bucket. - * The rq clamp bucket value is reset to its base value whenever - * there are no more RUNNABLE tasks refcounting it. - */ - if (likely(bucket->tasks)) - return; - - rq_clamp =3D uclamp_rq_get(rq, clamp_id); - /* - * Defensive programming: this should never happen. If it happens, - * e.g. due to future modification, warn and fix up the expected value. - */ - SCHED_WARN_ON(bucket->value > rq_clamp); - if (bucket->value >=3D rq_clamp) { - bkt_clamp =3D uclamp_rq_max_value(rq, clamp_id, uc_se->value); - uclamp_rq_set(rq, clamp_id, bkt_clamp); - } -} - -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) -{ - enum uclamp_id clamp_id; - - /* - * Avoid any overhead until uclamp is actually used by the userspace. - * - * The condition is constructed such that a NOP is generated when - * sched_uclamp_used is disabled. - */ - if (!static_branch_unlikely(&sched_uclamp_used)) - return; - - if (unlikely(!p->sched_class->uclamp_enabled)) - return; - - if (p->se.sched_delayed) - return; - - for_each_clamp_id(clamp_id) - uclamp_rq_inc_id(rq, p, clamp_id); + if (!uclamp_is_used() || !p->uclamp[clamp_id].active) + return uclamp_none(clamp_id); =20 - /* Reset clamp idle holding when there is one RUNNABLE task */ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) - rq->uclamp_flags &=3D ~UCLAMP_FLAG_IDLE; + return p->uclamp[clamp_id].value; } =20 -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +void uclamp_update_active_nolock(struct task_struct *p) { enum uclamp_id clamp_id; =20 - /* - * Avoid any overhead until uclamp is actually used by the userspace. - * - * The condition is constructed such that a NOP is generated when - * sched_uclamp_used is disabled. 
- */ - if (!static_branch_unlikely(&sched_uclamp_used)) - return; - - if (unlikely(!p->sched_class->uclamp_enabled)) - return; - - if (p->se.sched_delayed) - return; - for_each_clamp_id(clamp_id) - uclamp_rq_dec_id(rq, p, clamp_id); -} - -static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, - enum uclamp_id clamp_id) -{ - if (!p->uclamp[clamp_id].active) - return; - - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - - /* - * Make sure to clear the idle flag if we've transiently reached 0 - * active tasks on rq. - */ - if (clamp_id =3D=3D UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) - rq->uclamp_flags &=3D ~UCLAMP_FLAG_IDLE; + p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); } =20 static inline void uclamp_update_active(struct task_struct *p) { - enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; =20 @@ -1829,14 +1601,7 @@ uclamp_update_active(struct task_struct *p) */ rq =3D task_rq_lock(p, &rf); =20 - /* - * Setting the clamp bucket is serialized by task_rq_lock(). - * If the task is not yet RUNNABLE and its task_struct is not - * affecting a valid clamp bucket, the next time it's enqueued, - * it will already see the updated clamp bucket value. - */ - for_each_clamp_id(clamp_id) - uclamp_rq_reinc_id(rq, p, clamp_id); + uclamp_update_active_nolock(p); =20 task_rq_unlock(rq, p, &rf); } @@ -1968,20 +1733,14 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; =20 - /* - * We don't need to hold task_rq_lock() when updating p->uclamp_* here - * as the task is still at its early fork stages. - */ - for_each_clamp_id(clamp_id) - p->uclamp[clamp_id].active =3D false; - - if (likely(!p->sched_reset_on_fork)) - return; - - for_each_clamp_id(clamp_id) { - uclamp_se_set(&p->uclamp_req[clamp_id], - uclamp_none(clamp_id), false); + if (unlikely(p->sched_reset_on_fork)) { + for_each_clamp_id(clamp_id) { + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } } + + uclamp_update_active(p); } =20 static void uclamp_post_fork(struct task_struct *p) @@ -1989,28 +1748,10 @@ static void uclamp_post_fork(struct task_struct *p) uclamp_update_util_min_rt_default(p); } =20 -static void __init init_uclamp_rq(struct rq *rq) -{ - enum uclamp_id clamp_id; - struct uclamp_rq *uc_rq =3D rq->uclamp; - - for_each_clamp_id(clamp_id) { - uc_rq[clamp_id] =3D (struct uclamp_rq) { - .value =3D uclamp_none(clamp_id) - }; - } - - rq->uclamp_flags =3D UCLAMP_FLAG_IDLE; -} - static void __init init_uclamp(void) { struct uclamp_se uc_max =3D {}; enum uclamp_id clamp_id; - int cpu; - - for_each_possible_cpu(cpu) - init_uclamp_rq(cpu_rq(cpu)); =20 for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -2029,8 +1770,6 @@ static void __init init_uclamp(void) } =20 #else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } @@ -2066,11 +1805,6 @@ void enqueue_task(struct rq *rq, struct task_struct = *p, int flags) update_rq_clock(rq); =20 p->sched_class->enqueue_task(rq, p, flags); - /* - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear - * ->sched_delayed. 
- */ - uclamp_rq_inc(rq, p); =20 psi_enqueue(p, flags); =20 @@ -2097,11 +1831,6 @@ inline bool dequeue_task(struct rq *rq, struct task_= struct *p, int flags) =20 psi_dequeue(p, flags); =20 - /* - * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' - * and mark the task ->sched_delayed. - */ - uclamp_rq_dec(rq, p); return p->sched_class->dequeue_task(rq, p, flags); } =20 @@ -9079,6 +8808,7 @@ void sched_move_task(struct task_struct *tsk) =20 sched_change_group(tsk, group); scx_move_task(tsk); + uclamp_update_active_nolock(tsk); =20 if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9225,7 +8955,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_= state *css) if (eff[clamp_id] =3D=3D uc_se[clamp_id].value) continue; uc_se[clamp_id].value =3D eff[clamp_id]; - uc_se[clamp_id].bucket_id =3D uclamp_bucket_id(eff[clamp_id]); clamps |=3D (0x1 << clamp_id); } if (!clamps) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3d91dbd19a85..68e7b1ac7a57 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13626,10 +13626,6 @@ DEFINE_SCHED_CLASS(fair) =3D { #ifdef CONFIG_SCHED_CORE .task_is_throttled =3D task_is_throttled_fair, #endif - -#ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled =3D 1, -#endif }; =20 #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4b8e33c615b1..4cd8d3e06eeb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2649,10 +2649,6 @@ DEFINE_SCHED_CLASS(rt) =3D { #ifdef CONFIG_SCHED_CORE .task_is_throttled =3D task_is_throttled_rt, #endif - -#ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled =3D 1, -#endif }; =20 #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b50e3d6e79c4..cfeaefcec8b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1050,46 +1050,6 @@ extern void rto_push_irq_work_func(struct irq_work *= work); #endif /* CONFIG_SMP */ =20 #ifdef CONFIG_UCLAMP_TASK -/* - * struct uclamp_bucket - Utilization clamp bucket - * @value: utilization clamp value for tasks on this clamp bucket - * @tasks: number of RUNNABLE tasks on this clamp bucket - * - * Keep track of how many tasks are RUNNABLE for a given utilization - * clamp value. - */ -struct uclamp_bucket { - unsigned long value : bits_per(SCHED_CAPACITY_SCALE); - unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); -}; - -/* - * struct uclamp_rq - rq's utilization clamp - * @value: currently active clamp values for a rq - * @bucket: utilization clamp buckets affecting a rq - * - * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. - * A clamp value is affecting a rq when there is at least one task RUNNABLE - * (or actually running) with that value. - * - * There are up to UCLAMP_CNT possible different clamp values, currently t= here - * are only two: minimum utilization and maximum utilization. - * - * All utilization clamping values are MAX aggregated, since: - * - for util_min: we want to run the CPU at least at the max of the minim= um - * utilization required by its currently RUNNABLE tasks. - * - for util_max: we want to allow the CPU to run up to the max of the - * maximum utilization allowed by its currently RUNNABLE tasks. - * - * Since on each system we expect only a limited number of different - * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track - * the metrics required to compute all the per-rq utilization clamp values. 
- */ -struct uclamp_rq { - unsigned int value; - struct uclamp_bucket bucket[UCLAMP_BUCKETS]; -}; - DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ =20 @@ -1126,10 +1086,6 @@ struct rq { u64 nr_switches; =20 #ifdef CONFIG_UCLAMP_TASK - /* Utilization clamp values based on CPU's RUNNABLE tasks */ - struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; - unsigned int uclamp_flags; -#define UCLAMP_FLAG_IDLE 0x01 #endif =20 struct cfs_rq cfs; @@ -2409,11 +2365,6 @@ struct affinity_context { extern s64 update_curr_common(struct rq *rq); =20 struct sched_class { - -#ifdef CONFIG_UCLAMP_TASK - int uclamp_enabled; -#endif - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); @@ -3393,23 +3344,7 @@ static inline bool update_other_load_avgs(struct rq = *rq) { return false; } #ifdef CONFIG_UCLAMP_TASK =20 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id); - -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) -{ - return READ_ONCE(rq->uclamp[clamp_id].value); -} - -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) -{ - WRITE_ONCE(rq->uclamp[clamp_id].value, value); -} - -static inline bool uclamp_rq_is_idle(struct rq *rq) -{ - return rq->uclamp_flags & UCLAMP_FLAG_IDLE; -} +void uclamp_update_active_nolock(struct task_struct *p); =20 /* * When uclamp is compiled in, the aggregation at rq level is 'turned off' @@ -3437,19 +3372,10 @@ static inline unsigned int uclamp_none(enum uclamp_= id clamp_id) return SCHED_CAPACITY_SCALE; } =20 -/* Integer rounded range for each bucket */ -#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP= _BUCKETS) - -static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) -{ - return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCK= ETS - 1); -} - static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defin= ed) { uc_se->value =3D value; - uc_se->bucket_id =3D uclamp_bucket_id(value); uc_se->user_defined =3D user_defined; } =20 @@ -3480,26 +3406,11 @@ uclamp_eff_value(struct task_struct *p, enum uclamp= _id clamp_id) return SCHED_CAPACITY_SCALE; } =20 -static inline bool uclamp_is_used(void) -{ - return false; -} - -static inline unsigned long -uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) -{ - if (clamp_id =3D=3D UCLAMP_MIN) - return 0; - - return SCHED_CAPACITY_SCALE; -} - -static inline void -uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) +static inline void uclamp_update_active_nolock(struct task_struct *p) { } =20 -static inline bool uclamp_rq_is_idle(struct rq *rq) +static inline bool uclamp_is_used(void) { return false; } diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 9f40348f1dc7..24af915f8d18 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -437,6 +437,8 @@ static void __setscheduler_uclamp(struct task_struct *p, uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } + + uclamp_update_active_nolock(p); } =20 #else /* !CONFIG_UCLAMP_TASK: */ --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id A96E7207676 for ; Tue, 4 Mar 2025 14:23:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none 
smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098226; cv=none; b=ol5v7KIcvsYXoGwdXizbT7Y4URvaEcPOzie4mQ0i6Q4y7ZHS4X/xq/kvtj2dVZStsECMMFy681N3eGBzqJq3/G+2XtzmRPPcWIOCj7fHMT0XdkGes9xJZ3zSj7Djj9NGjNUZmz97b8rj9oVKyZV8aTgWrKTsZ6oZMxV3SZYDibQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098226; c=relaxed/simple; bh=QCUb/jDI9q9/yfiSHSxl4Y+RGwLh55vL7LgIc3xr40Y=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=FsN+ENP/kYGyrqZXWXKFh0FrIMIJwMK4R3QwcPZ1s6cYx76AV+cGcl7f0kEaedKMW3T3ihKicuNvmSWTT8LZ8FzDZ3YuJbPTlkN4/E94Y1L8cnxkImJ8bzT7xyjuI9U28OUG6hESXJSfkdHP2o7E6e0lbljIbAQXHSMtMKchAUI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 065531007; Tue, 4 Mar 2025 06:23:58 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id A5B2A3F66E; Tue, 4 Mar 2025 06:23:42 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org Subject: [PATCH v2 6/8] sched/uclamp: Simplify uclamp_eff_value() Date: Tue, 4 Mar 2025 14:23:13 +0000 Message-Id: X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The commit sched: Remove all uclamp bucket logic removes uclamp_rq_{inc/dec}() functions, so now p->uclamp contains the correct values all the time after a uclamp_update_active() call, and there's no need to toggle the boolean `active` after an update. As a result, this function is fairly simple now and can live as a static inline function. 
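For illustration only (this helper is not part of the patch): with uclamp_eff_value() reduced to a header inline, a hypothetical caller can clamp whatever utilization estimate it already has using nothing but the cached per-task values, with no rq state involved. task_util_est_uclamp() is a made-up name used purely for this sketch.

static inline unsigned long task_util_est_uclamp(struct task_struct *p,
						 unsigned long util)
{
	unsigned long util_min = uclamp_eff_value(p, UCLAMP_MIN);
	unsigned long util_max = uclamp_eff_value(p, UCLAMP_MAX);

	/* If a task ends up with min > max, resolve in favour of the max. */
	if (util_min > util_max)
		util_min = util_max;

	return clamp(util, util_min, util_max);
}
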
Signed-off-by: Hongyan Xia --- kernel/sched/core.c | 13 ++++--------- kernel/sched/sched.h | 10 +++++++++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 85c69ca7abaa..f3825e36ae85 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1569,20 +1569,14 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_i= d clamp_id) return uc_req; } =20 -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id) -{ - if (!uclamp_is_used() || !p->uclamp[clamp_id].active) - return uclamp_none(clamp_id); - - return p->uclamp[clamp_id].value; -} - void uclamp_update_active_nolock(struct task_struct *p) { enum uclamp_id clamp_id; =20 - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { p->uclamp[clamp_id] =3D uclamp_eff_get(p, clamp_id); + p->uclamp[clamp_id].active =3D 1; + } } =20 static inline void @@ -1737,6 +1731,7 @@ static void uclamp_fork(struct task_struct *p) for_each_clamp_id(clamp_id) { uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false); + p->uclamp[clamp_id].active =3D 0; } } =20 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cfeaefcec8b6..f4a82e6cc029 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3343,7 +3343,6 @@ static inline bool update_other_load_avgs(struct rq *= rq) { return false; } =20 #ifdef CONFIG_UCLAMP_TASK =20 -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp= _id); void uclamp_update_active_nolock(struct task_struct *p); =20 /* @@ -3372,6 +3371,15 @@ static inline unsigned int uclamp_none(enum uclamp_i= d clamp_id) return SCHED_CAPACITY_SCALE; } =20 +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +{ + if (uclamp_is_used() && p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].value; + + return uclamp_none(clamp_id); +} + static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defin= ed) { --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 71C1D207A2A for ; Tue, 4 Mar 2025 14:23:47 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098229; cv=none; b=pfEbiD3JG0kbs+OGBSIJruhrOhGm9ChvjYamvJtEFvMYCKhMaA3WDGZ4o2Odo2T2cYr+eSPWXp1wCflyeVTbgjgvqeR2Uyg6q5V7qQDzpID1XZl3/xvfCLQDbq1XB7yf7z/CL0cG2hYvQkk4vqpF2r7hH5HhwBwTpVIEoXNillI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098229; c=relaxed/simple; bh=ePyTVUHpYXKRFzTNY0Q7NWvN8OyPJ3n0fcbMve5YEMM=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=eaaM710DbPcshkiNVOPxx4L/XTUeo9c5QUtTRs0sheMPiCE8Zhb/VkejUpEhnErq5jS9p1obLElXJA30eRcnnXCSe6oQVXm9yfSZQTHtkbJ3CM2kEnl/4xP8zuu/KBkRK7qfAdypxzLa0URDDN+u+uKqEv4ZEj4bm5DcmA0oyPk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 758D7FEC; Tue, 4 Mar 2025 06:24:00 -0800 (PST) Received: from 
e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 219013F66E; Tue, 4 Mar 2025 06:23:45 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org Subject: [PATCH v2 7/8] sched/uclamp: Propagate negative bias Date: Tue, 4 Mar 2025 14:23:14 +0000 Message-Id: <53749ecebbed9ef59f6f0fea9c8a8daec0733d68.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Negative bias is interesting, because dequeuing such a task will actually increase utilization. Solve by applying PELT decay to negative biases as well. This in fact can be implemented easily with some math tricks. Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 4 ++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 68e7b1ac7a57..944953b90297 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4886,6 +4886,48 @@ static inline unsigned long root_cfs_util_uclamp(str= uct rq *rq) =20 return max(ret, 0L); } + +/* + * Negative biases are tricky. If we remove them right away then dequeuing= a + * uclamp_max task has the interesting effect that dequeuing results in a = higher + * rq utilization. Solve this by applying PELT decay to the bias itself. + * + * Keeping track of a PELT-decayed negative bias is extra overhead. Howeve= r, we + * observe this interesting math property, where y is the decay factor and= p is + * the number of periods elapsed: + * + * util_new =3D util_old * y^p - neg_bias * y^p + * =3D (util_old - neg_bias) * y^p + * + * Therefore, we simply subtract the negative bias from util_avg the momen= t we + * dequeue, then the PELT signal itself is the total of util_avg and the d= ecayed + * negative bias, and we no longer need to track the decayed bias separate= ly. 
+ */ +static void propagate_negative_bias(struct task_struct *p) +{ + if (task_util_bias(p) < 0 && !task_on_rq_migrating(p)) { + unsigned long neg_bias =3D -task_util_bias(p); + struct sched_entity *se =3D &p->se; + + p->se.avg.util_avg_bias =3D 0; + + for_each_sched_entity(se) { + struct sched_avg *sa =3D &se->avg; + u32 divider =3D get_pelt_divider(sa); + + sub_positive(&sa->util_avg, neg_bias); + sub_positive(&sa->util_sum, neg_bias * divider); + sa->util_sum =3D max_t(u32, sa->util_sum, + sa->util_avg * PELT_MIN_DIVIDER); + sa =3D &cfs_rq_of(se)->avg; + divider =3D get_pelt_divider(sa); + sub_positive(&sa->util_avg, neg_bias); + sub_positive(&sa->util_sum, neg_bias * divider); + sa->util_sum =3D max_t(u32, sa->util_sum, + sa->util_avg * PELT_MIN_DIVIDER); + } + } +} #else static inline long task_util_bias(struct task_struct *p) { @@ -7114,8 +7156,10 @@ static int dequeue_entities(struct rq *rq, struct sc= hed_entity *se, int flags) } =20 sub_nr_running(rq, h_nr_queued); - if (p) + if (p) { util_bias_dequeue(rq, p); + propagate_negative_bias(p); + } =20 if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4a82e6cc029..654eede62979 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3431,6 +3431,10 @@ static inline void util_bias_dequeue(struct rq *rq, = struct task_struct *p) { } =20 +static inline void propagate_negative_bias(struct task_struct *p) +{ +} + #endif /* !CONFIG_UCLAMP_TASK */ =20 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ --=20 2.34.1 From nobody Wed Dec 17 15:51:20 2025 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id C08AF207DF5 for ; Tue, 4 Mar 2025 14:23:49 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098231; cv=none; b=iAWC89oKji8+lpKGVCgBjwIyZyLCErO2tmEiQmKk7qpFxWoEGRnlJ1ceV3R/csYwo0dd592+CS4E5A+ZGwcvyilxT87hbpVt22FMeKvdnbkGgdBygL60HaEw7qy70aMQ22wXugJgmGcWsEyOfZPRdH0yGJHNK86yrn1+kfNNKn8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1741098231; c=relaxed/simple; bh=+/4CvG4LsmN5Sf5U9c6W1i9TGVpKsEVoSEJ4l2pow9M=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=efQSGha87g25eZFguSxpdtF2776pC4rWogeJB7ka1tb3VLQx8njc4vTyPCEDQ0fXYrNLlbOz6+ye4k7rsLs0YzaE/BMhBP1Qanz9PFLcjyXv13ojyNixPMFw6Ct2QVxVBCP+jxmEStcZX8FyrhH1jHKH2vvKmq2+MAcT54c1JmY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 11748FEC; Tue, 4 Mar 2025 06:24:03 -0800 (PST) Received: from e130256.cambridge.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id B135F3F66E; Tue, 4 Mar 2025 06:23:47 -0800 (PST) From: Hongyan Xia To: Ingo Molnar , Peter Zijlstra , Vincent Guittot , Dietmar Eggemann , Juri Lelli , Steven Rostedt , Ben Segall , Mel Gorman , Valentin Schneider Cc: Morten Rasmussen , Lukasz Luba , Christian Loehle , Pierre Gondois , linux-kernel@vger.kernel.org 
Subject: [PATCH v2 8/8] sched/uclamp: Solve under-utilization problem Date: Tue, 4 Mar 2025 14:23:15 +0000 Message-Id: <94048802c665752e92d1d354fdc38dd95ffe4a03.1741091349.git.hongyan.xia2@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" With sum aggregation, a heavily uclamp_max-throttled task may throttle the whole rq, resulting in low OPP. For example, two tasks having the same priority and both tasks are always-running tasks. One task has no uclamp values but the other has uclamp_max of 1. Then, under sum aggregation, the CPU will run at 512 + 1 =3D 513 OPP, which means the task without uclamp_max only gets 513 / 2 = =3D 256 utilization, even though the CPU still can run faster. With this patch, we do not throttle a uclamp_max too hard such that it impacts other tasks. This is done by tracking the highest uclamp_factor and any uclamp_max tasks cannot throttle more than this factor allows. Signed-off-by: Hongyan Xia --- kernel/sched/fair.c | 12 ++++++++++++ kernel/sched/pelt.c | 33 +++++++++++++++++++++++++++++---- kernel/sched/sched.h | 2 ++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 944953b90297..966ca63da3fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7159,6 +7159,18 @@ static int dequeue_entities(struct rq *rq, struct sc= hed_entity *se, int flags) if (p) { util_bias_dequeue(rq, p); propagate_negative_bias(p); + if (p->pid =3D=3D rq->max_uclamp_factor_pid) { + /* + * If the task with the highest uclamp_factor gets + * dequeued, the correct thing to do is to set pid and + * factor to the second highest. However, the overhead + * isn't really necessary because the second highest + * will set these fields the next time it gets updated + * anyway. + */ + rq->max_uclamp_factor_pid =3D -1; + rq->max_uclamp_factor =3D 0; + } } =20 if (rq_h_nr_queued && !rq->cfs.h_nr_queued) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index f38abe6f0b8b..e96ca045af2e 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -271,8 +271,8 @@ ___update_load_avg(struct sched_avg *sa, unsigned long = load) static void util_bias_update(struct task_struct *p) { unsigned int util, uclamp_min, uclamp_max; - struct rq *rq; - int old, new; + struct rq *rq =3D task_rq(p); + int old, new, clamped_util, prio =3D p->prio - MAX_RT_PRIO; =20 util =3D READ_ONCE(p->se.avg.util_avg); uclamp_min =3D uclamp_eff_value(p, UCLAMP_MIN); @@ -284,12 +284,37 @@ static void util_bias_update(struct task_struct *p) if (uclamp_max =3D=3D SCHED_CAPACITY_SCALE) uclamp_max =3D UINT_MAX; old =3D READ_ONCE(p->se.avg.util_avg_bias); - new =3D (int)clamp(util, uclamp_min, uclamp_max) - (int)util; + clamped_util =3D (int)clamp(util, uclamp_min, uclamp_max); + if (p->se.on_rq && prio >=3D 0) { + /* We only do this for fair class priorities. */ + u64 uclamp_factor =3D sched_prio_to_wmult[prio]; + + /* This has to be a 64-bit multiplication. 
*/ + uclamp_factor *=3D clamped_util; + if (rq->max_uclamp_factor_pid =3D=3D p->pid) { + rq->max_uclamp_factor =3D uclamp_factor; + } else if (uclamp_factor > rq->max_uclamp_factor) { + rq->max_uclamp_factor =3D uclamp_factor; + rq->max_uclamp_factor_pid =3D p->pid; + } else { + u32 weight =3D sched_prio_to_weight[prio]; + + /* + * We cannot throttle too much if some other task is + * running at high utilization. We should prioritize + * giving that task enough utilization and respect + * task priority, before enforcing uclamp_max. + */ + uclamp_max =3D max(uclamp_max, + (rq->max_uclamp_factor * weight) >> 32); + clamped_util =3D (int)clamp(util, uclamp_min, uclamp_max); + } + } + new =3D clamped_util - (int)util; =20 WRITE_ONCE(p->se.avg.util_avg_bias, new); if (!p->se.on_rq) return; - rq =3D task_rq(p); WRITE_ONCE(rq->cfs.avg.util_avg_bias, READ_ONCE(rq->cfs.avg.util_avg_bias) + new - old); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 654eede62979..0dc90208ad73 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1086,6 +1086,8 @@ struct rq { u64 nr_switches; =20 #ifdef CONFIG_UCLAMP_TASK + u64 max_uclamp_factor; + pid_t max_uclamp_factor_pid; #endif =20 struct cfs_rq cfs; --=20 2.34.1
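
As a worked check of the weighting in util_bias_update() above (an editor's sketch, not part of the series): take the commit message's example of two always-running nice-0 tasks sharing one CPU, each settling at about 512 utilization, and assume the usual fair-class table values of 1024 for sched_prio_to_weight[] and 4194304 (roughly 2^32 / 1024) for sched_prio_to_wmult[] at nice 0. The standalone program below shows that the task with uclamp_max of 1 is only throttled down to the 512 it is entitled to at its priority, so the rq-level sum stays at 1024 instead of 513.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t wmult  = 4194304;  /* sched_prio_to_wmult[] at nice 0 (assumed) */
	const uint64_t weight = 1024;     /* sched_prio_to_weight[] at nice 0 (assumed) */
	const uint64_t util_other = 512;  /* the always-running task with no uclamp limit */
	uint64_t uclamp_max = 1;          /* the heavily throttled task */

	/* The highest factor on the rq comes from the unclamped task. */
	uint64_t max_uclamp_factor = wmult * util_other;

	/*
	 * wmult is roughly 2^32 / weight, so this floor is roughly
	 * util_other * weight_self / weight_other: the utilization the
	 * throttled task is entitled to at its own priority.
	 */
	uint64_t floor = (max_uclamp_factor * weight) >> 32;

	if (uclamp_max < floor)
		uclamp_max = floor;

	/* Without the floor the sum is 512 + 1 = 513; with it, 512 + 512 = 1024. */
	printf("effective uclamp_max = %llu, rq-level sum = %llu\n",
	       (unsigned long long)uclamp_max,
	       (unsigned long long)(util_other + uclamp_max));
	return 0;
}

With equal weights the floor equals the other task's utilization, so the uclamp_max clamp only bites once the throttled task would otherwise take more than its fair share of the CPU.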