From nobody Sat May 18 21:00:32 2024
Date: Tue, 21 Jun 2022 10:04:08 +0100
Message-Id: <20220621090414.433602-2-vdonnefort@google.com>
In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com>
References: <20220621090414.433602-1-vdonnefort@google.com>
Subject: [PATCH v11 1/7] sched/fair: Provide u64 read for 32-bits arch helper
From: Vincent Donnefort
To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org
Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com, Vincent Donnefort, Lukasz Luba
From: Vincent Donnefort

Introducing macro helpers u64_u32_{store,load}() to factorize lockless accesses to u64 variables for 32-bit architectures.

The current users are cfs_rq.min_vruntime and sched_avg.last_update_time. To accommodate the latter, where the copy lies outside of the structure (cfs_rq.last_update_time_copy instead of sched_avg.last_update_time_copy), use the _copy() version of those helpers.

Those new helpers encapsulate smp_rmb() and smp_wmb() synchronization and therefore have a small penalty for 32-bit machines in set_task_rq_fair() and init_cfs_rq().

Signed-off-by: Vincent Donnefort
Signed-off-by: Vincent Donnefort
Reviewed-by: Dietmar Eggemann
Tested-by: Lukasz Luba

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78795a997d9c..56e56e2dcf93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -612,11 +612,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } =20 /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime =3D max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy =3D cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, + max_vruntime(cfs_rq->min_vruntime, vruntime)); } =20 static inline bool __entity_less(struct rb_node *a, const struct rb_node *= b) @@ -3352,6 +3349,11 @@ static inline void cfs_rq_util_change(struct cfs_rq = *cfs_rq, int flags) } =20 #ifdef CONFIG_SMP +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return u64_u32_load_copy(cfs_rq->avg.last_update_time, + cfs_rq->last_update_time_copy); +} #ifdef CONFIG_FAIR_GROUP_SCHED /* * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list @@ -3462,27 +3464,9 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; =20 -#ifndef CONFIG_64BIT - { - u64 p_last_update_time_copy; - u64 n_last_update_time_copy; - - do { - p_last_update_time_copy =3D prev->load_last_update_time_copy; - n_last_update_time_copy =3D next->load_last_update_time_copy; - - smp_rmb(); - - p_last_update_time =3D prev->avg.last_update_time; - n_last_update_time =3D next->avg.last_update_time; + p_last_update_time =3D cfs_rq_last_update_time(prev); + n_last_update_time =3D cfs_rq_last_update_time(next); =20 - } while (p_last_update_time !=3D p_last_update_time_copy || - n_last_update_time !=3D n_last_update_time_copy); - } -#else - p_last_update_time =3D prev->avg.last_update_time; - n_last_update_time =3D next->avg.last_update_time; -#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time =3D n_last_update_time; } @@ -3835,12 +3819,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_r= q) } =20 decayed |=3D __update_load_avg_cfs_rq(now, cfs_rq); - -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->load_last_update_time_copy =3D sa->last_update_time; -#endif - + u64_u32_store_copy(sa->last_update_time, + cfs_rq->last_update_time_copy, + sa->last_update_time); return decayed; } =20 @@ -3972,27 +3953,6 @@ static inline void update_load_avg(struct cfs_rq *cf= s_rq, struct sched_entity *s } } =20 -#ifndef CONFIG_64BIT -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - u64 last_update_time_copy; - u64 last_update_time; - - do { - last_update_time_copy =3D cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time =3D cfs_rq->avg.last_update_time; - } while (last_update_time !=3D last_update_time_copy); - - return last_update_time; -} -#else -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return
cfs_rq->avg.last_update_time; -} -#endif - /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. @@ -6960,21 +6920,8 @@ static void migrate_task_rq_fair(struct task_struct = *p, int new_cpu) if (READ_ONCE(p->__state) =3D=3D TASK_WAKING) { struct sched_entity *se =3D &p->se; struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy =3D cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime =3D cfs_rq->min_vruntime; - } while (min_vruntime !=3D min_vruntime_copy); -#else - min_vruntime =3D cfs_rq->min_vruntime; -#endif =20 - se->vruntime -=3D min_vruntime; + se->vruntime -=3D u64_u32_load(cfs_rq->min_vruntime); } =20 if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING) { @@ -11425,10 +11372,7 @@ static void set_next_task_fair(struct rq *rq, stru= ct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline =3D RB_ROOT_CACHED; - cfs_rq->min_vruntime =3D (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy =3D cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b14b6b4495d..2b563f2002e6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -521,6 +521,45 @@ struct cfs_bandwidth { }; =20 #endif /* CONFIG_CGROUP_SCHED */ =20 +/* + * u64_u32_load/u64_u32_store + * + * Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var =3D val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy =3D copy; \ + /* \ + * paired with u64_u32_store_copy(), ordering access \ + * to var and copy. \ + */ \ + smp_rmb(); \ + __val =3D var; \ + } while (__val !=3D __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val =3D (val); \ + var =3D __val; \ + /* \ + * paired with u64_u32_load_copy(), ordering access to var and \ + * copy. 
\ */ \ + smp_wmb(); \ + copy =3D __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -561,7 +600,7 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; --=20 2.37.0.rc0.104.g0611611a94-goog
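[Editor's illustration -- not part of the posted series] The u64_u32_{store,load}() helpers added by the patch above are a copy-plus-barrier retry scheme: the writer stores the value, issues a write barrier, then updates the copy; the reader re-reads until value and copy agree, so a torn 64-bit access on a 32-bit machine can be detected and retried. A minimal userspace sketch of the same ordering, with invented names and C11 fences standing in for smp_wmb()/smp_rmb():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct u64_copy {
	uint64_t val;	/* may tear on a 32-bit machine */
	uint64_t copy;	/* published only after the value is visible */
};

static void u64_copy_store(struct u64_copy *s, uint64_t v)
{
	s->val = v;
	atomic_thread_fence(memory_order_release);	/* kernel: smp_wmb() */
	s->copy = v;
}

static uint64_t u64_copy_load(const struct u64_copy *s)
{
	uint64_t v, c;

	do {
		c = s->copy;
		atomic_thread_fence(memory_order_acquire);	/* kernel: smp_rmb() */
		v = s->val;
	} while (v != c);	/* a mismatch means a concurrent store was caught mid-way */

	return v;
}

int main(void)
{
	struct u64_copy s = { 0, 0 };

	u64_copy_store(&s, 0x1122334455667788ULL);
	printf("%llx\n", (unsigned long long)u64_copy_load(&s));
	return 0;
}

On 64-bit kernels both helpers compile down to a plain access (see the CONFIG_64BIT branch above), which is why the small penalty mentioned in the changelog only affects 32-bit machines.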
From nobody Sat May 18 21:00:32 2024
Date: Tue, 21 Jun 2022 10:04:09 +0100
Message-Id: <20220621090414.433602-3-vdonnefort@google.com>
In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com>
References: <20220621090414.433602-1-vdonnefort@google.com>
Subject: [PATCH v11 2/7] sched/fair: Decay task PELT values during wakeup migration
From: Vincent Donnefort
To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org
Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com, Vincent Donnefort, Lukasz Luba

From: Vincent Donnefort

Before being migrated to a new CPU, a task sees its PELT values synchronized with the rq's last_update_time. Once done, that same task will also have its sched_avg last_update_time reset. This means the time between the migration and the last clock update will not be accounted for in util_avg, and a discontinuity will appear. This issue is amplified by the PELT clock scaling: it currently takes one tick after the CPU goes idle for clock_pelt to catch up with clock_task.

This is especially problematic for asymmetric CPU capacity systems, which need stable util_avg signals for task placement and energy estimation.

Ideally, this problem would be solved by updating the runqueue clocks before the migration. But that would require taking the runqueue lock, which is quite expensive [1]. Instead, estimate the missing time and update the task's util_avg with that value.

To that end, we need sched_clock_cpu(), but it is a costly function. Limit its usage to the case where the source CPU is idle, since that is when the clock is at the greatest risk of being outdated. See the comment in migrate_se_pelt_lag() for more details about how the PELT value is estimated. Note, though, that this estimation doesn't take IRQ and paravirt time into account.

[1] https://lkml.kernel.org/r/20190709115759.10451-1-chris.redpath@arm.com

Signed-off-by: Vincent Donnefort
Signed-off-by: Vincent Donnefort
Reviewed-by: Vincent Guittot
Reviewed-by: Dietmar Eggemann
Tested-by: Lukasz Luba

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56e56e2dcf93..9e9622b770fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3349,6 +3349,29 @@ static inline void cfs_rq_util_change(struct cfs_rq = *cfs_rq, int flags) } =20 #ifdef CONFIG_SMP +static inline bool load_avg_is_decayed(struct sched_avg *sa) +{ + if (sa->load_sum) + return false; + + if (sa->util_sum) + return false; + + if (sa->runnable_sum) + return false; + + /* + * _avg must be null when _sum are null because _avg =3D _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(sa->load_avg || + sa->util_avg || + sa->runnable_avg); + + return true; +} + static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { return u64_u32_load_copy(cfs_rq->avg.last_update_time, @@ -3386,27 +3409,12 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq = *cfs_rq) if (cfs_rq->load.weight) return false; =20 - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_sum) + if (!load_avg_is_decayed(&cfs_rq->avg)) return false; =20 if (child_cfs_rq_on_list(cfs_rq)) return false; =20 - /* - * _avg must be null when _sum are null because _avg =3D _sum / divider - * Make sure that rounding and/or propagation of PELT values never - * break this.
- */ - SCHED_WARN_ON(cfs_rq->avg.load_avg || - cfs_rq->avg.util_avg || - cfs_rq->avg.runnable_avg); - return true; } =20 @@ -3745,6 +3753,89 @@ static inline void add_tg_cfs_propagate(struct cfs_r= q *cfs_rq, long runnable_sum =20 #endif /* CONFIG_FAIR_GROUP_SCHED */ =20 +#ifdef CONFIG_NO_HZ_COMMON +static inline void migrate_se_pelt_lag(struct sched_entity *se) +{ + u64 throttled =3D 0, now, lut; + struct cfs_rq *cfs_rq; + struct rq *rq; + bool is_idle; + + if (load_avg_is_decayed(&se->avg)) + return; + + cfs_rq =3D cfs_rq_of(se); + rq =3D rq_of(cfs_rq); + + rcu_read_lock(); + is_idle =3D is_idle_task(rcu_dereference(rq->curr)); + rcu_read_unlock(); + + /* + * The lag estimation comes with a cost we don't want to pay all the + * time. Hence, limiting to the case where the source CPU is idle and + * we know we are at the greatest risk to have an outdated clock. + */ + if (!is_idle) + return; + + /* + * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, whe= re: + * + * last_update_time (the cfs_rq's last_update_time) + * =3D cfs_rq_clock_pelt()@cfs_rq_idle + * =3D rq_clock_pelt()@cfs_rq_idle + * - cfs->throttled_clock_pelt_time@cfs_rq_idle + * + * cfs_idle_lag (delta between rq's update and cfs_rq's update) + * =3D rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle + * + * rq_idle_lag (delta between now and rq's update) + * =3D sched_clock_cpu() - rq_clock()@rq_idle + * + * We can then write: + * + * now =3D rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + + * sched_clock_cpu() - rq_clock()@rq_idle + * Where: + * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle + * rq_clock()@rq_idle is rq->clock_idle + * cfs->throttled_clock_pelt_time@cfs_rq_idle + * is cfs_rq->throttled_pelt_idle + */ + +#ifdef CONFIG_CFS_BANDWIDTH + throttled =3D u64_u32_load(cfs_rq->throttled_pelt_idle); + /* The clock has been stopped for throttling */ + if (throttled =3D=3D U64_MAX) + return; +#endif + now =3D u64_u32_load(rq->clock_pelt_idle); + /* + * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case + * is observed the old clock_pelt_idle value and the new clock_idle, + * which lead to an underestimation. The opposite would lead to an + * overestimation. + */ + smp_rmb(); + lut =3D cfs_rq_last_update_time(cfs_rq); + + now -=3D throttled; + if (now < lut) + /* + * cfs_rq->avg.last_update_time is more recent than our + * estimation, let's use it. + */ + now =3D lut; + else + now +=3D sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); + + __update_load_avg_blocked_se(now, se); +} +#else +static void migrate_se_pelt_lag(struct sched_entity *se) {} +#endif + /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() @@ -4471,6 +4562,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_en= tity *se, int flags) */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) !=3D DEQUEUE_SAVE) update_min_vruntime(cfs_rq); + + if (cfs_rq->nr_running =3D=3D 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); } =20 /* @@ -6911,6 +7005,8 @@ static void detach_entity_cfs_rq(struct sched_entity = *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { + struct sched_entity *se =3D &p->se; + /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new @@ -6918,7 +7014,6 @@ static void migrate_task_rq_fair(struct task_struct *= p, int new_cpu) * the task on the new runqueue. 
*/ if (READ_ONCE(p->__state) =3D=3D TASK_WAKING) { - struct sched_entity *se =3D &p->se; struct cfs_rq *cfs_rq =3D cfs_rq_of(se); =20 se->vruntime -=3D u64_u32_load(cfs_rq->min_vruntime); @@ -6930,25 +7025,29 @@ static void migrate_task_rq_fair(struct task_struct= *p, int new_cpu) * rq->lock and can modify state directly. */ lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(&p->se); + detach_entity_cfs_rq(se); =20 } else { + remove_entity_load_avg(se); + /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. + * Here, the task's PELT values have been updated according to + * the current rq's clock. But if that clock hasn't been + * updated in a while, a substantial idle time will be missed, + * leading to an inflation after wake-up on the new rq. + * + * Estimate the missing time from the cfs_rq last_update_time + * and update sched_avg to improve the PELT continuity after + * migration. */ - remove_entity_load_avg(&p->se); + migrate_se_pelt_lag(se); } =20 /* Tell new CPU we are migrated */ - p->se.avg.last_update_time =3D 0; + se->avg.last_update_time =3D 0; =20 /* We have migrated, no longer consider this task hot */ - p->se.exec_start =3D 0; + se->exec_start =3D 0; =20 update_scan_period(p, new_cpu); } @@ -8114,6 +8213,9 @@ static bool __update_blocked_fair(struct rq *rq, bool= *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); =20 + if (cfs_rq->nr_running =3D=3D 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); + if (cfs_rq =3D=3D &rq->cfs) decayed =3D true; } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 4ff2ed4f8fa1..3a0e0dc28721 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -61,6 +61,25 @@ static inline void cfs_se_util_change(struct sched_avg *= avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } =20 +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +/* The rq is idle, we can sync to clock_task */ +static inline void _update_idle_rq_clock_pelt(struct rq *rq) +{ + rq->clock_pelt =3D rq_clock_task(rq); + + u64_u32_store(rq->clock_idle, rq_clock(rq)); + /* Paired with smp_rmb in migrate_se_pelt_lag() */ + smp_wmb(); + u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq)); +} + /* * The clock_pelt scales the time to reflect the effective amount of * computation done during the running delta time but then sync back to @@ -76,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *a= vg) static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) { if (unlikely(is_idle_task(rq->curr))) { - /* The rq is idle, we can sync to clock_task */ - rq->clock_pelt =3D rq_clock_task(rq); + _update_idle_rq_clock_pelt(rq); return; } =20 @@ -130,17 +148,23 @@ static inline void update_idle_rq_clock_pelt(struct r= q *rq) */ if (util_sum >=3D divider) rq->lost_idle_time +=3D rq_clock_task(rq) - rq->clock_pelt; + + _update_idle_rq_clock_pelt(rq); } =20 -static inline u64 rq_clock_pelt(struct rq *rq) +#ifdef CONFIG_CFS_BANDWIDTH +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - lockdep_assert_rq_held(rq); - assert_clock_updated(rq); + u64 throttled; =20 - return rq->clock_pelt - 
rq->lost_idle_time; + if (unlikely(cfs_rq->throttle_count)) + throttled =3D U64_MAX; + else + throttled =3D cfs_rq->throttled_clock_pelt_time; + + u64_u32_store(cfs_rq->throttled_pelt_idle, throttled); } =20 -#ifdef CONFIG_CFS_BANDWIDTH /* rq->task_clock normalized against any time this cfs_rq has spent thrott= led */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { @@ -150,6 +174,7 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_= rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); @@ -204,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { } static inline void update_idle_rq_clock_pelt(struct rq *rq) { } =20 +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif =20 =20 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b563f2002e6..278760fb2ef0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -649,6 +649,10 @@ struct cfs_rq { int runtime_enabled; s64 runtime_remaining; =20 + u64 throttled_pelt_idle; +#ifndef CONFIG_64BIT + u64 throttled_pelt_idle_copy; +#endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -1021,6 +1025,12 @@ struct rq { u64 clock_task ____cacheline_aligned; u64 clock_pelt; unsigned long lost_idle_time; + u64 clock_pelt_idle; + u64 clock_idle; +#ifndef CONFIG_64BIT + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; +#endif =20 atomic_t nr_iowait; =20 --=20 2.37.0.rc0.104.g0611611a94-goog
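[Editor's illustration -- not part of the posted series] The estimation performed by migrate_se_pelt_lag() in the patch above boils down to reconstructing an up-to-date PELT clock from the values snapshotted when the source rq went idle. A simplified sketch of that arithmetic in plain C (invented names; the real code also handles memory ordering and reads the u64 snapshots through the u64_u32_load() helpers on 32-bit):

#include <stdint.h>
#include <stdio.h>

/*
 * Estimate the PELT time to decay a migrating task's signal to, given the
 * values saved at idle entry. Mirrors the comment in migrate_se_pelt_lag().
 */
static uint64_t estimate_pelt_now(uint64_t clock_pelt_idle,     /* rq_clock_pelt() at idle entry */
                                  uint64_t clock_idle,          /* rq_clock() at idle entry */
                                  uint64_t throttled_pelt_idle, /* cfs_rq throttled time, UINT64_MAX if throttled */
                                  uint64_t last_update_time,    /* cfs_rq->avg.last_update_time */
                                  uint64_t sched_clock_now)     /* sched_clock_cpu() at migration time */
{
	uint64_t now;

	if (throttled_pelt_idle == UINT64_MAX)
		return last_update_time;	/* clock stopped for throttling: do not estimate */

	now = clock_pelt_idle - throttled_pelt_idle;
	if (now < last_update_time)
		return last_update_time;	/* the cfs_rq view is already more recent */

	/* add the wall-clock time the rq has been idle since the snapshot */
	return now + (sched_clock_now - clock_idle);
}

int main(void)
{
	/* rq idle for 2ms (values in ns) since the snapshot, no throttling */
	printf("%llu\n", (unsigned long long)
	       estimate_pelt_now(1000000, 1500000, 0, 900000, 3500000));
	return 0;
}

The example returns 1000000 + (3500000 - 1500000) = 3000000, i.e. the task is decayed as if its cfs_rq clock had kept running while the CPU was idle.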
From nobody Sat May 18 21:00:32 2024
Date: Tue, 21 Jun 2022 10:04:10 +0100
Message-Id: <20220621090414.433602-4-vdonnefort@google.com>
In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com>
References: <20220621090414.433602-1-vdonnefort@google.com>
Subject: [PATCH v11 3/7] sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util()
From: Vincent Donnefort
To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org
Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com, Lukasz Luba

From: Dietmar Eggemann

effective_cpu_util() already has an `int cpu' parameter which allows the CPU capacity scale factor (or maximum CPU capacity) to be retrieved inside this function via arch_scale_cpu_capacity(cpu).

A lot of code calling effective_cpu_util() (or the shim sched_cpu_util()) needs the maximum CPU capacity, i.e. it will call arch_scale_cpu_capacity() already. But not having to pass it into effective_cpu_util() will make the EAS wake-up code easier, especially when the maximum CPU capacity reduced by the thermal pressure is passed through the EAS wake-up functions.

Due to the asymmetric CPU capacity support of arm/arm64 architectures, arch_scale_cpu_capacity(int cpu) is a per-CPU variable read access via per_cpu(cpu_scale, cpu) on such a system. On all other architectures it is a compile-time constant (SCHED_CAPACITY_SCALE).

Signed-off-by: Dietmar Eggemann
Acked-by: Vincent Guittot
Tested-by: Lukasz Luba

diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index f5eced0842b3..6a88eb7e9f75 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -71,34 +71,19 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 po= wer_limit) =20 static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power) { - unsigned long max =3D 0, sum_util =3D 0; + unsigned long max, sum_util =3D 0; int cpu; =20 - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - - /* - * The capacity is the same for all CPUs belonging to - * the same perf domain, so a single call to - * arch_scale_cpu_capacity() is enough. However, we - * need the CPU parameter to be initialized by the - * loop, so the call ends up in this block.
- * - * We can initialize 'max' with a cpumask_first() call - * before the loop but the bits computation is not - * worth given the arch_scale_cpu_capacity() just - * returns a value where the resulting assembly code - * will be optimized by the compiler. - */ - max =3D arch_scale_cpu_capacity(cpu); - sum_util +=3D sched_cpu_util(cpu, max); - } - /* - * In the improbable case where all the CPUs of the perf - * domain are offline, 'max' will be zero and will lead to an - * illegal operation with a zero division. + * The capacity is the same for all CPUs belonging to + * the same perf domain. */ - return max ? (power * ((sum_util << 10) / max)) >> 10 : 0; + max =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); + + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) + sum_util +=3D sched_cpu_util(cpu); + + return (power * ((sum_util << 10) / max)) >> 10; } =20 static u64 get_pd_power_uw(struct dtpm *dtpm) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_co= oling.c index b8151d95a806..b263b0fde03c 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -137,11 +137,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_de= vice *cpufreq_cdev, static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, int cpu_idx) { - unsigned long max =3D arch_scale_cpu_capacity(cpu); - unsigned long util; + unsigned long util =3D sched_cpu_util(cpu); =20 - util =3D sched_cpu_util(cpu, max); - return (util * 100) / max; + return (util * 100) / arch_scale_cpu_capacity(cpu); } #else /* !CONFIG_SMP */ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, diff --git a/include/linux/sched.h b/include/linux/sched.h index 9776dee75048..05a5fb5ea46a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2258,7 +2258,7 @@ static inline bool owner_on_cpu(struct task_struct *o= wner) } =20 /* Returns effective CPU energy utilization, as seen by the scheduler */ -unsigned long sched_cpu_util(int cpu, unsigned long max); +unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ =20 #ifdef CONFIG_RSEQ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 263d76489a48..8fc1bd9f13b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7175,12 +7175,14 @@ struct task_struct *idle_task(int cpu) * required to meet deadlines. 
*/ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p) { - unsigned long dl_util, util, irq; + unsigned long dl_util, util, irq, max; struct rq *rq =3D cpu_rq(cpu); =20 + max =3D arch_scale_cpu_capacity(cpu); + if (!uclamp_is_used() && type =3D=3D FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; @@ -7260,10 +7262,9 @@ unsigned long effective_cpu_util(int cpu, unsigned l= ong util_cfs, return min(max, util); } =20 -unsigned long sched_cpu_util(int cpu, unsigned long max) +unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, - ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); } #endif /* CONFIG_SMP */ =20 diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedu= til.c index 3dbf351d12d5..1207c78f85c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -157,11 +157,10 @@ static unsigned int get_next_freq(struct sugov_policy= *sg_policy, static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq =3D cpu_rq(sg_cpu->cpu); - unsigned long max =3D arch_scale_cpu_capacity(sg_cpu->cpu); =20 - sg_cpu->max =3D max; + sg_cpu->max =3D arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl =3D cpu_bw_dl(rq); - sg_cpu->util =3D effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu= ), max, + sg_cpu->util =3D effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu= ), FREQUENCY_UTIL, NULL); } =20 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e9622b770fa..ee034a89bc87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6704,12 +6704,11 @@ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { struct cpumask *pd_mask =3D perf_domain_span(pd); - unsigned long cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util =3D 0, sum_util =3D 0; - unsigned long _cpu_cap =3D cpu_cap; + unsigned long max_util =3D 0, sum_util =3D 0, cpu_cap; int cpu; =20 - _cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(pd_mask)); + cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); + cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(pd_mask)); =20 /* * The capacity state of CPUs of the current rd can be driven by CPUs @@ -6746,10 +6745,10 @@ compute_energy(struct task_struct *p, int dst_cpu, = struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - cpu_util =3D effective_cpu_util(cpu, util_running, cpu_cap, - ENERGY_UTIL, NULL); + cpu_util =3D effective_cpu_util(cpu, util_running, ENERGY_UTIL, + NULL); =20 - sum_util +=3D min(cpu_util, _cpu_cap); + sum_util +=3D min(cpu_util, cpu_cap); =20 /* * Performance domain frequency: utilization clamping @@ -6758,12 +6757,12 @@ compute_energy(struct task_struct *p, int dst_cpu, = struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - cpu_util =3D effective_cpu_util(cpu, util_freq, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util =3D max(max_util, min(cpu_util, _cpu_cap)); + cpu_util =3D effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL, + tsk); + max_util =3D max(max_util, min(cpu_util, cpu_cap)); } =20 - return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); + return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap); } =20 /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 278760fb2ef0..887626f98292 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2901,7 +2901,7 @@ enum cpu_util_type { }; =20 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p); =20 static inline unsigned long cpu_bw_dl(struct rq *rq) --=20 2.37.0.rc0.104.g0611611a94-goog From nobody Sat May 18 21:00:32 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 6066EC43334 for ; Tue, 21 Jun 2022 09:04:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1348423AbiFUJEo (ORCPT ); Tue, 21 Jun 2022 05:04:44 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48354 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1348775AbiFUJEg (ORCPT ); Tue, 21 Jun 2022 05:04:36 -0400 Received: from mail-ed1-x549.google.com (mail-ed1-x549.google.com [IPv6:2a00:1450:4864:20::549]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 24C0F15A26 for ; Tue, 21 Jun 2022 02:04:35 -0700 (PDT) Received: by mail-ed1-x549.google.com with SMTP id z13-20020a056402274d00b004357fcdd51fso4260486edd.17 for ; Tue, 21 Jun 2022 02:04:35 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=20210112; h=date:in-reply-to:message-id:mime-version:references:subject:from:to :cc; bh=uZkT0e9DzynSoNlb2+4Ge5Aa2zY9Tda7y0woN+dH9g4=; b=T+OHbeewkAFVIX4nvC8jC6nEMoX/Bm9vZeGRitoUIQGSdOwldSHJGY6AiPRR+mm6DY s9kWpC8Gr8RGzkZhrHZfHY1fQK8UhTrLuNt4d2VokXLzOX3BzUic5k3HCLvqz+G4feSF An1xZEZGnUqC8PZNdRLKdWkIlJeNdKKTRZMnpJl2sIXm5bsW3Lm/ZOPlIIFPGeHpnLRq ig4aZuw8QqJCdj6hDbhOQ/hxEWNTt6Jb1R2u+l6y1FGqCM10z8v3OwCTsytKqdigF40w Z0QtHIgQeJVb4b9vL4URzQxV5L2x/YDlKK/Uvo++HXPb72sp3ovQx9NcOfv/Opgnabf0 HvDg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:date:in-reply-to:message-id:mime-version :references:subject:from:to:cc; bh=uZkT0e9DzynSoNlb2+4Ge5Aa2zY9Tda7y0woN+dH9g4=; b=PeEh2jEKhtBgLP5xCGIcT2sy56I3OKbxBguRayerqaMuJqKn8oW0DFVH2+NuxwbXpH o5XiKWUDuKNOTi83ADsYcqJ0ITfHt5pDaYfubHRKd8t5JaIyuwDBJ3lSpz64CJbj+pLS qfJFaoiBlmKmp/mJPhlc+6QNWqe+pVYxkes0qWm922LZdidMX/J4T6JCTRdLgWBs+8Vz 5/ZYO68ViSmOqmy7Y0Q/kG4e3QRgPq3wAcdE1zPr9DER4we1K3Ryu27+htVXCKUZ42H+ +IrAqcXzdpwzy2++++QAZClvJz2E+yhA1g68OxLU9QkUe1DE3q2ObJe11TumZnko6Sdw l9Ag== X-Gm-Message-State: AJIora+OiQFQYjwXpcWPzD6euqYFRQiMxvQsUap789M/s4HngQqWzdHm QtxmdKHu6hyYH0o0+b3RyUdXdNOw7AcgxdkR X-Google-Smtp-Source: AGRyM1to9dgPH8cgE4MqwlH/PJJCkEKVB76JmkNeY66OkPfDj+QRsV1Dl8QqANXfRxat/4yTj310EX0mGl57HafI X-Received: from vdonnefort.c.googlers.com ([fda3:e722:ac3:cc00:28:9cb1:c0a8:2eea]) (user=vdonnefort job=sendgmr) by 2002:a17:907:3e01:b0:722:c339:bde7 with SMTP id hp1-20020a1709073e0100b00722c339bde7mr8518978ejc.285.1655802273490; Tue, 21 Jun 2022 02:04:33 -0700 (PDT) Date: Tue, 21 Jun 2022 
10:04:11 +0100 In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com> Message-Id: <20220621090414.433602-5-vdonnefort@google.com> Mime-Version: 1.0 References: <20220621090414.433602-1-vdonnefort@google.com> X-Mailer: git-send-email 2.37.0.rc0.104.g0611611a94-goog Subject: [PATCH v11 4/7] sched/fair: Rename select_idle_mask to select_rq_mask From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com, Lukasz Luba Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Dietmar Eggemann Decouple the name of the per-cpu cpumask select_idle_mask from its usage in select_idle_[cpu/capacity]() of the CFS run-queue selection (select_task_rq_fair()). This is to support the reuse of this cpumask in the Energy Aware Scheduling (EAS) path (find_energy_efficient_cpu()) of the CFS run-queue selection. Signed-off-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Tested-by: Lukasz Luba diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8fc1bd9f13b6..cf88be0cc599 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9570,7 +9570,7 @@ static struct kmem_cache *task_group_cache __read_mos= tly; #endif =20 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); =20 void __init sched_init(void) { @@ -9619,7 +9619,7 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) =3D (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_idle_mask, i) =3D (cpumask_var_t)kzalloc_node( + per_cpu(select_rq_mask, i) =3D (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee034a89bc87..aad1c2248547 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5898,7 +5898,7 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) =20 /* Working cpumask for: load_balance, load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); =20 #ifdef CONFIG_NO_HZ_COMMON =20 @@ -6388,7 +6388,7 @@ static inline int select_idle_smt(struct task_struct = *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd,= bool has_idle_core, int target) { - struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_idle_mask); + struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu =3D -1, nr =3D INT_MAX; struct rq *this_rq =3D this_rq(); int this =3D smp_processor_id(); @@ -6474,7 +6474,7 @@ select_idle_capacity(struct task_struct *p, struct sc= hed_domain *sd, int target) int cpu, best_cpu =3D -1; struct cpumask *cpus; =20 - cpus =3D this_cpu_cpumask_var_ptr(select_idle_mask); + cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); =20 task_util =3D uclamp_task_util(p); --=20 2.37.0.rc0.104.g0611611a94-goog From nobody Sat May 18 21:00:32 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 8AD0AC433EF for ; Tue, 21 Jun 2022 09:04:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1348465AbiFUJEp (ORCPT ); Tue, 21 Jun 2022 05:04:45 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48406 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1348548AbiFUJEi (ORCPT ); Tue, 21 Jun 2022 05:04:38 -0400 Received: from mail-wm1-x349.google.com (mail-wm1-x349.google.com [IPv6:2a00:1450:4864:20::349]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9A856186D0 for ; Tue, 21 Jun 2022 02:04:37 -0700 (PDT) Received: by mail-wm1-x349.google.com with SMTP id l17-20020a05600c4f1100b0039c860db521so6101599wmq.5 for ; Tue, 21 Jun 2022 02:04:37 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=20210112; h=date:in-reply-to:message-id:mime-version:references:subject:from:to :cc; bh=MIz7mf9O+yDJg4rZecZM10jb/99QC/Kv+YtCD3EQfrA=; b=NrUeQOoS9ADwqZ/phm7tK20C/K3dSCmWla0NedrshRR7W2phNCvwjfPAEg78eDqsM1 BNvdvg3AZffHgpSQJE++K0VH8Bb6gY1GUIWADtqGLdxqzHGSvlaQFy42xU6tTHByyGNJ Xj0tr3XMhGJM8GgLC6cJlOJn+o+pfttewK9vszflcYE1lInN5I04ptkiea8QtrJ8oZrO Yq4Yg+0QoDKn2q1JGEn59Fp61g+rIRVHplSwSdvvvKcE+3txmnpntFD4TQixOfy3RokS ybvgMeSGtFLyXhCdOsNdnwUBkFVU4ZCR02MbwLyjYl9sM2dSrqZj/TFyMlYbcv1dlfex oQGw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:date:in-reply-to:message-id:mime-version :references:subject:from:to:cc; bh=MIz7mf9O+yDJg4rZecZM10jb/99QC/Kv+YtCD3EQfrA=; b=ZPjazyxOa9qunUs3FLAQ1rNfK5Z54cqdGtwEsDn4EL9l//cx+FqZgeP9DAvyGCImlU ++0j49TxvPg6NubRLbWfac2g5XmE4fykrx6fmu/6bma1u2EkuY9a8FObbdzly5BRF5+2 Iov+tEQSaVBjQ3N/Ty3KcJZREbnkOaJBRVO9dPmCskucn3FFx8zq6EA09x+O99RVGr5e lZ16X5ZRURicdEb9ywN0OXBlDcmC2+24ZWMxxR2z+Du0umJQv+OaYPNOlkwKWsh8IVXi wLDyHRcL7m6cYXcYUSDUndWzGpF7lYttHuacidKjdZ3Hmdg28T0hRcTUIny6pi/j3FxM P7yw== X-Gm-Message-State: AOAM5321OVw/PUMW9YN/HRLbPEEOOltqSedBBlEZ7d6Mp/L6h8F4MbLU vc8C4PVw9CkATi+vWYh2apeaswt4vQ6xFbJT X-Google-Smtp-Source: ABdhPJzwNyHYXBKWXt9ezF/mnWGVK1/cu3Wd9/J14ZWMeBdjcXqZbtEbiXaFvXnAlxC8qfYzAN7mylQrgALHdCCF X-Received: from vdonnefort.c.googlers.com ([fda3:e722:ac3:cc00:28:9cb1:c0a8:2eea]) (user=vdonnefort job=sendgmr) by 
2002:a05:600c:3052:b0:39c:6540:c280 with SMTP id n18-20020a05600c305200b0039c6540c280mr1747602wmh.1.1655802275960; Tue, 21 Jun 2022 02:04:35 -0700 (PDT) Date: Tue, 21 Jun 2022 10:04:12 +0100 In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com> Message-Id: <20220621090414.433602-6-vdonnefort@google.com> Mime-Version: 1.0 References: <20220621090414.433602-1-vdonnefort@google.com> X-Mailer: git-send-email 2.37.0.rc0.104.g0611611a94-goog Subject: [PATCH v11 5/7] sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu() From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com, Lukasz Luba Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Dietmar Eggemann The Perf Domain (PD) cpumask (struct em_perf_domain.cpus) stays invariant after Energy Model creation, i.e. it is not updated after CPU hotplug operations. That's why the PD mask is used in conjunction with the cpu_online_mask (or Sched Domain cpumask). Thereby the cpu_online_mask is fetched multiple times (in compute_energy()) during a run-queue selection for a task. cpu_online_mask may change during this time which can lead to wrong energy calculations. To be able to avoid this, use the select_rq_mask per-cpu cpumask to create a cpumask out of PD cpumask and cpu_online_mask and pass it through the function calls of the EAS run-queue selection path. The PD cpumask for max_spare_cap_cpu/compute_prev_delta selection (find_energy_efficient_cpu()) is now ANDed not only with the SD mask but also with the cpu_online_mask. This is fine since this cpumask has to be in syc with the one used for energy computation (compute_energy()). An exclusive cpuset setup with at least one asymmetric CPU capacity island (hence the additional AND with the SD cpumask) is the obvious exception here. Signed-off-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Tested-by: Lukasz Luba diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aad1c2248547..112f760ff47e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6701,14 +6701,14 @@ static unsigned long cpu_util_without(int cpu, stru= ct task_struct *p) * task. */ static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus, + struct perf_domain *pd) { - struct cpumask *pd_mask =3D perf_domain_span(pd); unsigned long max_util =3D 0, sum_util =3D 0, cpu_cap; int cpu; =20 - cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(pd_mask)); - cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(pd_mask)); + cpu_cap =3D arch_scale_cpu_capacity(cpumask_first(cpus)); + cpu_cap -=3D arch_scale_thermal_pressure(cpumask_first(cpus)); =20 /* * The capacity state of CPUs of the current rd can be driven by CPUs @@ -6719,7 +6719,7 @@ compute_energy(struct task_struct *p, int dst_cpu, st= ruct perf_domain *pd) * If an entire pd is outside of the current rd, it will not appear in * its pd list and will not be accounted by compute_energy(). 
*/ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + for_each_cpu(cpu, cpus) { unsigned long util_freq =3D cpu_util_next(cpu, p, dst_cpu); unsigned long cpu_util, util_running =3D util_freq; struct task_struct *tsk =3D NULL; @@ -6806,6 +6806,7 @@ compute_energy(struct task_struct *p, int dst_cpu, st= ruct perf_domain *pd) */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { + struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta =3D ULONG_MAX, best_delta =3D ULONG_MAX; struct root_domain *rd =3D cpu_rq(smp_processor_id())->rd; int cpu, best_energy_cpu =3D prev_cpu, target =3D -1; @@ -6840,7 +6841,9 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) unsigned long base_energy_pd; int max_spare_cap_cpu =3D -1; =20 - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + + for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; =20 @@ -6877,12 +6880,12 @@ static int find_energy_efficient_cpu(struct task_st= ruct *p, int prev_cpu) continue; =20 /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd =3D compute_energy(p, -1, pd); + base_energy_pd =3D compute_energy(p, -1, cpus, pd); base_energy +=3D base_energy_pd; =20 /* Evaluate the energy impact of using prev_cpu. */ if (compute_prev_delta) { - prev_delta =3D compute_energy(p, prev_cpu, pd); + prev_delta =3D compute_energy(p, prev_cpu, cpus, pd); if (prev_delta < base_energy_pd) goto unlock; prev_delta -=3D base_energy_pd; @@ -6891,7 +6894,8 @@ static int find_energy_efficient_cpu(struct task_stru= ct *p, int prev_cpu) =20 /* Evaluate the energy impact of using max_spare_cap_cpu. 
*/ if (max_spare_cap_cpu >=3D 0) { - cur_delta =3D compute_energy(p, max_spare_cap_cpu, pd); + cur_delta =3D compute_energy(p, max_spare_cap_cpu, cpus, + pd); if (cur_delta < base_energy_pd) goto unlock; cur_delta -=3D base_energy_pd; --=20 2.37.0.rc0.104.g0611611a94-goog From nobody Sat May 18 21:00:32 2024 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 46C53C43334 for ; Tue, 21 Jun 2022 09:04:48 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1348637AbiFUJEq (ORCPT ); Tue, 21 Jun 2022 05:04:46 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48374 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1348491AbiFUJEl (ORCPT ); Tue, 21 Jun 2022 05:04:41 -0400 Received: from mail-wm1-x349.google.com (mail-wm1-x349.google.com [IPv6:2a00:1450:4864:20::349]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D44E61A813 for ; Tue, 21 Jun 2022 02:04:39 -0700 (PDT) Received: by mail-wm1-x349.google.com with SMTP id 206-20020a1c02d7000000b0039c9a08c52bso4053489wmc.4 for ; Tue, 21 Jun 2022 02:04:39 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=20210112; h=date:in-reply-to:message-id:mime-version:references:subject:from:to :cc; bh=4KZsNDPCzJ207WML9wXC024ukG812EyGktRwhEK7nuM=; b=rxqkeNofePLxPhWW6xKvi8do244IN7uoiswhwgv8j3JwMquRffDY5LsF9pjd7ONtx9 caJtDnF9unphAAzyRzm2SM2vnjenu70vjxNtuWqXeQzLUb4oraZS+mwy++0AMx+c7SVg VhXVH+YQ8xmcjigtlHnugsIUv0bzkPrKRzssEO5uLP7rwWmoUe9FRHQgfsHiDBrqlS0h 98YvterYZb/P9EnsHSfL5dp3+eC5RsN0NaxlQZeRW4oG1Mmc1iko/rIJNpnmpfjqiIc0 AtsZ9ZIoTbHIw84LKQa0zpVlOCoT4qed4YuaFU2l/uc5cGJsysR7EJiHOAiNco32LJ6d ITSg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:date:in-reply-to:message-id:mime-version :references:subject:from:to:cc; bh=4KZsNDPCzJ207WML9wXC024ukG812EyGktRwhEK7nuM=; b=W6tI5M/6l/GFWx5PTuR/6WWlo7kv0u6k8YVlbf4SfLWNEmeTzyRZ9vrA2rCg8ieRZ4 8x8GuDk0pFfX4BlvTkR0L9piFlqEnXYEOyfuFHw9oxuCZ5DMx9t0bK3mPvMv8A2WqNr8 6zhqB8Akoxl5JTfkRJMGRI7M91lUMtqq8qds6hiADH98Dd3fXo/ji3eN1XqwLJ7yT7vZ VGmuhqk6QPQ63+nWDPs7AaqIeuhp1bTbFC1jK+8xOXP9ILxX/+1oEqUFJGliDzf3eLaE dSQIwLhJUGTgHtOAodh+bapoa9bt2CCtSBt8WJRAsUbUaSIh5603GR4Bq1PIUw0lZ/wA BxQw== X-Gm-Message-State: AOAM531i2IBfK80OYSC0ve5CzvqZvlMfobfqImbwkUFg4f4RdhFxXqUo S9Hy767q3Ck2kJ53CwlXjGuanZ2oy4kcuCfh X-Google-Smtp-Source: ABdhPJxfRe12sbYw0LlETkCtcIUiqVxXk++WdgNbODvvjCW+qqJQadlPUWumCqCiXVn4jT9P9AyYSO6LQ5DEcp9l X-Received: from vdonnefort.c.googlers.com ([fda3:e722:ac3:cc00:28:9cb1:c0a8:2eea]) (user=vdonnefort job=sendgmr) by 2002:a05:600c:1d12:b0:39c:4307:8b10 with SMTP id l18-20020a05600c1d1200b0039c43078b10mr39432649wms.103.1655802278317; Tue, 21 Jun 2022 02:04:38 -0700 (PDT) Date: Tue, 21 Jun 2022 10:04:13 +0100 In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com> Message-Id: <20220621090414.433602-7-vdonnefort@google.com> Mime-Version: 1.0 References: <20220621090414.433602-1-vdonnefort@google.com> X-Mailer: git-send-email 2.37.0.rc0.104.g0611611a94-goog Subject: [PATCH v11 6/7] sched/fair: Remove task_util from effective utilization in feec() From: Vincent Donnefort To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com, morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com, 
tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com, Vincent Donnefort , Lukasz Luba Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Vincent Donnefort The energy estimation in find_energy_efficient_cpu() (feec()) relies on the computation of the effective utilization for each CPU of a perf domain (PD). This effective utilization is then used as an estimation of the busy time for this pd. The function effective_cpu_util() which gives this value, scales the utilization relative to IRQ pressure on the CPU to take into account that the IRQ time is hidden from the task clock. The IRQ scaling is as follow: effective_cpu_util =3D irq + (cpu_cap - irq)/cpu_cap * util Where util is the sum of CFS/RT/DL utilization, cpu_cap the capacity of the CPU and irq the IRQ avg time. If now we take as an example a task placement which doesn't raise the OPP on the candidate CPU, we can write the energy delta as: delta =3D OPPcost/cpu_cap * (effective_cpu_util(cpu_util + task_util) - effective_cpu_util(cpu_util)) =3D OPPcost/cpu_cap * (cpu_cap - irq)/cpu_cap * task_util We end-up with an energy delta depending on the IRQ avg time, which is a problem: first the time spent on IRQs by a CPU has no effect on the additional energy that would be consumed by a task. Second, we don't want to favour a CPU with a higher IRQ avg time value. Nonetheless, we need to take the IRQ avg time into account. If a task placement raises the PD's frequency, it will increase the energy cost for the entire time where the CPU is busy. A solution is to only use effective_cpu_util() with the CPU contribution part. The task contribution is added separately and scaled according to prev_cpu's IRQ time. No change for the FREQUENCY_UTIL component of the energy estimation. We still want to get the actual frequency that would be selected after the task placement. Signed-off-by: Vincent Donnefort Signed-off-by: Vincent Donnefort Reviewed-by: Dietmar Eggemann Tested-by: Lukasz Luba diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 112f760ff47e..0a28891cb178 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6694,61 +6694,96 @@ static unsigned long cpu_util_without(int cpu, stru= ct task_struct *p) } =20 /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utiliz= ation - * landscape of @pd's CPUs after the task migration, and uses the Energy M= odel - * to compute what would be the energy if we decided to actually migrate t= hat - * task. + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test= the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain. + * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). + */ +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; + +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPUs where the task has + * run. 
 */
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus,
-	       struct perf_domain *pd)
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+				       struct task_struct *p, int prev_cpu)
 {
-	unsigned long max_util = 0, sum_util = 0, cpu_cap;
+	unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+	unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
+
+	if (unlikely(irq >= max_cap))
+		busy_time = max_cap;
+	else
+		busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
+
+	eenv->task_busy_time = busy_time;
+}
+
+/*
+ * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
+ * utilization for each @pd_cpus, it however doesn't take into account
+ * clamping since the ratio (utilization / cpu_capacity) is already enough to
+ * scale the EM reported power consumption at the (eventually clamped)
+ * cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util_next()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
+ *
+ *   - A stable PD utilization, no matter which CPU of that PD we want to place
+ *     the task on.
+ *
+ *   - A fair comparison between CPUs as the task contribution (task_util())
+ *     will always be the same no matter which CPU utilization we rely on
+ *     (util_avg or util_est).
+ *
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
+ */
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+				     struct cpumask *pd_cpus,
+				     struct task_struct *p)
+{
+	unsigned long busy_time = 0;
 	int cpu;

-	cpu_cap = arch_scale_cpu_capacity(cpumask_first(cpus));
-	cpu_cap -= arch_scale_thermal_pressure(cpumask_first(cpus));
+	for_each_cpu(cpu, pd_cpus) {
+		unsigned long util = cpu_util_next(cpu, p, -1);

-	/*
-	 * The capacity state of CPUs of the current rd can be driven by CPUs
-	 * of another rd if they belong to the same pd. So, account for the
-	 * utilization of these CPUs too by masking pd with cpu_online_mask
-	 * instead of the rd span.
-	 *
-	 * If an entire pd is outside of the current rd, it will not appear in
-	 * its pd list and will not be accounted by compute_energy().
-	 */
-	for_each_cpu(cpu, cpus) {
-		unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
-		unsigned long cpu_util, util_running = util_freq;
-		struct task_struct *tsk = NULL;
+		busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+	}

-		/*
-		 * When @p is placed on @cpu:
-		 *
-		 *   util_running = max(cpu_util, cpu_util_est) +
-		 *		    max(task_util, _task_util_est)
-		 *
-		 * while cpu_util_next is: max(cpu_util + task_util,
-		 *			       cpu_util_est + _task_util_est)
-		 */
-		if (cpu == dst_cpu) {
-			tsk = p;
-			util_running =
-				cpu_util_next(cpu, p, -1) + task_util_est(p);
-		}
+	eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
+}

-		/*
-		 * Busy time computation: utilization clamping is not
-		 * required since the ratio (sum_util / cpu_capacity)
-		 * is already enough to scale the EM reported power
-		 * consumption at the (eventually clamped) cpu_capacity.
-		 */
-		cpu_util = effective_cpu_util(cpu, util_running, ENERGY_UTIL,
-					      NULL);
+/*
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @eenv->cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
+ */
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+		 struct task_struct *p, int dst_cpu)
+{
+	unsigned long max_util = 0;
+	int cpu;

-		sum_util += min(cpu_util, cpu_cap);
+	for_each_cpu(cpu, pd_cpus) {
+		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+		unsigned long util = cpu_util_next(cpu, p, dst_cpu);
+		unsigned long cpu_util;

 		/*
 		 * Performance domain frequency: utilization clamping
@@ -6757,12 +6792,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus,
 		 * NOTE: in case RT tasks are running, by default the
 		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		cpu_util = effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL,
-					      tsk);
-		max_util = max(max_util, min(cpu_util, cpu_cap));
+		cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+		max_util = max(max_util, cpu_util);
 	}

-	return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap);
+	return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+	       struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+	unsigned long busy_time = eenv->pd_busy_time;
+
+	if (dst_cpu >= 0)
+		busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+	return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
 }

 /*
@@ -6808,11 +6860,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
-	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 	int cpu, best_energy_cpu = prev_cpu, target = -1;
-	unsigned long cpu_cap, util, base_energy = 0;
+	struct root_domain *rd = this_rq()->rd;
+	unsigned long base_energy = 0;
 	struct sched_domain *sd;
 	struct perf_domain *pd;
+	struct energy_env eenv;

 	rcu_read_lock();
 	pd = rcu_dereference(rd->pd);
@@ -6835,22 +6888,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	if (!task_util_est(p))
 		goto unlock;

+	eenv_task_busy_time(&eenv, p, prev_cpu);
+
 	for (; pd; pd = pd->next) {
-		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+		unsigned long cpu_cap, cpu_thermal_cap, util;
+		unsigned long cur_delta, max_spare_cap = 0;
 		bool compute_prev_delta = false;
 		unsigned long base_energy_pd;
 		int max_spare_cap_cpu = -1;

 		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);

-		for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) {
+		if (cpumask_empty(cpus))
+			continue;
+
+		/* Account thermal pressure for the energy estimation */
+		cpu = cpumask_first(cpus);
+		cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
+		cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+
+		eenv.cpu_cap = cpu_thermal_cap;
+		eenv.pd_cap = 0;
+
+		for_each_cpu(cpu, cpus) {
+			eenv.pd_cap += cpu_thermal_cap;
+
+			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+				continue;
+
 			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 				continue;

 			util = cpu_util_next(cpu, p, cpu);
 			cpu_cap = capacity_of(cpu);
-			spare_cap = cpu_cap;
-			lsub_positive(&spare_cap, util);

 			/*
 			 * Skip CPUs that cannot satisfy the capacity request.
@@ -6863,15 +6933,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			if (!fits_capacity(util, cpu_cap))
 				continue;

+			lsub_positive(&cpu_cap, util);
+
 			if (cpu == prev_cpu) {
 				/* Always use prev_cpu as a candidate. */
 				compute_prev_delta = true;
-			} else if (spare_cap > max_spare_cap) {
+			} else if (cpu_cap > max_spare_cap) {
 				/*
 				 * Find the CPU with the maximum spare capacity
 				 * in the performance domain.
 				 */
-				max_spare_cap = spare_cap;
+				max_spare_cap = cpu_cap;
 				max_spare_cap_cpu = cpu;
 			}
 		}
@@ -6879,13 +6951,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		if (max_spare_cap_cpu < 0 && !compute_prev_delta)
 			continue;

+		eenv_pd_busy_time(&eenv, cpus, p);
 		/* Compute the 'base' energy of the pd, without @p */
-		base_energy_pd = compute_energy(p, -1, cpus, pd);
+		base_energy_pd = compute_energy(&eenv, pd, cpus, p, -1);
 		base_energy += base_energy_pd;

 		/* Evaluate the energy impact of using prev_cpu. */
 		if (compute_prev_delta) {
-			prev_delta = compute_energy(p, prev_cpu, cpus, pd);
+			prev_delta = compute_energy(&eenv, pd, cpus, p,
+						    prev_cpu);
+			/* CPU utilization has changed */
 			if (prev_delta < base_energy_pd)
 				goto unlock;
 			prev_delta -= base_energy_pd;
@@ -6894,8 +6969,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)

 		/* Evaluate the energy impact of using max_spare_cap_cpu. */
 		if (max_spare_cap_cpu >= 0) {
-			cur_delta = compute_energy(p, max_spare_cap_cpu, cpus,
-						   pd);
+			cur_delta = compute_energy(&eenv, pd, cpus, p,
+						   max_spare_cap_cpu);
+			/* CPU utilization has changed */
 			if (cur_delta < base_energy_pd)
 				goto unlock;
 			cur_delta -= base_energy_pd;
-- 
2.37.0.rc0.104.g0611611a94-goog

From nobody Sat May 18 21:00:32 2024
Date: Tue, 21 Jun 2022 10:04:14 +0100
In-Reply-To: <20220621090414.433602-1-vdonnefort@google.com>
Message-Id: <20220621090414.433602-8-vdonnefort@google.com>
References: <20220621090414.433602-1-vdonnefort@google.com>
Subject: [PATCH v11 7/7] sched/fair: Remove the energy margin in feec()
From: Vincent Donnefort
To: peterz@infradead.org, mingo@redhat.com, vincent.guittot@linaro.org
Cc: linux-kernel@vger.kernel.org, dietmar.eggemann@arm.com,
    morten.rasmussen@arm.com, chris.redpath@arm.com, qperret@google.com,
    tao.zhou@linux.dev, kernel-team@android.com, vdonnefort@google.com,
    Vincent Donnefort, Lukasz Luba

From: Vincent Donnefort

find_energy_efficient_cpu() integrates a margin to protect tasks from
bouncing back and forth between CPUs. This margin is set to 6% of the
total energy currently estimated for the system. This, however, does not
work, for two reasons:

1. The energy estimation is not a good absolute value:

compute_energy() used in feec() is a good estimation for task placement,
as it allows comparing the energy with and without a task. The computed
delta gives a good overview of the cost of a given task placement. It
doesn't, however, work as an absolute estimation of the total energy of
the system. First, it adds the contribution of idle CPUs to the energy;
second, it mixes util_avg with util_est values. util_avg reflects the
near history of a CPU's usage; it doesn't tell at all what the current
utilization is. A system that has been quite busy in the near past will
therefore hold a very high energy value, and thus a high margin,
preventing any task migration to a lower-capacity CPU and wasting energy.
It even creates a negative feedback loop: by holding tasks on a less
efficient CPU, the margin contributes to keeping the energy high.

2. The margin handicaps small tasks:

On a system where the workload is composed mostly of small tasks (which
is often the case on Android), the overall energy will be high enough to
create a margin none of those tasks can cross. On a Pixel4, a small
utilization of 5% on all the CPUs creates a global estimated energy of
140 joules, as per the Energy Model declaration of that same device. This
means, after applying the 6% margin, that any migration must save more
than 8 joules to happen. No task with a utilization lower than 40 would
then be able to migrate away from the biggest CPU of the system.
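
To make the arithmetic above concrete, here is a tiny user-space sketch
(illustrative only: the 140 J figure is the approximate Pixel4 estimate
quoted above, the >> 4 shift stands in for the "6%" margin since 1/16 is
~6.25%, and the real check in feec() shifts (prev_delta + base_energy),
not the total energy alone):

	#include <stdio.h>

	int main(void)
	{
		/* Approximate Pixel4 figure quoted above, in millijoules. */
		unsigned long total_energy = 140 * 1000;

		/* Stand-in for the ">> 4" margin used by feec(): ~6.25%. */
		unsigned long margin = total_energy >> 4;

		/* A migration must save more than this to be picked: ~8.75 J. */
		printf("margin: %lu mJ\n", margin);
		return 0;
	}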
The 6% of the overall system energy was brought in by the following
patch: (eb92692b2544 sched/fair: Speed-up energy-aware wake-ups). It was
previously 6% of the prev_cpu energy. Also, the following one made this
margin value conditional on the clusters where the task fits:
(8d4c97c105ca sched/fair: Only compute base_energy_pd if necessary).

We could simply revert that margin change to what it was, but the
original version didn't have strong grounds either and, as demonstrated
in (1.), the estimated energy isn't a good absolute value. Instead,
remove it completely. This is made possible by recent changes that
improved the fairness of the energy estimation comparison (sched/fair:
Remove task_util from effective utilization in feec()), (PM: EM: Increase
energy calculation precision) and stabilized task utilization
(sched/fair: Decay task util_avg during migration).

Without a margin, we could have feared tasks bouncing between CPUs, but
running LISA's eas_behaviour test coverage on three different platforms
(Hikey960, RB-5 and DB-845) showed no issue.

Removing the energy margin enables more energy-optimized placements for a
more energy-efficient system.

Signed-off-by: Vincent Donnefort
Signed-off-by: Vincent Donnefort
Reviewed-by: Dietmar Eggemann
Tested-by: Lukasz Luba

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0a28891cb178..44cf443d1efe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6860,9 +6860,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
-	int cpu, best_energy_cpu = prev_cpu, target = -1;
 	struct root_domain *rd = this_rq()->rd;
-	unsigned long base_energy = 0;
+	int cpu, best_energy_cpu, target = -1;
 	struct sched_domain *sd;
 	struct perf_domain *pd;
 	struct energy_env eenv;
@@ -6894,8 +6893,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		unsigned long cpu_cap, cpu_thermal_cap, util;
 		unsigned long cur_delta, max_spare_cap = 0;
 		bool compute_prev_delta = false;
-		unsigned long base_energy_pd;
 		int max_spare_cap_cpu = -1;
+		unsigned long base_energy;

 		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);

@@ -6953,17 +6952,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)

 		eenv_pd_busy_time(&eenv, cpus, p);
 		/* Compute the 'base' energy of the pd, without @p */
-		base_energy_pd = compute_energy(&eenv, pd, cpus, p, -1);
-		base_energy += base_energy_pd;
+		base_energy = compute_energy(&eenv, pd, cpus, p, -1);

 		/* Evaluate the energy impact of using prev_cpu. */
 		if (compute_prev_delta) {
 			prev_delta = compute_energy(&eenv, pd, cpus, p,
 						    prev_cpu);
 			/* CPU utilization has changed */
-			if (prev_delta < base_energy_pd)
+			if (prev_delta < base_energy)
 				goto unlock;
-			prev_delta -= base_energy_pd;
+			prev_delta -= base_energy;
 			best_delta = min(best_delta, prev_delta);
 		}

@@ -6972,9 +6970,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			cur_delta = compute_energy(&eenv, pd, cpus, p,
 						   max_spare_cap_cpu);
 			/* CPU utilization has changed */
-			if (cur_delta < base_energy_pd)
+			if (cur_delta < base_energy)
 				goto unlock;
-			cur_delta -= base_energy_pd;
+			cur_delta -= base_energy;
 			if (cur_delta < best_delta) {
 				best_delta = cur_delta;
 				best_energy_cpu = max_spare_cap_cpu;
@@ -6983,12 +6981,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	}
 	rcu_read_unlock();

-	/*
-	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
-	 * least 6% of the energy used by prev_cpu.
-	 */
-	if ((prev_delta == ULONG_MAX) ||
-	    (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+	if (best_delta < prev_delta)
 		target = best_energy_cpu;

 	return target;
-- 
2.37.0.rc0.104.g0611611a94-goog
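
As a closing illustration of the busy-time split introduced by patch 6/7
above, here is a minimal user-space sketch. All figures are made up, the
names only mirror the patch, and scale_irq() models what the kernel's
scale_irq_capacity() helper computes, util * (max - irq) / max:

	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	/* Models scale_irq_capacity(): util * (max_cap - irq) / max_cap. */
	static unsigned long scale_irq(unsigned long util, unsigned long irq,
				       unsigned long max_cap)
	{
		return util * (max_cap - irq) / max_cap;
	}

	int main(void)
	{
		unsigned long max_cap   = 1024; /* capacity of prev_cpu (made up)     */
		unsigned long irq       = 100;  /* avg IRQ time on prev_cpu (made up) */
		unsigned long task_util = 200;  /* task_util_est(p) (made up)         */
		unsigned long pd_util   = 500;  /* PD busy time without the task      */
		unsigned long pd_cap    = 2048; /* e.g. two CPUs of capacity 1024     */

		/* eenv_task_busy_time(): task contribution, scaled by prev_cpu's IRQ. */
		unsigned long task_busy_time = scale_irq(task_util, irq, max_cap);

		/* compute_energy() with dst_cpu >= 0: add the task, clamped to pd_cap. */
		unsigned long busy_time = MIN(pd_cap, pd_util + task_busy_time);

		printf("task_busy_time=%lu busy_time=%lu\n", task_busy_time, busy_time);
		return 0;
	}

The point the sketch tries to show is that prev_cpu's IRQ time scales
only the task's own contribution, so the per-PD busy time stays identical
whichever candidate CPU is being evaluated.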