From: wangtao
To: , , ,
CC: , , , , , , , , , wangtao
Subject: [PATCH v3] sched/fair: make task join/leave scheduling more stable
Date: Mon, 8 Dec 2025 17:11:17 +0800
Message-ID: <20251208091117.2727-1-tao.wangtao@honor.com>

Suppose the current V = V0, all weights are 1, and we add 3 tasks with
join_lag (jlag) values 0, 36 and -12.
Considering the current preserve_lag handling, which inflates the stored
lag as vlag_i = (W + w_i) * vlag_i' / W:

-----------------------------------------
 event  | vlag' | vlag |   v   | W |  V
-----------------------------------------
 T1 join|    0  |   0  | V0    | 1 | V0
 T2 join|   36  |  72  | V0-72 | 2 | V0-36
 T3 join|  -12  | -18  | V0-18 | 3 | V0-30
-----------------------------------------

Because V moves backwards after T2 joins, we end up with
v_T2 < v_T3 < v_T1 even though lag_T3 < 0 = lag_T1, so the schedule order
is T2, T3, T1. A similar issue exists even without preserve_lag: if the
same tasks are added in the order T3, T2, T1, the schedule order becomes
T2, T1, T3, i.e. the outcome depends on the join order, which shows the
instability.

We define the join_lag of each joining/leaving task j as jlag_j, with
vjlag_j = jlag_j / w_j its virtual form, and let J = \Sum jlag_i / W be
the average jlag of the tasks in the queue. Setting the new task's
v_j = V - (vjlag_j - J) keeps V + J unchanged across task join/leave,
which makes scheduling more stable and fair.

Using the same example, with jlag values 0, 36 and -12:

-------------------------------------------------------------------------------
time | event      | W | avgJ | sumJ | jlag |   V+J    |    v    |    V    | lag
-------------------------------------------------------------------------------
t    | T1 joins+  | 1 |   0 <|   0 <|    0 |> V0      |> V0     |> V0     |>   0
t    | T2 joins+  | 2 |  18 <|  36 <|   36 |> V0      |> V0-36  |> V0-18  |>  18
t    | T3 joins+  | 3 |   8 <|  24 <|  -12 |> V0      |> V0+12  |> V0-8   |> -20
t    | pick T2    | 3 |   8  |  24  |  36 <|  V0      |  V0-36  |  V0-8   |>  28
t+24 | T2 leaves- | 3 |   8  |  24  |  20 <|  V0+8   <|  V0-12  |> V0     |>  12
t+24 | T2 leaves+ | 2 |   2 <|   4 <|  NA  |  V0+8    |  NA     |> V0+6   |  NA
t+24 | pick T1    | 2 |   2  |   4  |   8 <|  V0+8    |  V0     |  V0+6   |>   6
t+40 | T1 leaves- | 2 |   2  |   4  |   0 <|  V0+16  <|  V0+16  |> V0+14  |>  -2
t+40 | T1 leaves+ | 1 |   4 <|   4 <|  NA  |  V0+16   |  NA     |> V0+12  |  NA
t+40 | T2 rejoins | 2 |   8 <|  16 <|   12 |> V0+16   |> V0+4   |> V0+8   |>   4
t+40 | pick T2    | 2 |   8  |  16  |  12 <|  V0+16   |  V0+4   |  V0+8   |>   4
t+52 | T2 leaves- | 2 |   8  |  16  |   6 <|  V0+22  <|  V0+16  |> V0+14  |>  -2
t+52 | T2 leaves+ | 1 |  10 <|  10 <|  NA  |  V0+22   |  NA     |> V0+12  |  NA
t+52 | pick T3    | 1 |  10  |  10  |  10 <|  V0+22   |  V0+12  |  V0+12  |>   0
t+60 | T3 leaves- | 1 |  10  |  10  |  10 <|  V0+30  <|  V0+20  |> V0+20  |>   0
t+60 | T3 leaves+ | 0 |   0  |   0  |  NA  |  NA      |  NA     |> V0+20  |  NA
-------------------------------------------------------------------------------
note: '<' and '>' mark values that are recalculated at that event;
      '-' rows show the state just before the leaving task is removed from
      the accounting, '+' rows the state just after the join/leave.
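The V + J bookkeeping above can be sanity-checked with a small user-space
simulation. The sketch below is illustrative only, not kernel code: it
assumes unit weights, plain integer arithmetic and V0 = 0, ignores the
scale_load_down()/avg_load handling, rbtree ordering and curr/on_rq
special cases of fair.c, and the names toy_rq, toy_se, join(), leave()
and run() are made up for this example. It replays the first rows of the
table and checks that V + J only advances with runtime, never across a
join or leave.

/*
 * Minimal user-space sketch of the V + J bookkeeping described above.
 * NOT kernel code: all weights are 1 and plain integers are used.
 */
#include <assert.h>
#include <stdio.h>

struct toy_se { long w, v, vlag; };

struct toy_rq {
	long W;		/* \Sum w_i			*/
	long S;		/* \Sum w_i * v_i, so V = S / W	*/
	long sumJ;	/* \Sum jlag_i,    so J = sumJ / W */
};

static long V(struct toy_rq *rq) { return rq->W ? rq->S / rq->W : 0; }
static long J(struct toy_rq *rq) { return rq->W ? rq->sumJ / rq->W : 0; }

/* join: v_j = V - (se->vlag - J); se->vlag plays the role of vjlag_j */
static void join(struct toy_rq *rq, struct toy_se *se)
{
	se->v = V(rq) + J(rq) - se->vlag;	/* place_entity() analogue */
	rq->sumJ += se->w * se->vlag;		/* sum_jlag_add() analogue */
	rq->W += se->w;
	rq->S += se->w * se->v;
}

/* leave: the accounted vjlag on exit is vlag + J, as in sum_jlag_sub() */
static void leave(struct toy_rq *rq, struct toy_se *se)
{
	se->vlag = V(rq) - se->v;		/* update_entity_lag() analogue */
	rq->sumJ -= se->w * (se->vlag + J(rq));	/* sum_jlag_sub() analogue */
	rq->W -= se->w;
	rq->S -= se->w * se->v;
}

/* the picked task runs for 'delta': V advances by delta / W, J does not */
static void run(struct toy_rq *rq, struct toy_se *se, long delta)
{
	se->v += delta / se->w;
	rq->S += delta;
}

int main(void)
{
	struct toy_rq rq = { 0, 0, 0 };			/* V0 = 0 */
	struct toy_se t1 = { .w = 1, .vlag =   0 };
	struct toy_se t2 = { .w = 1, .vlag =  36 };
	struct toy_se t3 = { .w = 1, .vlag = -12 };

	join(&rq, &t1);
	join(&rq, &t2);
	join(&rq, &t3);
	printf("after joins: V=%ld J=%ld V+J=%ld\n", V(&rq), J(&rq), V(&rq) + J(&rq));
	assert(V(&rq) + J(&rq) == 0);		/* joins leave V+J at V0 */

	run(&rq, &t2, 24);			/* pick T2, run for 24 */
	leave(&rq, &t2);
	printf("T2 leaves:   V=%ld J=%ld V+J=%ld\n", V(&rq), J(&rq), V(&rq) + J(&rq));
	assert(V(&rq) + J(&rq) == 24 / 3);	/* advanced only by runtime / W */
	return 0;
}

With V0 = 0 this prints V=-8 J=8 V+J=0 after the three joins and
V=6 J=2 V+J=8 after T2 leaves at t+24, matching the V0 and V0+8 rows of
the table above.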
Hackbench tests show that this patch reduces execution time, mainly due to
fewer task switches:

-------------------------------------------------
hackbench test            base     patch     opt
-------------------------------------------------
process 1 group:          0.119    0.107   -10.6%
process 10 group:         0.869    0.767   -11.7%
process 20 group:         1.840    1.579   -14.1%
thread 1 group:           0.089    0.074   -17.2%
thread 10 group:          0.555    0.480   -13.5%
thread 20 group:          1.099    1.022    -7.0%
pipe process 1 group:     0.126    0.084   -33.5%
pipe process 10 group:    0.810    0.673   -17.0%
pipe process 20 group:    1.625    1.314   -19.1%
pipe thread 1 group:      0.092    0.077   -16.7%
pipe thread 10 group:     0.503    0.465    -7.6%
pipe thread 20 group:     0.947    0.906    -4.4%
-------------------------------------------------

v2->v3: update the example and derivation
v2: https://lore.kernel.org/all/20251205105403.13278-1-tao.wangtao@honor.com/
v1: https://lore.kernel.org/all/20251128081118.20025-1-tao.wangtao@honor.com/T/#u

Signed-off-by: wangtao
---
 kernel/sched/debug.c |   2 +
 kernel/sched/fair.c  | 171 ++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |   2 +
 3 files changed, 98 insertions(+), 77 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..a4fd11c5ca2d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -830,6 +830,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(zero_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
 			SPLIT_NS(avg_vruntime(cfs_rq)));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vjlag",
+			SPLIT_NS(avg_vjlag(cfs_rq)));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
 			SPLIT_NS(right_vruntime));
 	spread = right_vruntime - left_vruntime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 769d7b7990df..10f48d0de322 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -593,7 +593,68 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  *
  * V +-= lag_i / W
  *
- * Also see the comment in place_entity() that deals with this. ]]
+ * If a joining/leaving task j has non-zero lag, V will change. Although
+ * v_i of other tasks remains unchanged, their vlag_i = V - v_i changes.
+ * We define the join_lag of each task as jlag, and the average jlag in the queue:
+ *
+ *   J = \Sum jlag_i / W
+ *
+ * 1) Before and after task j joins, v_i and jlag_i of other tasks remain
+ * unchanged:
+ *   W_j = W + w_j
+ *   V_j = (V * W + v_j * w_j) / W_j
+ *   V_j = (V * W_j - V * w_j + v_j * w_j) / W_j
+ *   V_j = V - (V - v_j) * w_j / W_j
+ * and
+ *   J_j = (J * W + jlag_j) / W_j
+ *   J_j = (J * W_j - J * w_j + vjlag_j * w_j) / W_j
+ *   J_j = J - (J - vjlag_j) * w_j / W_j
+ *
+ * Therefore:
+ *   (V_j + J_j) - (V + J) = ((v_j + vjlag_j) - (V + J)) * w_j / W_j
+ *
+ * The v_j of the joining task j is determined from its vjlag_j and V. If we set
+ *   v_j = V + J - vjlag_j
+ * then we have:
+ *   (V_j + J_j) - (V + J) = 0
+ * meaning adding a task keeps V_j + J_j unchanged.
+ *
+ * Since
+ *   v_j = V_j - vlag_j
+ * we get:
+ *   v_j = V_j - vlag_j = V + J - vjlag_j
+ *   vlag_j = vjlag_j - J_j
+ * so after adding task j, the preserved vlag is: vlag_last_leave - J_j.
+ *
+ * 2) When any task i runs, V changes with v_i, but J does not:
+ * from
+ *   v_i = V - vlag_i = V + J - vjlag_i
+ * we get
+ *   vlag_i = V - v_i
+ *   vjlag_i = V + J - v_i = vlag_i + J
+ * vlag_i and vjlag_i change accordingly.
+ *
+ * 3) When task l leaves, v_i and jlag_i of other tasks remain unchanged:
+ *   W' = W_l - w_l
+ *   V' = (V_l * W_l - v_l * w_l) / W'
+ *   V' = (V_l * W' + V_l * w_l - v_l * w_l) / W'
+ *   V' = V_l + (V_l - v_l) * w_l / W'
+ * and
+ *   J' = (J_l * W_l - jlag_l) / W'
+ *   J' = (J_l * W' + J_l * w_l - vjlag_l * w_l) / W'
+ *   J' = J_l + (J_l - vjlag_l) * w_l / W'
+ *
+ * Therefore:
+ *   (V' + J') - (V_l + J_l) = ((V_l + J_l) - (v_l + vjlag_l)) * w_l / W' = 0
+ * so V' + J' also remains unchanged when removing a task.
+ *
+ * For n tasks in the queue, the upper bound of \Sum jlag_i is:
+ *   \Sum jlag_i = \Sum (vjlag_i * w_i)
+ *               = \Sum (vlag_i + J) * w_i
+ *               = \Sum (vlag_i * w_i) + J * \Sum w_i
+ *               = \Sum lag_i + J * W
+ *               = 0 + J * W <= n * q
+ * where q is the upper bound of lag. ]]
  *
  * However, since v_i is u64, and the multiplication could easily overflow
  * transform it into a relative form that uses smaller quantities:
@@ -638,6 +699,33 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->avg_load -= weight;
 }
 
+s64 avg_vjlag(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	long load = cfs_rq->avg_load;
+
+	if (curr && curr->on_rq)
+		load += scale_load_down(curr->load.weight);
+
+	return load ? div_s64(cfs_rq->sum_jlag, load) : 0;
+}
+
+static void sum_jlag_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 jlag_join = se->vlag * weight;	/* preserve vlag: vlag - J_j */
+
+	cfs_rq->sum_jlag += jlag_join;
+}
+
+static void sum_jlag_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 jlag_leave = (se->vlag + avg_vjlag(cfs_rq)) * weight;
+
+	cfs_rq->sum_jlag -= jlag_leave;
+}
+
 static inline
 void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
 {
@@ -5106,82 +5194,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		se->slice = sysctl_sched_base_slice;
 	vslice = calc_delta_fair(se->slice, se);
 
-	/*
-	 * Due to how V is constructed as the weighted average of entities,
-	 * adding tasks with positive lag, or removing tasks with negative lag
-	 * will move 'time' backwards, this can screw around with the lag of
-	 * other tasks.
-	 *
-	 * EEVDF: placement strategy #1 / #2
-	 */
-	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
-		struct sched_entity *curr = cfs_rq->curr;
-		unsigned long load;
-
-		lag = se->vlag;
-
-		/*
-		 * If we want to place a task and preserve lag, we have to
-		 * consider the effect of the new entity on the weighted
-		 * average and compensate for this, otherwise lag can quickly
-		 * evaporate.
-		 *
-		 * Lag is defined as:
-		 *
-		 *    lag_i = S - s_i = w_i * (V - v_i)
-		 *
-		 * To avoid the 'w_i' term all over the place, we only track
-		 * the virtual lag:
-		 *
-		 *    vl_i = V - v_i <=> v_i = V - vl_i
-		 *
-		 * And we take V to be the weighted average of all v:
-		 *
-		 *    V = (\Sum w_j*v_j) / W
-		 *
-		 * Where W is: \Sum w_j
-		 *
-		 * Then, the weighted average after adding an entity with lag
-		 * vl_i is given by:
-		 *
-		 *    V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
-		 *       = (W*V + w_i*(V - vl_i)) / (W + w_i)
-		 *       = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
-		 *       = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
-		 *       = V - w_i*vl_i / (W + w_i)
-		 *
-		 * And the actual lag after adding an entity with vl_i is:
-		 *
-		 *       vl'_i = V' - v_i
-		 *             = V - w_i*vl_i / (W + w_i) - (V - vl_i)
-		 *             = vl_i - w_i*vl_i / (W + w_i)
-		 *
-		 * Which is strictly less than vl_i. So in order to preserve lag
-		 * we should inflate the lag before placement such that the
-		 * effective lag after placement comes out right.
-		 *
-		 * As such, invert the above relation for vl'_i to get the vl_i
-		 * we need to use such that the lag after placement is the lag
-		 * we computed before dequeue.
-		 *
-		 *    vl'_i = vl_i - w_i*vl_i / (W + w_i)
-		 *          = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
-		 *
-		 *    (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
-		 *                    = W*vl_i
-		 *
-		 *    vl_i = (W + w_i)*vl'_i / W
-		 */
-		load = cfs_rq->avg_load;
-		if (curr && curr->on_rq)
-			load += scale_load_down(curr->load.weight);
-
-		lag *= load + scale_load_down(se->load.weight);
-		if (WARN_ON_ONCE(!load))
-			load = 1;
-		lag = div_s64(lag, load);
-	}
-
+	/* v_j: V - (vjlag_j - J) */
+	if (sched_feat(PLACE_LAG))
+		lag = se->vlag - avg_vjlag(cfs_rq);
 	se->vruntime = vruntime - lag;
 
 	if (se->rel_deadline) {
@@ -5257,6 +5272,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	check_schedstat_required();
 	update_stats_enqueue_fair(cfs_rq, se, flags);
+	sum_jlag_add(cfs_rq, se);
 	if (!curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
@@ -5394,6 +5410,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		se->rel_deadline = 1;
 	}
 
+	sum_jlag_sub(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b419a4d98461..9957bf817b37 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -680,6 +680,7 @@ struct cfs_rq {
 
 	s64			avg_vruntime;
 	u64			avg_load;
+	s64			sum_jlag;
 
 	u64			zero_vruntime;
 #ifdef CONFIG_SCHED_CORE
@@ -3959,6 +3960,7 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
 #endif /* !CONFIG_SCHED_MM_CID */
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern s64 avg_vjlag(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 static inline void
 move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
-- 
2.17.1