From nobody Wed Dec 17 13:54:37 2025
From: Xavier
To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org
Cc: dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, linux-kernel@vger.kernel.org, oliver.sang@intel.com, Xavier
Subject: [PATCH-RT sched v4 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
Date: Wed, 17 Jul 2024 11:00:32 +0800
Message-Id: <20240717030033.309205-2-xavier_qy@163.com>
In-Reply-To: <20240717030033.309205-1-xavier_qy@163.com>
References: <202407170411.vRtOCOzx-lkp@intel.com> <20240717030033.309205-1-xavier_qy@163.com>
MIME-Version: 1.0

This patch optimizes the enqueue and dequeue paths for rt_se by using a
bottom-up removal approach: when removing an rt_se at a given level, if
the highest priority of the rq associated with that rt_se has not changed,
there is no need to keep removing rt_se at higher levels. Instead, only
the total number of removed tasks is recorded, and the rt_nr_running
counts of the higher-level rqs are decremented accordingly.
Signed-off-by: Xavier
---
 kernel/sched/debug.c |  48 ++++++++
 kernel/sched/rt.c    | 287 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |   1 +
 3 files changed, 298 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..352ee55da25e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -712,6 +712,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 }
 
+static void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
+{
+	struct task_struct *task;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->my_q) {
+		SEQ_printf_task_group_path(m, rt_se->my_q->tg, "%s\n");
+		return;
+	}
+#endif
+	task = container_of(rt_se, struct task_struct, rt);
+	SEQ_printf(m, " prio-%d, pid-%d, %s\n", task->prio, task->pid, task->comm);
+}
+
+/* Must be called with the rq lock held. */
+void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *rt_se;
+	struct list_head *queue, *head;
+	unsigned long bitmap[2];
+	int idx;
+	int count = 0;
+
+	if (!rt_rq->rt_nr_running)
+		return;
+
+	memcpy(bitmap, array->bitmap, sizeof(unsigned long) * 2);
+	idx = sched_find_first_bit(bitmap);
+	WARN_ON_ONCE(idx >= MAX_RT_PRIO);
+
+	while (1) {
+		clear_bit(idx, bitmap);
+		queue = array->queue + idx;
+		head = queue;
+		queue = queue->next;
+		do {
+			rt_se = list_entry(queue, struct sched_rt_entity, run_list);
+			print_rt_se(m, rt_se);
+			queue = queue->next;
+			count++;
+		} while (queue != head);
+		idx = sched_find_first_bit(bitmap);
+		if (idx >= MAX_RT_PRIO)
+			break;
+	}
+}
+
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..b18c424a50d2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1113,7 +1113,7 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 #endif /* CONFIG_SMP */
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-static void
+static int
 inc_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1122,9 +1122,11 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
 		rt_rq->highest_prio.curr = prio;
 
 	inc_rt_prio_smp(rt_rq, prio, prev_prio);
+
+	return prev_prio > prio;
 }
 
-static void
+static int
 dec_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1149,12 +1151,22 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 	}
 
 	dec_rt_prio_smp(rt_rq, prio, prev_prio);
+	if (rt_rq->highest_prio.curr > prio)
+		return prio;
+	else
+		return MAX_RT_PRIO;
 }
 
 #else
 
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline int inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
+static inline int dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 
@@ -1218,28 +1230,31 @@ unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
 }
 
 static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	int prio = rt_se_prio(rt_se);
+	int prio_change;
 
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
-	inc_rt_prio(rt_rq, prio);
+	prio_change = inc_rt_prio(rt_rq, prio);
 	inc_rt_group(rt_se, rt_rq);
+	return prio_change;
 }
 
 static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int prio)
 {
+	int prio_changed;
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
-	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	prio_changed = dec_rt_prio(rt_rq, prio);
 	dec_rt_group(rt_se, rt_rq);
+	return prio_changed;
 }
 
 /*
@@ -1255,12 +1270,13 @@ static inline bool move_entity(unsigned int flags)
 	return true;
 }
 
-static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
+			       struct rt_prio_array *array, int last_prio)
 {
 	list_del_init(&rt_se->run_list);
 
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
-		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+	if (list_empty(array->queue + last_prio))
+		__clear_bit(last_prio, array->bitmap);
 
 	rt_se->on_list = 0;
 }
@@ -1371,7 +1387,12 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 	}
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/*
+ * Returns: -1 if rt_se was not enqueued; 0 if the highest priority of the
+ * rq did not change after the enqueue; 1 if the highest priority of the
+ * rq changed after the enqueue.
+ */
+static int __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -1386,8 +1407,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	 */
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
 		if (rt_se->on_list)
-			__delist_rt_entity(rt_se, array);
-		return;
+			__delist_rt_entity(rt_se, array, rt_se_prio(rt_se));
+		return -1;
 	}
 
 	if (move_entity(flags)) {
@@ -1402,73 +1423,263 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	}
 	rt_se->on_rq = 1;
 
-	inc_rt_tasks(rt_se, rt_rq);
+	return inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/**
+ * __dequeue_rt_entity - delete rt_se from its rt_rq
+ *
+ * @rt_se:	the entity to be deleted
+ * @last_prio:	the highest priority of this rt_se before the previous
+ *		round of deletion
+ * @flags:	operation flags
+ *
+ * Returns: 0 if the highest priority of the current rq did not change
+ * during this deletion; >0 if it changed, in which case the return value
+ * is the previous highest priority, to be used in the next round of
+ * deletion.
+ */
+static int __dequeue_rt_entity(struct sched_rt_entity *rt_se, int last_prio,
+			       unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 
 	if (move_entity(flags)) {
 		WARN_ON_ONCE(!rt_se->on_list);
-		__delist_rt_entity(rt_se, array);
+		__delist_rt_entity(rt_se, array, last_prio);
 	}
 	rt_se->on_rq = 0;
 
-	dec_rt_tasks(rt_se, rt_rq);
+	return dec_rt_tasks(rt_se, rt_rq, last_prio);
+}
+
+static inline void dec_rq_nr_running(struct sched_rt_entity *rt_se,
+				     unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running -= rt;
+	rt_rq->rr_nr_running -= rr;
+}
+
+static inline void add_rq_nr_running(struct sched_rt_entity *rt_se,
+				     unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running += rt;
+	rt_rq->rr_nr_running += rr;
+}
+
+static inline bool on_top_rt_rq(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->parent)
+		return false;
+#endif
+	return true;
 }
 
 /*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
+ * To optimize the enqueue and dequeue of rt_se, this strategy employs a
+ * bottom-up removal approach. Specifically, when removing an rt_se at a
+ * certain level, if the highest priority of the rq associated with that
+ * rt_se has not changed, there is no need to continue removing rt_se at
+ * higher levels. In that case, only the total number of removed rt_se
+ * needs to be recorded, and the rt_nr_running counts of the higher-level
+ * rqs are decremented accordingly.
+ *
+ * For enqueue operations, if an rt_se at a certain level is on the rq,
+ * it is still necessary to check the priority of the higher-level rq:
+ * if the priority of the higher-level rq is lower than that of the rt_se
+ * to be added, it must be removed, because updating the highest priority
+ * of the rq during the addition will cause the rq to be repositioned in
+ * its parent rq.
+ *
+ * Conversely, for dequeue operations, if an rt_se at a certain level is
+ * not on the rq, the walk can stop immediately to avoid unnecessary
+ * checks and handling.
+ *
+ * For enqueue operations, the return value is the last rt_se that was
+ * removed. For dequeue operations, it is the last rt_se that was either
+ * removed or had its rt_nr_running updated.
 */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
+static struct sched_rt_entity *dequeue_rt_stack(struct sched_rt_entity *rt_se,
+					unsigned int flags, int for_enqueue)
 {
-	struct sched_rt_entity *back = NULL;
-	unsigned int rt_nr_running;
+	struct sched_rt_entity *last = rt_se;
+	struct sched_rt_entity *origin = rt_se;
+	unsigned int del_rt_nr = 0;
+	unsigned int del_rr_nr = 0;
+	int prio_changed = rt_se_prio(rt_se);
+	int sub_on_rq = 1;
 
 	for_each_sched_rt_entity(rt_se) {
-		rt_se->back = back;
-		back = rt_se;
-	}
+		if (on_rt_rq(rt_se)) {
+			if (sub_on_rq) {
+				/*
+				 * The tasks removed from the sub-level rt_se also need to
+				 * be subtracted from the rq of the current rt_se, as that
+				 * rq no longer includes the removed tasks.
+				 */
+				dec_rq_nr_running(rt_se, del_rt_nr, del_rr_nr);
+				if ((prio_changed != MAX_RT_PRIO) ||
+				    (rt_se_prio(rt_se) > rt_se_prio(origin))) {
+					/*
+					 * If removing the lower-level rt_se changed the highest
+					 * priority of the current rq, or if the priority of the
+					 * current rq is lower than the rt_se being added, then
+					 * the current rt_se also needs to be removed from its
+					 * parent rq, and the number of deleted tasks
+					 * accumulated.
+					 */
+					if (prio_changed == MAX_RT_PRIO)
+						prio_changed = rt_se_prio(rt_se);
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+							prio_changed, flags);
+					last = rt_se;
+				} else if (!for_enqueue) {
+					/*
+					 * For dequeue, last may point to an rt_se whose
+					 * rt_nr_running was merely updated.
+					 */
+					last = rt_se;
+				}
+			} else {
+				/*
+				 * This branch can only be reached for enqueue, as dequeue
+				 * would have stopped the walk at an rt_se that is not
+				 * queued. If the sub-level node is not queued and the
+				 * current rt_se's priority is lower than the one being
+				 * added, the current rt_se needs to be removed.
+				 */
+				prio_changed = rt_se_prio(rt_se);
+				if (prio_changed > rt_se_prio(origin)) {
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+							prio_changed, flags);
+					last = rt_se;
+				} else {
+					prio_changed = MAX_RT_PRIO;
+				}
+			}
 
-	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
+			/*
+			 * If the current rt_se is on the top rt_rq, the already deleted
+			 * tasks, plus the count of the rt_rq where the current rt_se is
+			 * located, need to be removed from the top rt_rq.
+			 */
+			if (on_top_rt_rq(rt_se)) {
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+					del_rt_nr + rt_rq_of_se(rt_se)->rt_nr_running);
+			}
+			sub_on_rq = 1;
+		} else if (for_enqueue) {
+			struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	for (rt_se = back; rt_se; rt_se = rt_se->back) {
-		if (on_rt_rq(rt_se))
-			__dequeue_rt_entity(rt_se, flags);
+			/*
+			 * For an enqueue operation, if a certain level is found not to
+			 * be queued, the previous counts need to be reset to zero.
+			 */
+			prio_changed = MAX_RT_PRIO;
+			sub_on_rq = 0;
+			del_rt_nr = 0;
+			del_rr_nr = 0;
+
+			/*
+			 * If the current group is being throttled, there is no need to
+			 * check higher levels, since enqueueing will not affect
+			 * higher-level nodes.
+			 */
+			if (group_rq && rt_rq_throttled(group_rq))
+				break;
+
+			if (on_top_rt_rq(rt_se))
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+					rt_rq_of_se(rt_se)->rt_nr_running);
+		} else {
+			last = rt_se;
+			break;
+		}
 	}
 
-	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
+	return last;
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int enqueue = 1;
+	int prio_change = 1;
 
 	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
-	for_each_sched_rt_entity(rt_se)
-		__enqueue_rt_entity(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 1);
+
+	for_each_sched_rt_entity(rt_se) {
+		if (enqueue || !on_rt_rq(rt_se) || (prio_change == 1)) {
+			prio_change = __enqueue_rt_entity(rt_se, flags);
+			if (prio_change >= 0) {
+				add_rt_nr = rt_se_nr_running(rt_se);
+				add_rr_nr = rt_se_rr_nr_running(rt_se);
+			} else {
+				add_rt_nr = add_rr_nr = 0;
+			}
+		} else {
+			add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+		}
+
+		if (rt_se == last)
+			enqueue = 0;
+	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int prio_change = 1;
 
 	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 0);
 
 	for_each_sched_rt_entity(rt_se) {
 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
+		if (rt_rq && rt_rq->rt_nr_running) {
+			if (on_rt_rq(rt_se)) {
+				add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+			} else {
+				prio_change = __enqueue_rt_entity(rt_se, flags);
+				if (prio_change == 0) {
+					/*
+					 * If the enqueue succeeded and the highest priority of
+					 * the rq did not change, the parent node only needs to
+					 * add the count of the current rt_se. Otherwise, the
+					 * parent node will also need to enqueue.
+					 */
+					add_rt_nr = rt_se_nr_running(rt_se);
+					add_rr_nr = rt_se_rr_nr_running(rt_se);
+				}
+			}
+		} else {
+			add_rt_nr = add_rr_nr = 0;
+		}
 
-		if (rt_rq && rt_rq->rt_nr_running)
-			__enqueue_rt_entity(rt_se, flags);
+		/*
+		 * last is the rt_se of the last deletion or count update, so
+		 * subsequent rt_se do not need to be touched.
+		 */
+		if (rt_se == last)
+			break;
 	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef20c61004eb..821d65106d13 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2879,6 +2879,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
 extern void resched_latency_warn(int cpu, u64 latency);
-- 
2.45.2

From nobody Wed Dec 17 13:54:37 2025
From: Xavier
To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org
Cc: dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, linux-kernel@vger.kernel.org, oliver.sang@intel.com, Xavier
Subject: [PATCH-RT sched v4 2/2] RT test: Adding test cases for RT group scheduling
Date: Wed, 17 Jul 2024 11:00:33 +0800
Message-Id: <20240717030033.309205-3-xavier_qy@163.com>
In-Reply-To: <20240717030033.309205-1-xavier_qy@163.com>
References: <202407170411.vRtOCOzx-lkp@intel.com> <20240717030033.309205-1-xavier_qy@163.com>
MIME-Version: 1.0

Add test cases for RT group scheduling: create several RT infinite-loop
processes/threads, set them to the same or different priorities, and
place them in different RT task groups. Run them for a period of time,
then count the number of loop iterations executed by all tasks.

Signed-off-by: Xavier
---
 MAINTAINERS                                   |   7 +
 tools/testing/selftests/sched/Makefile        |   4 +-
 tools/testing/selftests/sched/deadloop.c      | 192 ++++++++++++++++++
 .../selftests/sched/rt_group_sched_test.sh    | 119 +++++++++++
 4 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/sched/deadloop.c
 create mode 100755 tools/testing/selftests/sched/rt_group_sched_test.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 958e935449e5..f5cc821b8510 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19463,6 +19463,13 @@ L:	linux-remoteproc@vger.kernel.org
 S:	Maintained
 F:	drivers/tty/rpmsg_tty.c
 
+RT GROUP SCHED TEST
+M:	Xavier
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	tools/testing/selftests/sched/deadloop.c
+F:	tools/testing/selftests/sched/rt_group_sched_test.sh
+
 RTL2830 MEDIA DRIVER
 L:	linux-media@vger.kernel.org
 S:	Orphan
diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile
index 099ee9213557..96decb58bf35 100644
--- a/tools/testing/selftests/sched/Makefile
+++ b/tools/testing/selftests/sched/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -Wl,-rpath=./ \
 	     $(CLANG_FLAGS)
 LDLIBS += -lpthread
 
-TEST_GEN_FILES := cs_prctl_test
-TEST_PROGS := cs_prctl_test
+TEST_GEN_FILES := cs_prctl_test deadloop
+TEST_PROGS := cs_prctl_test deadloop
 
 include ../lib.mk
diff --git a/tools/testing/selftests/sched/deadloop.c b/tools/testing/selftests/sched/deadloop.c
new file mode 100644
index 000000000000..d850a3e2a0ab
--- /dev/null
+++ b/tools/testing/selftests/sched/deadloop.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/types.h>
+
+/*
+ * Create multiple infinite-loop threads based on the passed parameters.
+ * Usage: deadloop num policy prio
+ *   num:    the number of child threads
+ *   policy: the scheduling policy of the child threads, 0-fair, 1-fifo, 2-rr
+ *   prio:   the priority
+ * If this process is killed, it prints the loop count of all child threads
+ * to OUTPUT_FILE.
+ *
+ * Date: June 27, 2024
+ * Author: Xavier
+ */
+
+#define OUTPUT_FILE "rt_group_sched_test.log"
+
+#if __GLIBC_PREREQ(2, 30) == 0
+#include <sys/syscall.h>
+static pid_t gettid(void)
+{
+	return syscall(SYS_gettid);
+}
+#endif
+
+#define do_err(x)						\
+do {								\
+	if ((x) < 0) {						\
+		printf("test BUG_ON func %s, line %d %ld\n",	\
+			__func__, __LINE__, (long)(x)		\
+		);						\
+		while (1)					\
+			sleep(1);				\
+	}							\
+} while (0)
+
+#define do_false(x)						\
+do {								\
+	if ((x) == 1) {						\
+		printf("test BUG_ON func %s, line %d %d\n",	\
+			__func__, __LINE__, (x)			\
+		);						\
+		while (1)					\
+			sleep(1);				\
+	}							\
+} while (0)
+
+struct thread_data {
+	pthread_t thread;
+	int index;
+	int pid;
+	unsigned long cnt;
+};
+
+static struct thread_data *pdata;
+static int thread_num = 1;
+
+static void create_thread_posix(void *entry, pthread_t *thread, int *para,
+				int policy, int prio)
+{
+	int ret;
+	struct sched_param param;
+	pthread_attr_t attr;
+
+	memset(&param, 0, sizeof(param));
+	ret = pthread_attr_init(&attr);
+	do_err(ret);
+
+	ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+	do_err(ret);
+
+	param.sched_priority = prio;
+
+	ret = pthread_attr_setschedpolicy(&attr, policy);
+	do_err(ret);
+
+	ret = pthread_attr_setschedparam(&attr, &param);
+	do_err(ret);
+
+	ret = pthread_create(thread, &attr, entry, para);
+	do_err(ret);
+}
+
+static void *dead_loop_entry(void *arg)
+{
+	int index = *(int *)arg;
+	struct sched_param param;
+	int cur = gettid();
+
+	sched_getparam(cur, &param);
+	pdata[index].pid = cur;
+	printf("cur:%d prio:%d\n", cur, param.sched_priority);
+
+	while (1) {
+		asm volatile("" ::: "memory");
+		pdata[index].cnt++;
+	}
+	return NULL;
+}
+
+static void handle_signal(int signal)
+{
+	int cnt = 0;
+
+	if (signal == SIGTERM) {
+		FILE *file = freopen(OUTPUT_FILE, "a", stdout);
+
+		if (file == NULL) {
+			perror("freopen");
+			exit(0);
+		}
+
+		while (cnt < thread_num) {
+			printf("pid:%d cnt:%ld\n", pdata[cnt].pid, pdata[cnt].cnt);
+			cnt++;
+		}
+		fclose(file);
+		exit(0);
+	}
+}
+
+static int dead_loop_create(int policy, int prio)
+{
+	int cnt = 0;
+	int ret;
+	void *status;
+	struct sched_param param;
+
+	param.sched_priority = prio;
+	pdata = malloc(thread_num * sizeof(struct thread_data));
+	do_false(!pdata);
+
+	if (policy) {
+		ret = sched_setscheduler(0, policy, &param);
+		do_err(ret);
+	}
+
+	while (cnt < thread_num) {
+		pdata[cnt].index = cnt;
+		create_thread_posix(dead_loop_entry, &pdata[cnt].thread,
+				    &pdata[cnt].index, policy, prio);
+		cnt++;
+	}
+
+	signal(SIGTERM, handle_signal);
+
+	cnt = 0;
+	while (cnt < thread_num) {
+		pthread_join(pdata[cnt].thread, &status);
+		cnt++;
+	}
+
+	free(pdata);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int policy = 2;
+	int prio = 50;
+
+	if (argc == 2)
+		thread_num = atoi(argv[1]);
+
+	if (argc == 3) {
+		thread_num = atoi(argv[1]);
+		policy = atoi(argv[2]);
+		if (policy > 0)
+			prio = 50;
+	}
+
+	if (argc == 4) {
+		thread_num = atoi(argv[1]);
+		policy = atoi(argv[2]);
+		prio = atoi(argv[3]);
+	}
+
+	dead_loop_create(policy, prio);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sched/rt_group_sched_test.sh b/tools/testing/selftests/sched/rt_group_sched_test.sh
new file mode 100755
index 000000000000..9031250a2684
--- /dev/null
+++ b/tools/testing/selftests/sched/rt_group_sched_test.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test for rt group scheduling
+# Date: June 27, 2024
+# Author: Xavier
+
+# Record the list of child process PIDs
+PIDS=()
+
+# File for redirected output
+LOGFILE="rt_group_sched_test.log"
+
+# Cleanup function: kill all recorded child processes and unmount the cgroup
+function cleanup() {
+	echo "Cleaning up..."
+	for pid in "${PIDS[@]}"; do
+		if kill -0 $pid 2>/dev/null; then
+			kill -TERM $pid
+		fi
+	done
+
+	# Sleep for a while to ensure the processes are properly killed
+	sleep 2
+
+	# Unmount the cgroup filesystem
+	umount /sys/fs/cgroup/cpu 2>/dev/null
+	umount /sys/fs/cgroup 2>/dev/null
+	echo "Cleanup completed."
+
+	# Ensure the LOGFILE exists and is correct
+	if [ ! -f "$LOGFILE" ]; then
+		echo "$LOGFILE not found!"
+		exit 1
+	fi
+
+	# Initialize the total count variable
+	total=0
+
+	# Read matching lines and calculate the total sum
+	while IFS= read -r line
+	do
+		# Match lines containing 'pid:' and 'cnt:', and extract the value of cnt
+		if echo "$line" | grep -q '^pid:[[:digit:]]\+ cnt:[[:digit:]]\+'; then
+			cnt=$(echo "$line" | sed -n \
+				's/^pid:[[:digit:]]\+ cnt:\([[:digit:]]\+\)/\1/p')
+			total=$((total + cnt))
+		fi
+	done < "$LOGFILE"
+
+	# Print the total sum
+	echo "Total cnt: $total"
+	echo "Finished processing."
+}
+
+# Capture actions when interrupted or terminated by a signal
+trap cleanup EXIT
+
+# Start the cgroup filesystem and create the necessary directories
+function setup_cgroups() {
+	mount -t tmpfs -o mode=755 cgroup_root /sys/fs/cgroup
+	mkdir -p /sys/fs/cgroup/cpu
+	mount -t cgroup -o cpu none /sys/fs/cgroup/cpu
+}
+
+# Create cgroup subdirectories and configure their settings
+function create_child_cgroup() {
+	local base_dir=$1
+	local name=$2
+	local rt_period=$3
+	local rt_runtime=$4
+	mkdir -p "$base_dir/$name"
+	echo $rt_period > "$base_dir/$name/cpu.rt_period_us"
+	echo $rt_runtime > "$base_dir/$name/cpu.rt_runtime_us"
+}
+
+# Launch a process and add it to the specified cgroup
+function launch_process() {
+	local process_name=$1
+
+	# Three parameters: the number of child threads, scheduling policy, and priority
+	local args=$2
+	local cgroup_path=$3
+
+	# Launch the process
+	exec -a $process_name ./deadloop $args &
+	local pid=$!
+	PIDS+=($pid)
+
+	# Short sleep to ensure the process starts
+	sleep 1
+
+	# Check if the process started successfully
+	if ! pgrep -x $process_name > /dev/null; then
+		echo "Error: No process found with name $process_name."
+		exit 1
+	fi
+
+	echo $pid > "$cgroup_path/cgroup.procs"
+	echo "Process $process_name with PID $pid added to cgroup $cgroup_path"
+}
+
+# Main function running all tasks
+function main() {
+	echo "The test needs 30 seconds..."
+	rm -f "$LOGFILE"
+	setup_cgroups
+	create_child_cgroup "/sys/fs/cgroup/cpu" "child1" 1000000 800000
+	create_child_cgroup "/sys/fs/cgroup/cpu/child1" "child2" 1000000 700000
+	create_child_cgroup "/sys/fs/cgroup/cpu/child1/child2" "child3" 1000000 600000
+	launch_process "child1" "3 2 50" "/sys/fs/cgroup/cpu/child1"
+	launch_process "child2" "3 2 50" "/sys/fs/cgroup/cpu/child1/child2"
+	launch_process "child3" "1 2 50" "/sys/fs/cgroup/cpu/child1/child2/child3"
+	launch_process "tg_root" "1 2 50" "/sys/fs/cgroup/cpu"
+
+	# Run for 30 seconds
+	sleep 30
+}
+
+# Execute the main function
+main
-- 
2.45.2