[PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se

Xavier posted 2 patches 1 year, 5 months ago
There is a newer version of this series
Posted by Xavier 1 year, 5 months ago
This patch optimizes the enqueue and dequeue of rt_se. The strategy
employs a bottom-up removal approach: when removing an rt_se at a
certain level, if the highest priority of the rq associated with that
rt_se has not changed, there is no need to continue removing rt_se at
higher levels. In that case, only the total number of removed rt_se
needs to be recorded, and the rt_nr_running count of the higher-level
rqs is decreased accordingly.
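
As a rough illustration of the idea, here is a standalone userspace
sketch (not kernel code; the toy_rq structure, the helpers and the
priority values are invented for this example):

#include <stdio.h>

#define MAX_RT_PRIO 100

struct toy_rq {
	int nr_at_prio[MAX_RT_PRIO];	/* tasks queued at each priority */
	int nr_running;			/* total tasks, counted recursively */
	int highest_prio;		/* lower value = higher RT priority */
};

static int recompute_highest(const struct toy_rq *rq)
{
	for (int p = 0; p < MAX_RT_PRIO; p++)
		if (rq->nr_at_prio[p])
			return p;
	return MAX_RT_PRIO;
}

/* Returns 1 if removing one task at @prio changed the rq's highest priority. */
static int toy_dequeue_task(struct toy_rq *rq, int prio)
{
	int prev = rq->highest_prio;

	rq->nr_at_prio[prio]--;
	rq->nr_running--;
	rq->highest_prio = recompute_highest(rq);
	return rq->highest_prio != prev;
}

int main(void)
{
	/* A group rq holding tasks at prio 10 and 20 ... */
	struct toy_rq group = { .nr_running = 2, .highest_prio = 10 };
	/* ... and its parent rq, where the group entity is queued at prio 10. */
	struct toy_rq parent = { .nr_running = 2, .highest_prio = 10 };

	group.nr_at_prio[10] = 1;
	group.nr_at_prio[20] = 1;
	parent.nr_at_prio[10] = 1;	/* one queued entity: the group itself */

	if (!toy_dequeue_task(&group, 20)) {
		/*
		 * The group's highest priority is still 10, so its entity can
		 * stay where it is in the parent; only the parent's task count
		 * has to drop by the number of tasks removed below it.
		 */
		parent.nr_running -= 1;
	} else {
		/*
		 * Otherwise the group entity would have to be dequeued from
		 * the parent and re-queued at the group's new highest priority.
		 */
	}

	printf("group: highest=%d nr=%d, parent: nr=%d\n",
	       group.highest_prio, group.nr_running, parent.nr_running);
	return 0;
}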

Signed-off-by: Xavier <xavier_qy@163.com>
---
 kernel/sched/debug.c |  48 ++++++++
 kernel/sched/rt.c    | 287 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |   1 +
 3 files changed, 298 insertions(+), 38 deletions(-)
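
For reference, the contract behind the new return values used in the
diff below: the inc side reports whether the rq's highest priority
changed, and the dec side reports either the previous highest priority
(when it changed) or MAX_RT_PRIO (when it did not). A simplified
userspace sketch of that contract, with stand-in helpers that are not
the kernel functions:

#include <stdio.h>

#define MAX_RT_PRIO 100

/* 1 if queueing at @prio raises the rq's highest priority, 0 otherwise. */
static int toy_inc_prio(int *highest, int prio)
{
	if (prio < *highest) {
		*highest = prio;
		return 1;
	}
	return 0;
}

/*
 * Removing at @prio: returns @prio (the old highest) when the rq's highest
 * priority drops as a result, or MAX_RT_PRIO when it is unchanged. The new
 * highest is passed in here; the kernel recomputes it from the bitmap.
 */
static int toy_dec_prio(int *highest, int prio, int new_highest)
{
	*highest = new_highest;
	return (new_highest > prio) ? prio : MAX_RT_PRIO;
}

int main(void)
{
	int highest = MAX_RT_PRIO;
	int ret;

	ret = toy_inc_prio(&highest, 30);
	printf("enqueue prio 30 -> changed=%d highest=%d\n", ret, highest);
	ret = toy_inc_prio(&highest, 50);
	printf("enqueue prio 50 -> changed=%d highest=%d\n", ret, highest);

	/* Remove the prio-50 task: highest stays 30, "unchanged" is reported. */
	ret = toy_dec_prio(&highest, 50, 30);
	printf("dequeue prio 50 -> ret=%d (MAX_RT_PRIO means unchanged)\n", ret);

	/* Remove the prio-30 task: highest falls back to MAX_RT_PRIO. */
	ret = toy_dec_prio(&highest, 30, MAX_RT_PRIO);
	printf("dequeue prio 30 -> ret=%d (the old highest priority)\n", ret);
	return 0;
}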

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..bf9edba5e87b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -712,6 +712,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 }
 
+void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
+{
+	struct task_struct *task;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->my_q) {
+		SEQ_printf_task_group_path(m, rt_se->my_q->tg, "%s\n");
+		return;
+	}
+#endif
+	task = container_of(rt_se, struct task_struct, rt);
+	SEQ_printf(m, "	prio-%d, pid-%d, %s\n", task->prio, task->pid, task->comm);
+}
+
+/* Must be called with the rq lock held. */
+void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *rt_se;
+	struct list_head *queue, *head;
+	unsigned long bitmap[2];
+	int idx;
+	int count = 0;
+
+	if (!rt_rq->rt_nr_running)
+		return;
+
+	memcpy(bitmap, array->bitmap, sizeof(unsigned long) * 2);
+	idx = sched_find_first_bit(bitmap);
+	WARN_ON_ONCE(idx >= MAX_RT_PRIO);
+
+	while (1) {
+		clear_bit(idx, bitmap);
+		queue = array->queue + idx;
+		head = queue;
+		queue = queue->next;
+		do {
+			rt_se = list_entry(queue, struct sched_rt_entity, run_list);
+			print_rt_se(m, rt_se);
+			queue = queue->next;
+			count++;
+		} while (queue != head);
+		idx = sched_find_first_bit(bitmap);
+		if (idx >= MAX_RT_PRIO)
+			break;
+	}
+}
+
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..b18c424a50d2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1113,7 +1113,7 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 #endif /* CONFIG_SMP */
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-static void
+static int
 inc_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1122,9 +1122,11 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
 		rt_rq->highest_prio.curr = prio;
 
 	inc_rt_prio_smp(rt_rq, prio, prev_prio);
+
+	return prev_prio > prio;
 }
 
-static void
+static int
 dec_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1149,12 +1151,22 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 	}
 
 	dec_rt_prio_smp(rt_rq, prio, prev_prio);
+	if (rt_rq->highest_prio.curr > prio)
+		return prio;
+	else
+		return MAX_RT_PRIO;
 }
 
 #else
 
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline int inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
+static inline int dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 
@@ -1218,28 +1230,31 @@ unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
 }
 
 static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	int prio = rt_se_prio(rt_se);
+	int prio_change;
 
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
-	inc_rt_prio(rt_rq, prio);
+	prio_change = inc_rt_prio(rt_rq, prio);
 	inc_rt_group(rt_se, rt_rq);
+	return prio_change;
 }
 
 static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int prio)
 {
+	int prio_changed;
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
-	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	prio_changed = dec_rt_prio(rt_rq, prio);
 	dec_rt_group(rt_se, rt_rq);
+	return prio_changed;
 }
 
 /*
@@ -1255,12 +1270,13 @@ static inline bool move_entity(unsigned int flags)
 	return true;
 }
 
-static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
+						struct rt_prio_array *array, int last_prio)
 {
 	list_del_init(&rt_se->run_list);
 
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
-		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+	if (list_empty(array->queue + last_prio))
+		__clear_bit(last_prio, array->bitmap);
 
 	rt_se->on_list = 0;
 }
@@ -1371,7 +1387,12 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 	}
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/*
+ * Returns: -1 indicates that rt_se was not enqueued, 0 indicates that the highest
+ * priority of the rq did not change after enqueue, and 1 indicates that the highest
+ * priority of the rq changed after enqueue.
+ */
+static int __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -1386,8 +1407,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	 */
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
 		if (rt_se->on_list)
-			__delist_rt_entity(rt_se, array);
-		return;
+			__delist_rt_entity(rt_se, array, rt_se_prio(rt_se));
+		return -1;
 	}
 
 	if (move_entity(flags)) {
@@ -1402,73 +1423,263 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	}
 	rt_se->on_rq = 1;
 
-	inc_rt_tasks(rt_se, rt_rq);
+	return inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/**
+ * __dequeue_rt_entity - remove rt_se from its rt_rq
+ *
+ * @rt_se:	the entity to be removed
+ * @last_prio:	the highest priority of this rt_se before the previous round
+ *		of removal
+ * @flags:	operation flags
+ *
+ * Returns: MAX_RT_PRIO if the highest priority of the current rq did not
+ * change during this removal; otherwise the previous highest priority, to
+ * be passed as last_prio in the next round of removal.
+ */
+static int __dequeue_rt_entity(struct sched_rt_entity *rt_se, int last_prio,
+									unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 
 	if (move_entity(flags)) {
 		WARN_ON_ONCE(!rt_se->on_list);
-		__delist_rt_entity(rt_se, array);
+		__delist_rt_entity(rt_se, array, last_prio);
 	}
 	rt_se->on_rq = 0;
 
-	dec_rt_tasks(rt_se, rt_rq);
+	return dec_rt_tasks(rt_se, rt_rq, last_prio);
+}
+
+static inline void dec_rq_nr_running(struct sched_rt_entity *rt_se,
+						unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running -= rt;
+	rt_rq->rr_nr_running -= rr;
+}
+
+static inline void add_rq_nr_running(struct sched_rt_entity *rt_se,
+						unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running += rt;
+	rt_rq->rr_nr_running += rr;
+}
+
+static inline bool on_top_rt_rq(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->parent)
+		return false;
+#endif
+	return true;
 }
 
 /*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
+ * To optimize the enqueue and dequeue of rt_se, this strategy employs a
+ * bottom-up removal approach. Specifically, when removing an rt_se at a
+ * certain level, if it is determined that the highest priority of the rq
+ * associated with that rt_se has not changed, there is no need to continue
+ * removing rt_se at higher levels. At this point, only the total number
+ * of removed rt_se needs to be recorded, and the rt_nr_running count of
+ * the higher-level rqs is decreased accordingly.
+ *
+ * For enqueue operations, if an rt_se at a certain level is in the rq,
+ * it is still necessary to check the priority of the higher-level rq.
+ * If the priority of the higher-level rq is found to be lower than that
+ * of the rt_se to be added, it should be removed, as updating the highest
+ * priority of the rq during addition will cause the rq to be repositioned
+ * in the parent rq.
+ *
+ * Conversely, for dequeue operations, if an rt_se at a certain level is
+ * not in the rq, the operation can be exited immediately to reduce
+ * unnecessary checks and handling.
+ *
+ * For enqueue operations, the return value is the last rt_se that was
+ * removed. For dequeue operations, it is the last rt_se that was either
+ * removed or had its rt_nr_running updated.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
+static struct sched_rt_entity *dequeue_rt_stack(struct sched_rt_entity *rt_se,
+						unsigned int flags, int for_enqueue)
 {
-	struct sched_rt_entity *back = NULL;
-	unsigned int rt_nr_running;
+	struct sched_rt_entity *last = rt_se;
+	struct sched_rt_entity *origin = rt_se;
+	unsigned int del_rt_nr = 0;
+	unsigned int del_rr_nr = 0;
+	int prio_changed = rt_se_prio(rt_se);
+	int sub_on_rq = 1;
 
 	for_each_sched_rt_entity(rt_se) {
-		rt_se->back = back;
-		back = rt_se;
-	}
+		if (on_rt_rq(rt_se)) {
+			if (sub_on_rq) {
+				/*
+				 * The number of tasks removed from the sub-level rt_se also needs
+				 * to be subtracted from the rq of the current rt_se, as the current
+				 * rt_se's rq no longer includes the number of removed tasks.
+				 */
+				dec_rq_nr_running(rt_se, del_rt_nr, del_rr_nr);
+				if ((prio_changed != MAX_RT_PRIO) ||
+					(rt_se_prio(rt_se) > rt_se_prio(origin))) {
+					/*
+					 * If the removal of the lower-level rt_se causes the
+					 * highest priority of the current rq to change, or if the
+					 * priority of current rq is lower than the rt_se to be
+					 * added, then the current rt_se also needs to be removed
+					 * from its parent rq, and the number of deleted tasks
+					 * should be accumulated.
+					 */
+					if (prio_changed == MAX_RT_PRIO)
+						prio_changed = rt_se_prio(rt_se);
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+									prio_changed, flags);
+					last = rt_se;
+				} else if (!for_enqueue) {
+					/* For dequeue, last may be an rt_se where only rt_nr_running was modified. */
+					last = rt_se;
+				}
+			} else {
+				/*
+				 * This branch can only be reached for enqueue, since dequeue
+				 * breaks out as soon as an rt_se is not on the rq.
+				 * If the sub-level node is not on the rq, and the current
+				 * rt_se's priority is lower than the one being added, the
+				 * current rt_se needs to be removed.
+				 */
+				prio_changed = rt_se_prio(rt_se);
+				if (prio_changed > rt_se_prio(origin)) {
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+									prio_changed, flags);
+					last = rt_se;
+				} else {
+					prio_changed = MAX_RT_PRIO;
+				}
+			}
 
-	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
+			/*
+			 * If the current rt_se is on the top rt_rq, then the already deleted
+			 * nodes, plus the count of the rt_rq where the current rt_se is
+			 * located, need to be removed from the top_rt_rq.
+			 */
+			if (on_top_rt_rq(rt_se)) {
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+						del_rt_nr + rt_rq_of_se(rt_se)->rt_nr_running);
+			}
+			sub_on_rq = 1;
+		} else if (for_enqueue) {
+			struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	for (rt_se = back; rt_se; rt_se = rt_se->back) {
-		if (on_rt_rq(rt_se))
-			__dequeue_rt_entity(rt_se, flags);
+			/*
+			 * For an enqueue operation, if a level is found not to be on the
+			 * rq, the previously accumulated counts need to be reset to zero.
+			 */
+			prio_changed = MAX_RT_PRIO;
+			sub_on_rq = 0;
+			del_rt_nr = 0;
+			del_rr_nr = 0;
+
+			/*
+			 * If the current group is being throttled, then there is no need to check
+			 * higher levels since enqueueing will not affect higher-level nodes.
+			 */
+			if (group_rq && rt_rq_throttled(group_rq))
+				break;
+
+			if (on_top_rt_rq(rt_se))
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+						rt_rq_of_se(rt_se)->rt_nr_running);
+		} else {
+			last = rt_se;
+			break;
+		}
 	}
 
-	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
+	return last;
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int enqueue = 1;
+	int prio_change = 1;
 
 	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
-	for_each_sched_rt_entity(rt_se)
-		__enqueue_rt_entity(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 1);
+
+	for_each_sched_rt_entity(rt_se) {
+		if (enqueue || !on_rt_rq(rt_se) || (prio_change == 1)) {
+			prio_change = __enqueue_rt_entity(rt_se, flags);
+			if (prio_change >= 0) {
+				add_rt_nr = rt_se_nr_running(rt_se);
+				add_rr_nr = rt_se_rr_nr_running(rt_se);
+			} else {
+				add_rt_nr = add_rr_nr = 0;
+			}
+		} else {
+			add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+		}
+
+		if (rt_se == last)
+			enqueue = 0;
+	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int prio_change = 1;
 
 	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 0);
 
 	for_each_sched_rt_entity(rt_se) {
 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
+		if (rt_rq && rt_rq->rt_nr_running) {
+			if (on_rt_rq(rt_se)) {
+				add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+			} else {
+				prio_change = __enqueue_rt_entity(rt_se, flags);
+				if (prio_change == 0) {
+					/*
+					 * If enqueue is successful and the priority of the rq has
+					 * not changed, then the parent node only needs to add the
+					 * count of the current rt_se. Otherwise, the parent node
+					 * will also need to enqueue.
+					 */
+					add_rt_nr = rt_se_nr_running(rt_se);
+					add_rr_nr = rt_se_rr_nr_running(rt_se);
+				}
+			}
+		} else {
+			add_rt_nr = add_rr_nr = 0;
+		}
 
-		if (rt_rq && rt_rq->rt_nr_running)
-			__enqueue_rt_entity(rt_se, flags);
+		/*
+		 * last is the last rt_se that was removed or whose count was
+		 * modified, so subsequent rt_se do not need to be updated.
+		 */
+		if (rt_se == last)
+			break;
 	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af102070..b634153aacf0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2878,6 +2878,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
 extern void resched_latency_warn(int cpu, u64 latency);
-- 
2.45.2
Re: [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
Posted by kernel test robot 1 year, 5 months ago
Hi Xavier,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on shuah-kselftest/next shuah-kselftest/fixes peterz-queue/sched/core linus/master v6.10]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Xavier/RT-SCHED-Optimize-the-enqueue-and-dequeue-operations-for-rt_se/20240716-140932
base:   tip/sched/core
patch link:    https://lore.kernel.org/r/20240716060514.304324-2-xavier_qy%40163.com
patch subject: [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
config: x86_64-randconfig-121-20240716 (https://download.01.org/0day-ci/archive/20240717/202407170411.vRtOCOzx-lkp@intel.com/config)
compiler: gcc-8 (Ubuntu 8.4.0-3ubuntu2) 8.4.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240717/202407170411.vRtOCOzx-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407170411.vRtOCOzx-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
   kernel/sched/build_utility.c: note: in included file:
   kernel/sched/debug.c:469:17: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/debug.c:469:17: sparse:     expected struct sched_domain *[assigned] sd
   kernel/sched/debug.c:469:17: sparse:     got struct sched_domain [noderef] __rcu *parent
>> kernel/sched/debug.c:715:6: sparse: sparse: symbol 'print_rt_se' was not declared. Should it be static?
   kernel/sched/debug.c:842:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct task_struct *tsk @@     got struct task_struct [noderef] __rcu *curr @@
   kernel/sched/debug.c:842:9: sparse:     expected struct task_struct *tsk
   kernel/sched/debug.c:842:9: sparse:     got struct task_struct [noderef] __rcu *curr
   kernel/sched/debug.c:842:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct task_struct *tsk @@     got struct task_struct [noderef] __rcu *curr @@
   kernel/sched/debug.c:842:9: sparse:     expected struct task_struct *tsk
   kernel/sched/debug.c:842:9: sparse:     got struct task_struct [noderef] __rcu *curr
   kernel/sched/build_utility.c: note: in included file:
   kernel/sched/stats.c:148:17: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/stats.c:148:17: sparse:     expected struct sched_domain *[assigned] sd
   kernel/sched/stats.c:148:17: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/build_utility.c: note: in included file:
   kernel/sched/topology.c:107:56: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:107:56: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:107:56: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:126:60: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:126:60: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:126:60: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:149:20: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:149:20: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:149:20: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:454:13: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct perf_domain *[assigned] tmp @@     got struct perf_domain [noderef] __rcu *pd @@
   kernel/sched/topology.c:454:13: sparse:     expected struct perf_domain *[assigned] tmp
   kernel/sched/topology.c:454:13: sparse:     got struct perf_domain [noderef] __rcu *pd
   kernel/sched/topology.c:463:13: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct perf_domain *[assigned] tmp @@     got struct perf_domain [noderef] __rcu *pd @@
   kernel/sched/topology.c:463:13: sparse:     expected struct perf_domain *[assigned] tmp
   kernel/sched/topology.c:463:13: sparse:     got struct perf_domain [noderef] __rcu *pd
   kernel/sched/topology.c:484:19: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct perf_domain *[assigned] pd @@     got struct perf_domain [noderef] __rcu *pd @@
   kernel/sched/topology.c:484:19: sparse:     expected struct perf_domain *[assigned] pd
   kernel/sched/topology.c:484:19: sparse:     got struct perf_domain [noderef] __rcu *pd
   kernel/sched/topology.c:646:49: sparse: sparse: incorrect type in initializer (different address spaces) @@     expected struct sched_domain *parent @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:646:49: sparse:     expected struct sched_domain *parent
   kernel/sched/topology.c:646:49: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:731:50: sparse: sparse: incorrect type in initializer (different address spaces) @@     expected struct sched_domain *parent @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:731:50: sparse:     expected struct sched_domain *parent
   kernel/sched/topology.c:731:50: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:739:55: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain [noderef] __rcu *[noderef] __rcu child @@     got struct sched_domain *[assigned] tmp @@
   kernel/sched/topology.c:739:55: sparse:     expected struct sched_domain [noderef] __rcu *[noderef] __rcu child
   kernel/sched/topology.c:739:55: sparse:     got struct sched_domain *[assigned] tmp
   kernel/sched/topology.c:752:29: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] tmp @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:752:29: sparse:     expected struct sched_domain *[assigned] tmp
   kernel/sched/topology.c:752:29: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:757:20: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:757:20: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:757:20: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:778:13: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] tmp @@     got struct sched_domain [noderef] __rcu *sd @@
   kernel/sched/topology.c:778:13: sparse:     expected struct sched_domain *[assigned] tmp
   kernel/sched/topology.c:778:13: sparse:     got struct sched_domain [noderef] __rcu *sd
   kernel/sched/topology.c:940:70: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:940:70: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:940:70: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:969:59: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:969:59: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:969:59: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1015:57: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:1015:57: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:1015:57: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1017:25: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *sibling @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:1017:25: sparse:     expected struct sched_domain *sibling
   kernel/sched/topology.c:1017:25: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1025:55: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:1025:55: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:1025:55: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1027:25: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *sibling @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:1027:25: sparse:     expected struct sched_domain *sibling
   kernel/sched/topology.c:1027:25: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1097:62: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct sched_domain *sd @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:1097:62: sparse:     expected struct sched_domain *sd
   kernel/sched/topology.c:1097:62: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1201:40: sparse: sparse: incorrect type in initializer (different address spaces) @@     expected struct sched_domain *child @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:1201:40: sparse:     expected struct sched_domain *child
   kernel/sched/topology.c:1201:40: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1629:43: sparse: sparse: incorrect type in initializer (different address spaces) @@     expected struct sched_domain [noderef] __rcu *child @@     got struct sched_domain *child @@
   kernel/sched/topology.c:1629:43: sparse:     expected struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:1629:43: sparse:     got struct sched_domain *child
   kernel/sched/topology.c:2328:31: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain [noderef] __rcu *parent @@     got struct sched_domain *sd @@
   kernel/sched/topology.c:2328:31: sparse:     expected struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:2328:31: sparse:     got struct sched_domain *sd
   kernel/sched/topology.c:2430:57: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:2430:57: sparse:     expected struct sched_domain *[assigned] sd
   kernel/sched/topology.c:2430:57: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:2451:56: sparse: sparse: incorrect type in initializer (different address spaces) @@     expected struct sched_domain *child @@     got struct sched_domain [noderef] __rcu *child @@
   kernel/sched/topology.c:2451:56: sparse:     expected struct sched_domain *child
   kernel/sched/topology.c:2451:56: sparse:     got struct sched_domain [noderef] __rcu *child
   kernel/sched/topology.c:2450:57: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:2450:57: sparse:     expected struct sched_domain *[assigned] sd
   kernel/sched/topology.c:2450:57: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/topology.c:2505:57: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct sched_domain *[assigned] sd @@     got struct sched_domain [noderef] __rcu *parent @@
   kernel/sched/topology.c:2505:57: sparse:     expected struct sched_domain *[assigned] sd
   kernel/sched/topology.c:2505:57: sparse:     got struct sched_domain [noderef] __rcu *parent
   kernel/sched/build_utility.c: note: in included file:
   kernel/sched/core_sched.c:276:37: sparse: sparse: incompatible types in conditional expression (different address spaces):
   kernel/sched/core_sched.c:276:37: sparse:    struct task_struct *
   kernel/sched/core_sched.c:276:37: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/build_utility.c: note: in included file:
   kernel/sched/build_utility.c: note: in included file (through include/linux/mmzone.h, include/linux/topology.h, include/linux/sched/topology.h, ...):
   include/linux/page-flags.h:240:46: sparse: sparse: self-comparison always evaluates to false
   include/linux/page-flags.h:240:46: sparse: sparse: self-comparison always evaluates to false
   kernel/sched/build_utility.c: note: in included file:
   kernel/sched/sched.h:2175:25: sparse: sparse: incompatible types in comparison expression (different address spaces):

vim +/print_rt_se +715 kernel/sched/debug.c

   714	
 > 715	void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
   716	{
   717		struct task_struct *task;
   718	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
[PATCH-RT sched v4 0/2] Optimize the RT group scheduling
Posted by Xavier 1 year, 5 months ago
Hi all,

Changes in v4: fixed the compilation warnings in debug.c reported by the
kernel test robot.

To Peter,

When is the new RT group implementation expected to be proposed?
Do you think my current patch is an appropriate optimization until then?


Xavier (2):
  RT SCHED: Optimize the enqueue and dequeue operations for rt_se
  RT test: Adding test cases for RT group scheduling

 MAINTAINERS                                   |   7 +
 kernel/sched/debug.c                          |  48 +++
 kernel/sched/rt.c                             | 287 +++++++++++++++---
 kernel/sched/sched.h                          |   1 +
 tools/testing/selftests/sched/Makefile        |   4 +-
 tools/testing/selftests/sched/deadloop.c      | 192 ++++++++++++
 .../selftests/sched/rt_group_sched_test.sh    | 119 ++++++++
 7 files changed, 618 insertions(+), 40 deletions(-)
 create mode 100644 tools/testing/selftests/sched/deadloop.c
 create mode 100755 tools/testing/selftests/sched/rt_group_sched_test.sh

-- 
2.45.2
[PATCH-RT sched v4 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
Posted by Xavier 1 year, 5 months ago
This patch optimizes the enqueue and dequeue of rt_se. The strategy
employs a bottom-up removal approach: when removing an rt_se at a
certain level, if the highest priority of the rq associated with that
rt_se has not changed, there is no need to continue removing rt_se at
higher levels. In that case, only the total number of removed rt_se
needs to be recorded, and the rt_nr_running count of the higher-level
rqs is decreased accordingly.
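
The enqueue direction needs a complementary rule, handled in the diff
below: a level that is already queued may still have to be dequeued and
re-queued when the task being added raises that group's top priority,
so that the group entity gets repositioned in its parent. A minimal
userspace sketch of that situation, with names and values invented for
the example:

#include <stdio.h>

#define MAX_RT_PRIO 100

struct toy_parent {
	int entity_slot;	/* priority list the group entity sits on */
	int nr_running;		/* tasks below this parent, recursively */
};

int main(void)
{
	struct toy_parent parent = { .entity_slot = 30, .nr_running = 1 };
	int group_highest = 30;		/* the group holds one prio-30 task */
	int new_task_prio = 10;		/* task being enqueued into the group */

	if (new_task_prio < group_highest) {
		/*
		 * The group's top priority rises, so its position in the
		 * parent is stale: take the entity off the old priority list
		 * and re-queue it on the new one. This is why the enqueue
		 * path may dequeue levels that are already on their rq.
		 */
		parent.entity_slot = new_task_prio;
		group_highest = new_task_prio;
	}
	parent.nr_running++;	/* one more task below this entity */

	printf("group entity queued at prio %d, parent nr_running=%d\n",
	       parent.entity_slot, parent.nr_running);
	return 0;
}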

Signed-off-by: Xavier <xavier_qy@163.com>
---
 kernel/sched/debug.c |  48 ++++++++
 kernel/sched/rt.c    | 287 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |   1 +
 3 files changed, 298 insertions(+), 38 deletions(-)
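
The debug helper added below (print_rt_rq_task()) walks a copy of the
priority bitmap and dumps every non-empty queue. A standalone sketch of
that walk, with the bitmap helpers stubbed out rather than using the
kernel's implementations:

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define MAX_RT_PRIO	100
#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))
#define NWORDS		((MAX_RT_PRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Minimal stand-ins for the kernel bitmap helpers used by the walk. */
static void set_bit_(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static void clear_bit_(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

static int find_first_set(const unsigned long *map)
{
	for (int i = 0; i < (int)(NWORDS * BITS_PER_LONG); i++)
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return MAX_RT_PRIO;
}

int main(void)
{
	unsigned long active[NWORDS] = { 0 };
	unsigned long bitmap[NWORDS];

	/* Pretend entities are queued at priorities 5, 20 and 70. */
	set_bit_(5, active);
	set_bit_(20, active);
	set_bit_(70, active);

	/* Like print_rt_rq_task(): walk a copy so the live bitmap is untouched. */
	memcpy(bitmap, active, sizeof(bitmap));
	for (int idx = find_first_set(bitmap); idx < MAX_RT_PRIO;
	     idx = find_first_set(bitmap)) {
		clear_bit_(idx, bitmap);
		printf("dump every entity queued at prio %d\n", idx);
	}
	return 0;
}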

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..352ee55da25e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -712,6 +712,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 }
 
+static void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
+{
+	struct task_struct *task;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->my_q) {
+		SEQ_printf_task_group_path(m, rt_se->my_q->tg, "%s\n");
+		return;
+	}
+#endif
+	task = container_of(rt_se, struct task_struct, rt);
+	SEQ_printf(m, "	prio-%d, pid-%d, %s\n", task->prio, task->pid, task->comm);
+}
+
+/* Must be called with the rq lock held. */
+void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *rt_se;
+	struct list_head *queue, *head;
+	unsigned long bitmap[2];
+	int idx;
+	int count = 0;
+
+	if (!rt_rq->rt_nr_running)
+		return;
+
+	memcpy(bitmap, array->bitmap, sizeof(unsigned long) * 2);
+	idx = sched_find_first_bit(bitmap);
+	WARN_ON_ONCE(idx >= MAX_RT_PRIO);
+
+	while (1) {
+		clear_bit(idx, bitmap);
+		queue = array->queue + idx;
+		head = queue;
+		queue = queue->next;
+		do {
+			rt_se = list_entry(queue, struct sched_rt_entity, run_list);
+			print_rt_se(m, rt_se);
+			queue = queue->next;
+			count++;
+		} while (queue != head);
+		idx = sched_find_first_bit(bitmap);
+		if (idx >= MAX_RT_PRIO)
+			break;
+	}
+}
+
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..b18c424a50d2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1113,7 +1113,7 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 #endif /* CONFIG_SMP */
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-static void
+static int
 inc_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1122,9 +1122,11 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
 		rt_rq->highest_prio.curr = prio;
 
 	inc_rt_prio_smp(rt_rq, prio, prev_prio);
+
+	return prev_prio > prio;
 }
 
-static void
+static int
 dec_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1149,12 +1151,22 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 	}
 
 	dec_rt_prio_smp(rt_rq, prio, prev_prio);
+	if (rt_rq->highest_prio.curr > prio)
+		return prio;
+	else
+		return MAX_RT_PRIO;
 }
 
 #else
 
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline int inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
+static inline int dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 
@@ -1218,28 +1230,31 @@ unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
 }
 
 static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	int prio = rt_se_prio(rt_se);
+	int prio_change;
 
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
-	inc_rt_prio(rt_rq, prio);
+	prio_change = inc_rt_prio(rt_rq, prio);
 	inc_rt_group(rt_se, rt_rq);
+	return prio_change;
 }
 
 static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int prio)
 {
+	int prio_changed;
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
-	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	prio_changed = dec_rt_prio(rt_rq, prio);
 	dec_rt_group(rt_se, rt_rq);
+	return prio_changed;
 }
 
 /*
@@ -1255,12 +1270,13 @@ static inline bool move_entity(unsigned int flags)
 	return true;
 }
 
-static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
+						struct rt_prio_array *array, int last_prio)
 {
 	list_del_init(&rt_se->run_list);
 
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
-		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+	if (list_empty(array->queue + last_prio))
+		__clear_bit(last_prio, array->bitmap);
 
 	rt_se->on_list = 0;
 }
@@ -1371,7 +1387,12 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 	}
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/*
+ * Returns: -1 indicates that rt_se was not enqueued, 0 indicates that the highest
+ * priority of the rq did not change after enqueue, and 1 indicates that the highest
+ * priority of the rq changed after enqueue.
+ */
+static int __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -1386,8 +1407,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	 */
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
 		if (rt_se->on_list)
-			__delist_rt_entity(rt_se, array);
-		return;
+			__delist_rt_entity(rt_se, array, rt_se_prio(rt_se));
+		return -1;
 	}
 
 	if (move_entity(flags)) {
@@ -1402,73 +1423,263 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	}
 	rt_se->on_rq = 1;
 
-	inc_rt_tasks(rt_se, rt_rq);
+	return inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/**
+ * __dequeue_rt_entity - remove rt_se from its rt_rq
+ *
+ * @rt_se:	the entity to be removed
+ * @last_prio:	the highest priority of this rt_se before the previous round
+ *		of removal
+ * @flags:	operation flags
+ *
+ * Returns: MAX_RT_PRIO if the highest priority of the current rq did not
+ * change during this removal; otherwise the previous highest priority, to
+ * be passed as last_prio in the next round of removal.
+ */
+static int __dequeue_rt_entity(struct sched_rt_entity *rt_se, int last_prio,
+									unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 
 	if (move_entity(flags)) {
 		WARN_ON_ONCE(!rt_se->on_list);
-		__delist_rt_entity(rt_se, array);
+		__delist_rt_entity(rt_se, array, last_prio);
 	}
 	rt_se->on_rq = 0;
 
-	dec_rt_tasks(rt_se, rt_rq);
+	return dec_rt_tasks(rt_se, rt_rq, last_prio);
+}
+
+static inline void dec_rq_nr_running(struct sched_rt_entity *rt_se,
+						unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running -= rt;
+	rt_rq->rr_nr_running -= rr;
+}
+
+static inline void add_rq_nr_running(struct sched_rt_entity *rt_se,
+						unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running += rt;
+	rt_rq->rr_nr_running += rr;
+}
+
+static inline bool on_top_rt_rq(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->parent)
+		return false;
+#endif
+	return true;
 }
 
 /*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
+ * To optimize the enqueue and dequeue of rt_se, this strategy employs a
+ * bottom-up removal approach. Specifically, when removing an rt_se at a
+ * certain level, if it is determined that the highest priority of the rq
+ * associated with that rt_se has not changed, there is no need to continue
+ * removing rt_se at higher levels. At this point, only the total number
+ * of removed rt_se needs to be recorded, and the rt_nr_running count of
+ * the higher-level rqs is decreased accordingly.
+ *
+ * For enqueue operations, if an rt_se at a certain level is in the rq,
+ * it is still necessary to check the priority of the higher-level rq.
+ * If the priority of the higher-level rq is found to be lower than that
+ * of the rt_se to be added, it should be removed, as updating the highest
+ * priority of the rq during addition will cause the rq to be repositioned
+ * in the parent rq.
+ *
+ * Conversely, for dequeue operations, if an rt_se at a certain level is
+ * not in the rq, the operation can be exited immediately to reduce
+ * unnecessary checks and handling.
+ *
+ * For enqueue operations, the return value is the last rt_se that was
+ * removed. For dequeue operations, it is the last rt_se that was either
+ * removed or had its rt_nr_running updated.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
+static struct sched_rt_entity *dequeue_rt_stack(struct sched_rt_entity *rt_se,
+						unsigned int flags, int for_enqueue)
 {
-	struct sched_rt_entity *back = NULL;
-	unsigned int rt_nr_running;
+	struct sched_rt_entity *last = rt_se;
+	struct sched_rt_entity *origin = rt_se;
+	unsigned int del_rt_nr = 0;
+	unsigned int del_rr_nr = 0;
+	int prio_changed = rt_se_prio(rt_se);
+	int sub_on_rq = 1;
 
 	for_each_sched_rt_entity(rt_se) {
-		rt_se->back = back;
-		back = rt_se;
-	}
+		if (on_rt_rq(rt_se)) {
+			if (sub_on_rq) {
+				/*
+				 * The number of tasks removed from the sub-level rt_se also needs
+				 * to be subtracted from the rq of the current rt_se, as the current
+				 * rt_se's rq no longer includes the number of removed tasks.
+				 */
+				dec_rq_nr_running(rt_se, del_rt_nr, del_rr_nr);
+				if ((prio_changed != MAX_RT_PRIO) ||
+					(rt_se_prio(rt_se) > rt_se_prio(origin))) {
+					/*
+					 * If the removal of the lower-level rt_se causes the
+					 * highest priority of the current rq to change, or if the
+					 * priority of current rq is lower than the rt_se to be
+					 * added, then the current rt_se also needs to be removed
+					 * from its parent rq, and the number of deleted tasks
+					 * should be accumulated.
+					 */
+					if (prio_changed == MAX_RT_PRIO)
+						prio_changed = rt_se_prio(rt_se);
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+									prio_changed, flags);
+					last = rt_se;
+				} else if (!for_enqueue) {
+					/* For dequeue, last may be an rt_se where only rt_nr_running was modified. */
+					last = rt_se;
+				}
+			} else {
+				/*
+				 * This branch can only be reached for enqueue, since dequeue
+				 * breaks out as soon as an rt_se is not on the rq.
+				 * If the sub-level node is not on the rq, and the current
+				 * rt_se's priority is lower than the one being added, the
+				 * current rt_se needs to be removed.
+				 */
+				prio_changed = rt_se_prio(rt_se);
+				if (prio_changed > rt_se_prio(origin)) {
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+									prio_changed, flags);
+					last = rt_se;
+				} else {
+					prio_changed = MAX_RT_PRIO;
+				}
+			}
 
-	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
+			/*
+			 * If the current rt_se is on the top rt_rq, then the already deleted
+			 * nodes, plus the count of the rt_rq where the current rt_se is
+			 * located, need to be removed from the top_rt_rq.
+			 */
+			if (on_top_rt_rq(rt_se)) {
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+						del_rt_nr + rt_rq_of_se(rt_se)->rt_nr_running);
+			}
+			sub_on_rq = 1;
+		} else if (for_enqueue) {
+			struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	for (rt_se = back; rt_se; rt_se = rt_se->back) {
-		if (on_rt_rq(rt_se))
-			__dequeue_rt_entity(rt_se, flags);
+			/*
+			 * For an enqueue operation, if a level is found not to be on the
+			 * rq, the previously accumulated counts need to be reset to zero.
+			 */
+			prio_changed = MAX_RT_PRIO;
+			sub_on_rq = 0;
+			del_rt_nr = 0;
+			del_rr_nr = 0;
+
+			/*
+			 * If the current group is being throttled, then there is no need to check
+			 * higher levels since enqueueing will not affect higher-level nodes.
+			 */
+			if (group_rq && rt_rq_throttled(group_rq))
+				break;
+
+			if (on_top_rt_rq(rt_se))
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+						rt_rq_of_se(rt_se)->rt_nr_running);
+		} else {
+			last = rt_se;
+			break;
+		}
 	}
 
-	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
+	return last;
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int enqueue = 1;
+	int prio_change = 1;
 
 	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
-	for_each_sched_rt_entity(rt_se)
-		__enqueue_rt_entity(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 1);
+
+	for_each_sched_rt_entity(rt_se) {
+		if (enqueue || !on_rt_rq(rt_se) || (prio_change == 1)) {
+			prio_change = __enqueue_rt_entity(rt_se, flags);
+			if (prio_change >= 0) {
+				add_rt_nr = rt_se_nr_running(rt_se);
+				add_rr_nr = rt_se_rr_nr_running(rt_se);
+			} else {
+				add_rt_nr = add_rr_nr = 0;
+			}
+		} else {
+			add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+		}
+
+		if (rt_se == last)
+			enqueue = 0;
+	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int prio_change = 1;
 
 	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 0);
 
 	for_each_sched_rt_entity(rt_se) {
 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
+		if (rt_rq && rt_rq->rt_nr_running) {
+			if (on_rt_rq(rt_se)) {
+				add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+			} else {
+				prio_change = __enqueue_rt_entity(rt_se, flags);
+				if (prio_change == 0) {
+					/*
+					 * If enqueue is successful and the priority of the rq has
+					 * not changed, then the parent node only needs to add the
+					 * count of the current rt_se. Otherwise, the parent node
+					 * will also need to enqueue.
+					 */
+					add_rt_nr = rt_se_nr_running(rt_se);
+					add_rr_nr = rt_se_rr_nr_running(rt_se);
+				}
+			}
+		} else {
+			add_rt_nr = add_rr_nr = 0;
+		}
 
-		if (rt_rq && rt_rq->rt_nr_running)
-			__enqueue_rt_entity(rt_se, flags);
+		/*
+		 * last is the last rt_se that was removed or whose count was
+		 * modified, so subsequent rt_se do not need to be updated.
+		 */
+		if (rt_se == last)
+			break;
 	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef20c61004eb..821d65106d13 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2879,6 +2879,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
 extern void resched_latency_warn(int cpu, u64 latency);
-- 
2.45.2
Re: [PATCH-RT sched v4 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
Posted by Xavier 1 year, 4 months ago
Hi all,

I would like to ask everyone for your opinions or thoughts on the RT group scheduling
optimization. At this stage, is it ready to be merged into the corresponding branch?
Thanks.

--
Best Regards,
Xavier





At 2024-07-17 11:00:32, "Xavier" <xavier_qy@163.com> wrote:
>This patch optimizes the enqueue and dequeue of rt_se, the strategy employs
>a bottom-up removal approach. Specifically, when removing an rt_se at a
>certain level, if it is determined that the highest priority of the rq
>associated with that rt_se has not changed, there is no need to continue
>removing rt_se at higher levels. At this point, only the total number
>of removed rt_se needs to be recorded, and the rt_nr_running count of
>higher-level rq should be removed accordingly.
>
>Signed-off-by: Xavier <xavier_qy@163.com>
>---
> kernel/sched/debug.c |  48 ++++++++
> kernel/sched/rt.c    | 287 +++++++++++++++++++++++++++++++++++++------
> kernel/sched/sched.h |   1 +
> 3 files changed, 298 insertions(+), 38 deletions(-)
>
>diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>index c1eb9a1afd13..352ee55da25e 100644
>--- a/kernel/sched/debug.c
>+++ b/kernel/sched/debug.c
>@@ -712,6 +712,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> #endif
> }
> 
>+static void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
>+{
>+	struct task_struct *task;
>+
>+#ifdef CONFIG_RT_GROUP_SCHED
>+	if (rt_se->my_q) {
>+		SEQ_printf_task_group_path(m, rt_se->my_q->tg, "%s\n");
>+		return;
>+	}
>+#endif
>+	task = container_of(rt_se, struct task_struct, rt);
>+	SEQ_printf(m, "	prio-%d, pid-%d, %s\n", task->prio, task->pid, task->comm);
>+}
>+
>+/*shall be called in rq lock*/
>+void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq)
>+{
>+	struct rt_prio_array *array = &rt_rq->active;
>+	struct sched_rt_entity *rt_se;
>+	struct list_head *queue, *head;
>+	unsigned long bitmap[2];
>+	int idx;
>+	int count = 0;
>+
>+	if (!rt_rq->rt_nr_running)
>+		return;
>+
>+	memcpy(bitmap, array->bitmap, sizeof(unsigned long) * 2);
>+	idx = sched_find_first_bit(bitmap);
>+	WARN_ON_ONCE(idx >= MAX_RT_PRIO);
>+
>+	while (1) {
>+		clear_bit(idx, bitmap);
>+		queue = array->queue + idx;
>+		head = queue;
>+		queue = queue->next;
>+		do {
>+			rt_se = list_entry(queue, struct sched_rt_entity, run_list);
>+			print_rt_se(m, rt_se);
>+			queue = queue->next;
>+			count++;
>+		} while (queue != head);
>+		idx = sched_find_first_bit(bitmap);
>+		if (idx >= MAX_RT_PRIO)
>+			break;
>+	}
>+}
>+
> void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
> {
> #ifdef CONFIG_RT_GROUP_SCHED
>diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>index aa4c1c874fa4..b18c424a50d2 100644
>--- a/kernel/sched/rt.c
>+++ b/kernel/sched/rt.c
>@@ -1113,7 +1113,7 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
> #endif /* CONFIG_SMP */
> 
> #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
>-static void
>+static int
> inc_rt_prio(struct rt_rq *rt_rq, int prio)
> {
> 	int prev_prio = rt_rq->highest_prio.curr;
>@@ -1122,9 +1122,11 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
> 		rt_rq->highest_prio.curr = prio;
> 
> 	inc_rt_prio_smp(rt_rq, prio, prev_prio);
>+
>+	return prev_prio > prio;
> }
> 
>-static void
>+static int
> dec_rt_prio(struct rt_rq *rt_rq, int prio)
> {
> 	int prev_prio = rt_rq->highest_prio.curr;
>@@ -1149,12 +1151,22 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
> 	}
> 
> 	dec_rt_prio_smp(rt_rq, prio, prev_prio);
>+	if (rt_rq->highest_prio.curr > prio)
>+		return prio;
>+	else
>+		return MAX_RT_PRIO;
> }
> 
> #else
> 
>-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
>-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
>+static inline int inc_rt_prio(struct rt_rq *rt_rq, int prio)
>+{
>+	return 0;
>+}
>+static inline int dec_rt_prio(struct rt_rq *rt_rq, int prio)
>+{
>+	return 0;
>+}
> 
> #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
> 
>@@ -1218,28 +1230,31 @@ unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
> }
> 
> static inline
>-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>+int inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> {
> 	int prio = rt_se_prio(rt_se);
>+	int prio_change;
> 
> 	WARN_ON(!rt_prio(prio));
> 	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
> 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
> 
>-	inc_rt_prio(rt_rq, prio);
>+	prio_change = inc_rt_prio(rt_rq, prio);
> 	inc_rt_group(rt_se, rt_rq);
>+	return prio_change;
> }
> 
> static inline
>-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>+int dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int prio)
> {
>+	int prio_changed;
> 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
>-	WARN_ON(!rt_rq->rt_nr_running);
> 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
> 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
> 
>-	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
>+	prio_changed = dec_rt_prio(rt_rq, prio);
> 	dec_rt_group(rt_se, rt_rq);
>+	return prio_changed;
> }
> 
> /*
>@@ -1255,12 +1270,13 @@ static inline bool move_entity(unsigned int flags)
> 	return true;
> }
> 
>-static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
>+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
>+						struct rt_prio_array *array, int last_prio)
> {
> 	list_del_init(&rt_se->run_list);
> 
>-	if (list_empty(array->queue + rt_se_prio(rt_se)))
>-		__clear_bit(rt_se_prio(rt_se), array->bitmap);
>+	if (list_empty(array->queue + last_prio))
>+		__clear_bit(last_prio, array->bitmap);
> 
> 	rt_se->on_list = 0;
> }
>@@ -1371,7 +1387,12 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
> 	}
> }
> 
>-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
>+/*
>+ * Returns: -1 indicates that rt_se was not enqueued, 0 indicates that the highest
>+ * priority of the rq did not change after enqueue, and 1 indicates that the highest
>+ * priority of the rq changed after enqueue.
>+ */
>+static int __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
> {
> 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
> 	struct rt_prio_array *array = &rt_rq->active;
>@@ -1386,8 +1407,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
> 	 */
> 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
> 		if (rt_se->on_list)
>-			__delist_rt_entity(rt_se, array);
>-		return;
>+			__delist_rt_entity(rt_se, array, rt_se_prio(rt_se));
>+		return -1;
> 	}
> 
> 	if (move_entity(flags)) {
>@@ -1402,73 +1423,263 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
> 	}
> 	rt_se->on_rq = 1;
> 
>-	inc_rt_tasks(rt_se, rt_rq);
>+	return inc_rt_tasks(rt_se, rt_rq);
> }
> 
>-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
>+/**
>+ * delete rt_se from rt_rq
>+ *
>+ * @rt_se		Nodes to be deleted
>+ * @last_prio	The highest priority of this rt_se before the previous round
>+ *				of deletion
>+ * @flags		operation flags
>+ *
>+ * Returns: =0 indicates that the highest priority of the current rq did not
>+ * change during this deletion. >0 indicates it changed, and it returns the
>+ * previous highest priority to use in the next round of deletion.
>+ */
>+static int __dequeue_rt_entity(struct sched_rt_entity *rt_se, int last_prio,
>+									unsigned int flags)
> {
> 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
> 	struct rt_prio_array *array = &rt_rq->active;
> 
> 	if (move_entity(flags)) {
> 		WARN_ON_ONCE(!rt_se->on_list);
>-		__delist_rt_entity(rt_se, array);
>+		__delist_rt_entity(rt_se, array, last_prio);
> 	}
> 	rt_se->on_rq = 0;
> 
>-	dec_rt_tasks(rt_se, rt_rq);
>+	return dec_rt_tasks(rt_se, rt_rq, last_prio);
>+}
>+
>+static inline void dec_rq_nr_running(struct sched_rt_entity *rt_se,
>+						unsigned int rt, unsigned int rr)
>+{
>+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
>+
>+	rt_rq->rt_nr_running -= rt;
>+	rt_rq->rr_nr_running -= rr;
>+}
>+
>+static inline void add_rq_nr_running(struct sched_rt_entity *rt_se,
>+						unsigned int rt, unsigned int rr)
>+{
>+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
>+
>+	rt_rq->rt_nr_running += rt;
>+	rt_rq->rr_nr_running += rr;
>+}
>+
>+static inline bool on_top_rt_rq(struct sched_rt_entity *rt_se)
>+{
>+#ifdef CONFIG_RT_GROUP_SCHED
>+	if (rt_se->parent)
>+		return false;
>+#endif
>+	return true;
> }
> 
> /*
>- * Because the prio of an upper entry depends on the lower
>- * entries, we must remove entries top - down.
>+ * To optimize the enqueue and dequeue of rt_se, this strategy employs a
>+ * bottom-up removal approach. Specifically, when removing an rt_se at a
>+ * certain level, if it is determined that the highest priority of the rq
>+ * associated with that rt_se has not changed, there is no need to continue
>+ * removing rt_se at higher levels. At this point, only the total number
>+ * of removed rt_se needs to be recorded, and the rt_nr_running count of
>+ * higher-level rqs should be reduced accordingly.
>+ *
>+ * For enqueue operations, if an rt_se at a certain level is in the rq,
>+ * it is still necessary to check the priority of the higher-level rq.
>+ * If the priority of the higher-level rq is found to be lower than that
>+ * of the rt_se to be added, it should be removed, as updating the highest
>+ * priority of the rq during addition will cause the rq to be repositioned
>+ * in the parent rq.
>+ *
>+ * Conversely, for dequeue operations, if an rt_se at a certain level is
>+ * not in the rq, the operation can be exited immediately to reduce
>+ * unnecessary checks and handling.
>+ *
>+ * For enqueue operations, the return value is the last rt_se that was
>+ * removed. For dequeue operations, it is the last rt_se that was either
>+ * removed or had its rt_nr_running updated.
>  */
>-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
>+static struct sched_rt_entity *dequeue_rt_stack(struct sched_rt_entity *rt_se,
>+						unsigned int flags, int for_enqueue)
> {
>-	struct sched_rt_entity *back = NULL;
>-	unsigned int rt_nr_running;
>+	struct sched_rt_entity *last = rt_se;
>+	struct sched_rt_entity *origin = rt_se;
>+	unsigned int del_rt_nr = 0;
>+	unsigned int del_rr_nr = 0;
>+	int prio_changed = rt_se_prio(rt_se);
>+	int sub_on_rq = 1;
> 
> 	for_each_sched_rt_entity(rt_se) {
>-		rt_se->back = back;
>-		back = rt_se;
>-	}
>+		if (on_rt_rq(rt_se)) {
>+			if (sub_on_rq) {
>+				/*
>+				 * The number of tasks removed from the sub-level rt_se also needs
>+				 * to be subtracted from the rq of the current rt_se, as that rq
>+				 * should no longer count the removed tasks.
>+				 */
>+				dec_rq_nr_running(rt_se, del_rt_nr, del_rr_nr);
>+				if ((prio_changed != MAX_RT_PRIO) ||
>+					(rt_se_prio(rt_se) > rt_se_prio(origin))) {
>+					/*
>+					 * If the removal of the lower-level rt_se causes the
>+					 * highest priority of the current rq to change, or if the
>+					 * priority of the current rq is lower than that of the rt_se
>+					 * to be added, then the current rt_se also needs to be removed
>+					 * from its parent rq, and the number of deleted tasks
>+					 * should be accumulated.
>+					 */
>+					if (prio_changed == MAX_RT_PRIO)
>+						prio_changed = rt_se_prio(rt_se);
>+					del_rt_nr += rt_se_nr_running(rt_se);
>+					del_rr_nr += rt_se_rr_nr_running(rt_se);
>+					prio_changed = __dequeue_rt_entity(rt_se,
>+									prio_changed, flags);
>+					last = rt_se;
>+				} else if (!for_enqueue) {
>+					/* For dequeue, last may be an rt_se whose rt_nr_running was merely updated. */
>+					last = rt_se;
>+				}
>+			} else {
>+				/*
>+				 * This branch can only be reached for enqueue, since dequeue
>+				 * breaks out of the loop once an rt_se is not on the rq.
>+				 * If the sub-level node is not on the rq and the current rt_se's
>+				 * priority is lower than the one being added, the current rt_se
>+				 * needs to be removed.
>+				 */
>+				prio_changed = rt_se_prio(rt_se);
>+				if (prio_changed > rt_se_prio(origin)) {
>+					del_rt_nr += rt_se_nr_running(rt_se);
>+					del_rr_nr += rt_se_rr_nr_running(rt_se);
>+					prio_changed = __dequeue_rt_entity(rt_se,
>+									prio_changed, flags);
>+					last = rt_se;
>+				} else {
>+					prio_changed = MAX_RT_PRIO;
>+				}
>+			}
> 
>-	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
>+			/*
>+			 * If the current rt_se is on the top rt_rq, then the already deleted
>+			 * nodes, plus the count of the rt_rq where the current rt_se is
>+			 * located, need to be subtracted from the top rt_rq.
>+			 */
>+			if (on_top_rt_rq(rt_se)) {
>+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
>+						del_rt_nr + rt_rq_of_se(rt_se)->rt_nr_running);
>+			}
>+			sub_on_rq = 1;
>+		} else if (for_enqueue) {
>+			struct rt_rq *group_rq = group_rt_rq(rt_se);
> 
>-	for (rt_se = back; rt_se; rt_se = rt_se->back) {
>-		if (on_rt_rq(rt_se))
>-			__dequeue_rt_entity(rt_se, flags);
>+			/*
>+			 * In the case of an enqueue operation, if a certain level is not on
>+			 * the rq, then the previously accumulated counts need to be reset to zero.
>+			 */
>+			prio_changed = MAX_RT_PRIO;
>+			sub_on_rq = 0;
>+			del_rt_nr = 0;
>+			del_rr_nr = 0;
>+
>+			/*
>+			 * If the current group is being throttled, then there is no need to check
>+			 * higher levels since enqueueing will not affect higher-level nodes.
>+			 */
>+			if (group_rq && rt_rq_throttled(group_rq))
>+				break;
>+
>+			if (on_top_rt_rq(rt_se))
>+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
>+						rt_rq_of_se(rt_se)->rt_nr_running);
>+		} else {
>+			last = rt_se;
>+			break;
>+		}
> 	}
> 
>-	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
>+	return last;
> }
> 
> static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
> {
> 	struct rq *rq = rq_of_rt_se(rt_se);
>+	struct sched_rt_entity *last;
>+	unsigned int add_rt_nr = 0;
>+	unsigned int add_rr_nr = 0;
>+	int enqueue = 1;
>+	int prio_change = 1;
> 
> 	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
> 
>-	dequeue_rt_stack(rt_se, flags);
>-	for_each_sched_rt_entity(rt_se)
>-		__enqueue_rt_entity(rt_se, flags);
>+	last = dequeue_rt_stack(rt_se, flags, 1);
>+
>+	for_each_sched_rt_entity(rt_se) {
>+		if (enqueue || !on_rt_rq(rt_se) || (prio_change == 1)) {
>+			prio_change = __enqueue_rt_entity(rt_se, flags);
>+			if (prio_change >= 0) {
>+				add_rt_nr = rt_se_nr_running(rt_se);
>+				add_rr_nr = rt_se_rr_nr_running(rt_se);
>+			} else {
>+				add_rt_nr = add_rr_nr = 0;
>+			}
>+		} else {
>+			add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
>+		}
>+
>+		if (rt_se == last)
>+			enqueue = 0;
>+	}
>+
> 	enqueue_top_rt_rq(&rq->rt);
> }
> 
> static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
> {
> 	struct rq *rq = rq_of_rt_se(rt_se);
>+	struct sched_rt_entity *last;
>+	unsigned int add_rt_nr = 0;
>+	unsigned int add_rr_nr = 0;
>+	int prio_change = 1;
> 
> 	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
> 
>-	dequeue_rt_stack(rt_se, flags);
>+	last = dequeue_rt_stack(rt_se, flags, 0);
> 
> 	for_each_sched_rt_entity(rt_se) {
> 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
>+		if (rt_rq && rt_rq->rt_nr_running) {
>+			if (on_rt_rq(rt_se)) {
>+				add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
>+			} else {
>+				prio_change = __enqueue_rt_entity(rt_se, flags);
>+				if (prio_change == 0) {
>+					/*
>+					 * If enqueue is successful and the priority of the rq has
>+					 * not changed, then the parent node only needs to add the
>+					 * count of the current rt_se. Otherwise, the parent node
>+					 * will also need to be enqueued.
>+					 */
>+					add_rt_nr = rt_se_nr_running(rt_se);
>+					add_rr_nr = rt_se_rr_nr_running(rt_se);
>+				}
>+			}
>+		} else {
>+			add_rt_nr = add_rr_nr = 0;
>+		}
> 
>-		if (rt_rq && rt_rq->rt_nr_running)
>-			__enqueue_rt_entity(rt_se, flags);
>+		/*
>+		 * last is the last rt_se that was removed or whose counts were
>+		 * modified, so higher-level rt_se do not need to be updated.
>+		 */
>+		if (rt_se == last)
>+			break;
> 	}
>+
> 	enqueue_top_rt_rq(&rq->rt);
> }
> 
>diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>index ef20c61004eb..821d65106d13 100644
>--- a/kernel/sched/sched.h
>+++ b/kernel/sched/sched.h
>@@ -2879,6 +2879,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
> extern void print_dl_stats(struct seq_file *m, int cpu);
> extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
> extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
>+extern void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq);
> extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
> 
> extern void resched_latency_warn(int cpu, u64 latency);
>-- 
>2.45.2
Re:Re:[PATCH-RT sched v4 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
Posted by Xavier 1 year, 4 months ago

Just a reminder: does anyone have any comments or feedback on the patch?

--
Best Regards,
Xavier




At 2024-07-25 14:21:03, "Xavier" <xavier_qy@163.com> wrote:
>
>Hi all,
>
>I would like to ask for everyone's opinions or thoughts on the RT group scheduling
>optimization. At this stage, is it ready to be merged into the corresponding branch?
>Thanks.
>
>--
>Best Regards,
>Xavier
>
[PATCH-RT sched v4 2/2] RT test: Adding test cases for RT group scheduling
Posted by Xavier 1 year, 5 months ago
Add test cases for RT group scheduling: create several RT infinite-loop
processes/threads and set them to the same or different priorities.
Place them in different RT task groups, run them for a period of time,
and finally count the number of loop iterations executed by each task.

Signed-off-by: Xavier <xavier_qy@163.com>
---
 MAINTAINERS                                   |   7 +
 tools/testing/selftests/sched/Makefile        |   4 +-
 tools/testing/selftests/sched/deadloop.c      | 192 ++++++++++++++++++
 .../selftests/sched/rt_group_sched_test.sh    | 119 +++++++++++
 4 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/sched/deadloop.c
 create mode 100755 tools/testing/selftests/sched/rt_group_sched_test.sh
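
How to run (a rough usage sketch; the script has to run as root, since it
mounts cgroups and writes the cpu.rt_*_us files):

	cd tools/testing/selftests/sched
	make
	sudo ./rt_group_sched_test.sh
	# Per-task loop counts are appended to rt_group_sched_test.log and
	# the summed total is printed when the script exits.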

diff --git a/MAINTAINERS b/MAINTAINERS
index 958e935449e5..f5cc821b8510 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19463,6 +19463,13 @@ L:	linux-remoteproc@vger.kernel.org
 S:	Maintained
 F:	drivers/tty/rpmsg_tty.c
 
+RT GROUP SCHED TEST
+M:	Xavier <xavier_qy@163.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	tools/testing/selftests/sched/deadloop.c
+F:	tools/testing/selftests/sched/rt_group_sched_test.sh
+
 RTL2830 MEDIA DRIVER
 L:	linux-media@vger.kernel.org
 S:	Orphan
diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile
index 099ee9213557..96decb58bf35 100644
--- a/tools/testing/selftests/sched/Makefile
+++ b/tools/testing/selftests/sched/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -Wl,-rpath=./ \
 	  $(CLANG_FLAGS)
 LDLIBS += -lpthread
 
-TEST_GEN_FILES := cs_prctl_test
-TEST_PROGS := cs_prctl_test
+TEST_GEN_FILES := cs_prctl_test deadloop
+TEST_PROGS := cs_prctl_test rt_group_sched_test.sh
 
 include ../lib.mk
diff --git a/tools/testing/selftests/sched/deadloop.c b/tools/testing/selftests/sched/deadloop.c
new file mode 100644
index 000000000000..d850a3e2a0ab
--- /dev/null
+++ b/tools/testing/selftests/sched/deadloop.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <signal.h>
+
+/*
+ * Create multiple infinite loop threads based on the passed parameters
+ * Usage: deadloop num policy prio
+ *	num: the number of child threads
+ *	policy: the scheduling policy of the child threads, 0-fair, 1-fifo, 2-rr
+ *	prio: the RT priority of the child threads
+ * When this process receives SIGTERM, it prints the loop count of all child
+ * threads to the OUTPUT_FILE.
+ *
+ * Date: June 27, 2024
+ * Author: Xavier <xavier_qy@163.com>
+ */
+
+#define OUTPUT_FILE "rt_group_sched_test.log"
+
+#if __GLIBC_PREREQ(2, 30) == 0
+#include <sys/syscall.h>
+static pid_t gettid(void)
+{
+	return syscall(SYS_gettid);
+}
+#endif
+
+#define do_err(x) \
+do { \
+	if ((x) < 0) {  \
+		printf("test BUG_ON func %s, line %d %ld\n", \
+			__func__, __LINE__, (long)(x) \
+		); \
+		while (1) \
+			sleep(1); \
+	} \
+} while (0)
+
+#define do_false(x) \
+do { \
+	if ((x) == 1) { \
+		printf("test BUG_ON func %s, line %d %d\n", \
+			__func__, __LINE__, (x) \
+		); \
+		while (1) \
+			sleep(1); \
+	} \
+} while (0)
+
+
+struct thread_data {
+	pthread_t thread;
+	int index;
+	int pid;
+	unsigned long cnt;
+};
+
+static struct thread_data *pdata;
+static int thread_num = 1;
+
+static void create_thread_posix(void *(*entry)(void *), pthread_t *thread,
+								 int *para, int policy, int prio)
+{
+	int					ret;
+	struct sched_param	param;
+	pthread_attr_t		attr;
+
+	memset(&param, 0, sizeof(param));
+	ret = pthread_attr_init(&attr);
+	do_err(ret);
+
+	ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+	do_err(ret);
+
+	param.sched_priority = prio;
+
+	ret = pthread_attr_setschedpolicy(&attr, policy);
+	do_err(ret);
+
+	ret = pthread_attr_setschedparam(&attr, &param);
+	do_err(ret);
+
+	ret = pthread_create(thread, &attr, entry, para);
+	do_err(ret);
+}
+
+static void *dead_loop_entry(void *arg)
+{
+	int index = *(int *)arg;
+	struct sched_param param;
+	int cur = gettid();
+
+	sched_getparam(cur, &param);
+	pdata[index].pid = cur;
+	printf("cur:%d prio:%d\n", cur, param.sched_priority);
+
+	while (1) {
+		asm volatile("" ::: "memory");
+		pdata[index].cnt++;
+	}
+	return NULL;
+}
+
+static void handle_signal(int signal)
+{
+	int cnt = 0;
+
+	if (signal == SIGTERM) {
+		FILE *file = freopen(OUTPUT_FILE, "a", stdout);
+
+		if (file == NULL) {
+			perror("freopen");
+			exit(0);
+		}
+
+		while (cnt < thread_num) {
+			printf("pid:%d cnt:%ld\n", pdata[cnt].pid, pdata[cnt].cnt);
+			cnt++;
+		}
+		fclose(file);
+		exit(0);
+	}
+}
+
+static int dead_loop_create(int policy, int prio)
+{
+	int cnt = 0;
+	int ret;
+	void *status;
+	struct sched_param param;
+
+	param.sched_priority = prio;
+	pdata = malloc(thread_num * sizeof(struct thread_data));
+	do_false(!pdata);
+
+	if (policy) {
+		ret = sched_setscheduler(0, policy, &param);
+		do_err(ret);
+	}
+
+	while (cnt < thread_num) {
+		pdata[cnt].index = cnt;
+		create_thread_posix(dead_loop_entry, &pdata[cnt].thread,
+								 &pdata[cnt].index, policy, prio);
+		cnt++;
+	}
+
+	signal(SIGTERM, handle_signal);
+
+	cnt = 0;
+	while (cnt < thread_num) {
+		pthread_join(pdata[cnt].thread, &status);
+		cnt++;
+	}
+
+	free(pdata);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int policy = 2;
+	int prio = 50;
+
+	if (argc == 2)
+		thread_num = atoi(argv[1]);
+
+	if (argc == 3) {
+		thread_num = atoi(argv[1]);
+		policy = atoi(argv[2]);
+		if (policy == 0)
+			prio = 0;
+	}
+
+	if (argc == 4) {
+		thread_num = atoi(argv[1]);
+		policy = atoi(argv[2]);
+		prio = atoi(argv[3]);
+	}
+
+	dead_loop_create(policy, prio);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sched/rt_group_sched_test.sh b/tools/testing/selftests/sched/rt_group_sched_test.sh
new file mode 100755
index 000000000000..9031250a2684
--- /dev/null
+++ b/tools/testing/selftests/sched/rt_group_sched_test.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test for rt group scheduling
+# Date: June 27, 2024
+# Author: Xavier <xavier_qy@163.com>
+
+# Record the list of child process PIDs
+PIDS=()
+
+# File for redirected output
+LOGFILE="rt_group_sched_test.log"
+
+# Cleanup function: kill all recorded child processes and unmount the cgroup
+function cleanup() {
+	echo "Cleaning up..."
+	for pid in "${PIDS[@]}"; do
+		if kill -0 $pid 2>/dev/null; then
+			kill -TERM $pid
+		fi
+	done
+
+	# Sleep for a while to ensure the processes are properly killed
+	sleep 2
+
+	# Unmount the cgroup filesystem
+	umount /sys/fs/cgroup/cpu 2>/dev/null
+	umount /sys/fs/cgroup 2>/dev/null
+	echo "Cleanup completed."
+
+	# Ensure the LOGFILE exists and is correct
+	if [ ! -f "$LOGFILE" ]; then
+		echo "$LOGFILE not found!"
+		exit 1
+	fi
+
+	# Initialize the total count variable
+	total=0
+
+	# Read matching lines and calculate the total sum
+	while IFS= read -r line
+	do
+		# Use grep to match lines containing 'pid:' and 'cnt:', and extract the value of cnt
+		if echo "$line" | grep -q '^pid:[[:digit:]]\+ cnt:[[:digit:]]\+'; then
+			cnt=$(echo "$line" | sed -n \
+			  's/^pid:[[:digit:]]\+ cnt:\([[:digit:]]\+\)/\1/p')
+			total=$((total + cnt))
+		fi
+	done < "$LOGFILE"
+
+	# Print the total sum
+	echo "Total cnt: $total"
+	echo "Finished processing."
+}
+
+# Capture actions when interrupted or terminated by a signal
+trap cleanup EXIT
+
+# Start the cgroup filesystem and create the necessary directories
+function setup_cgroups() {
+	mount -t tmpfs -o mode=755 cgroup_root /sys/fs/cgroup
+	mkdir -p /sys/fs/cgroup/cpu
+	mount -t cgroup -o cpu none /sys/fs/cgroup/cpu
+}
+
+# Create cgroup subdirectories and configure their settings
+function create_child_cgroup() {
+	local base_dir=$1
+	local name=$2
+	local rt_period=$3
+	local rt_runtime=$4
+	mkdir -p "$base_dir/$name"
+	echo $rt_period > "$base_dir/$name/cpu.rt_period_us"
+	echo $rt_runtime > "$base_dir/$name/cpu.rt_runtime_us"
+}
+# Launch a process and add it to the specified cgroup
+function launch_process() {
+	local process_name=$1
+
+	# Three parameters representing the number of child threads, scheduling policy, and priority
+	local args=$2
+	local cgroup_path=$3
+
+	# Launch the process
+	exec -a $process_name ./deadloop $args &
+	local pid=$!
+	PIDS+=($pid)
+
+	# Short sleep to ensure the process starts
+	sleep 1
+
+	# Check if the process started successfully
+	if ! kill -0 $pid 2>/dev/null; then
+		echo "Error: process $process_name (pid $pid) is not running."
+		exit 1
+	fi
+
+	echo $pid > "$cgroup_path/cgroup.procs"
+	echo "Process $process_name with PID $pid added to cgroup $cgroup_path"
+}
+
+# Main function running all tasks
+function main() {
+	echo "The test needs 30 seconds..."
+	rm -f "$LOGFILE"
+	setup_cgroups
+	create_child_cgroup "/sys/fs/cgroup/cpu" "child1" 1000000 800000
+	create_child_cgroup "/sys/fs/cgroup/cpu/child1" "child2" 1000000 700000
+	create_child_cgroup "/sys/fs/cgroup/cpu/child1/child2" "child3" 1000000 600000
+	launch_process "child1" "3 2 50" "/sys/fs/cgroup/cpu/child1"
+	launch_process "child2" "3 2 50" "/sys/fs/cgroup/cpu/child1/child2"
+	launch_process "child3" "1 2 50" "/sys/fs/cgroup/cpu/child1/child2/child3"
+	launch_process "tg_root" "1 2 50" "/sys/fs/cgroup/cpu"
+
+	# Run for 30 seconds
+	sleep 30
+}
+
+# Execute the main function
+main
-- 
2.45.2