Optimize the enqueue and dequeue of rt_se by using a bottom-up removal
approach. Specifically, when removing an rt_se at a certain level, if
the highest priority of the rq associated with that rt_se has not
changed, there is no need to continue removing rt_se at higher levels.
In that case, only the total number of removed tasks needs to be
recorded, and the rt_nr_running count of the higher-level rqs is
decreased accordingly.
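
As an illustration only (this is not kernel code; toy_rq, toy_dequeue and
the other names below are made up), here is a minimal userspace sketch of
the bottom-up early exit: the walk stops touching group entities as soon
as a level's highest priority turns out to be unaffected, and the
remaining levels only have their rt_nr_running adjusted.

#include <stdio.h>

#define MAX_RT_PRIO	100
#define LEVELS		3

struct toy_rq {
	unsigned int	rt_nr_running;
	int		highest_prio;	/* lower value == higher priority */
};

/*
 * Remove 'nr' tasks starting at the leaf level. new_prio[i] is what
 * level i's highest priority becomes after the removal (precomputed
 * here for brevity). As soon as a level's highest priority is
 * unchanged, the remaining levels only get their rt_nr_running
 * adjusted; no further work on the group entities above is needed.
 */
static void toy_dequeue(struct toy_rq rq[LEVELS], unsigned int nr,
			const int new_prio[LEVELS])
{
	int i;

	for (i = 0; i < LEVELS; i++) {
		rq[i].rt_nr_running -= nr;
		if (new_prio[i] == rq[i].highest_prio) {
			/* priority unchanged: fix the counts above and stop */
			for (i++; i < LEVELS; i++)
				rq[i].rt_nr_running -= nr;
			return;
		}
		rq[i].highest_prio = new_prio[i];	/* keep walking up */
	}
}

int main(void)
{
	struct toy_rq rq[LEVELS] = {
		{ .rt_nr_running = 2, .highest_prio = 10 },	/* leaf group   */
		{ .rt_nr_running = 5, .highest_prio = 10 },	/* parent group */
		{ .rt_nr_running = 9, .highest_prio =  5 },	/* root rq      */
	};
	/* Remove one prio-10 task; another prio-10 task stays in the leaf,
	 * so no level's highest priority changes and only counts are fixed. */
	const int new_prio[LEVELS] = { 10, 10, 5 };
	int i;

	toy_dequeue(rq, 1, new_prio);

	for (i = 0; i < LEVELS; i++)
		printf("level %d: rt_nr_running=%u highest_prio=%d\n",
		       i, rq[i].rt_nr_running, rq[i].highest_prio);
	return 0;
}
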
Signed-off-by: Xavier <xavier_qy@163.com>
---
kernel/sched/debug.c | 48 ++++++++
kernel/sched/rt.c | 287 +++++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 1 +
3 files changed, 298 insertions(+), 38 deletions(-)
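
Note on the enqueue direction (expanded in the comment added above
dequeue_rt_stack() in the diff below): as I read the patch, a group
entity that is already queued only needs to be dequeued and requeued
when the task being added has a numerically lower (i.e. higher)
priority than the group's current highest. The hypothetical helper
below just expresses that condition; it is not part of the patch.

#include <stdio.h>

/* Lower value == higher priority. */
static int group_needs_requeue(int group_highest_prio, int new_task_prio)
{
	return new_task_prio < group_highest_prio;
}

int main(void)
{
	printf("%d\n", group_needs_requeue(10, 5));	/* 1: requeue needed */
	printf("%d\n", group_needs_requeue(10, 20));	/* 0: counts only    */
	return 0;
}
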
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..352ee55da25e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -712,6 +712,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#endif
}
+static void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
+{
+ struct task_struct *task;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se->my_q) {
+ SEQ_printf_task_group_path(m, rt_se->my_q->tg, "%s\n");
+ return;
+ }
+#endif
+ task = container_of(rt_se, struct task_struct, rt);
+ SEQ_printf(m, " prio-%d, pid-%d, %s\n", task->prio, task->pid, task->comm);
+}
+
+/* Must be called with the rq lock held. */
+void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *rt_se;
+ struct list_head *queue, *head;
+ unsigned long bitmap[2];
+ int idx;
+ int count = 0;
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ memcpy(bitmap, array->bitmap, sizeof(unsigned long) * 2);
+ idx = sched_find_first_bit(bitmap);
+ WARN_ON_ONCE(idx >= MAX_RT_PRIO);
+
+ while (1) {
+ clear_bit(idx, bitmap);
+ queue = array->queue + idx;
+ head = queue;
+ queue = queue->next;
+ do {
+ rt_se = list_entry(queue, struct sched_rt_entity, run_list);
+ print_rt_se(m, rt_se);
+ queue = queue->next;
+ count++;
+ } while (queue != head);
+ idx = sched_find_first_bit(bitmap);
+ if (idx >= MAX_RT_PRIO)
+ break;
+ }
+}
+
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..b18c424a50d2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1113,7 +1113,7 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
#endif /* CONFIG_SMP */
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-static void
+static int
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
int prev_prio = rt_rq->highest_prio.curr;
@@ -1122,9 +1122,11 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
rt_rq->highest_prio.curr = prio;
inc_rt_prio_smp(rt_rq, prio, prev_prio);
+
+ return prev_prio > prio;
}
-static void
+static int
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
int prev_prio = rt_rq->highest_prio.curr;
@@ -1149,12 +1151,22 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
}
dec_rt_prio_smp(rt_rq, prio, prev_prio);
+ if (rt_rq->highest_prio.curr > prio)
+ return prio;
+ else
+ return MAX_RT_PRIO;
}
#else
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline int inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ return 0;
+}
+static inline int dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ return 0;
+}
#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
@@ -1218,28 +1230,31 @@ unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
}
static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
+ int prio_change;
WARN_ON(!rt_prio(prio));
rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
- inc_rt_prio(rt_rq, prio);
+ prio_change = inc_rt_prio(rt_rq, prio);
inc_rt_group(rt_se, rt_rq);
+ return prio_change;
}
static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int prio)
{
+ int prio_changed;
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
- dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ prio_changed = dec_rt_prio(rt_rq, prio);
dec_rt_group(rt_se, rt_rq);
+ return prio_changed;
}
/*
@@ -1255,12 +1270,13 @@ static inline bool move_entity(unsigned int flags)
return true;
}
-static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
+ struct rt_prio_array *array, int last_prio)
{
list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);
+ if (list_empty(array->queue + last_prio))
+ __clear_bit(last_prio, array->bitmap);
rt_se->on_list = 0;
}
@@ -1371,7 +1387,12 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
}
}
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/*
+ * Returns: -1 indicates that rt_se was not enqueued, 0 indicates that the highest
+ * priority of the rq did not change after enqueue, and 1 indicates that the highest
+ * priority of the rq changed after enqueue.
+ */
+static int __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1386,8 +1407,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
*/
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
if (rt_se->on_list)
- __delist_rt_entity(rt_se, array);
- return;
+ __delist_rt_entity(rt_se, array, rt_se_prio(rt_se));
+ return -1;
}
if (move_entity(flags)) {
@@ -1402,73 +1423,263 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
}
rt_se->on_rq = 1;
- inc_rt_tasks(rt_se, rt_rq);
+ return inc_rt_tasks(rt_se, rt_rq);
}
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/**
+ * __dequeue_rt_entity - remove rt_se from its rt_rq
+ *
+ * @rt_se: the entity to be removed
+ * @last_prio: the highest priority of this rt_se before the previous
+ * round of removal
+ * @flags: dequeue flags
+ *
+ * Returns: MAX_RT_PRIO if the highest priority of the rq did not change
+ * during this removal; otherwise the previous highest priority, to be
+ * used in the next round of removal.
+ */
+static int __dequeue_rt_entity(struct sched_rt_entity *rt_se, int last_prio,
+ unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
if (move_entity(flags)) {
WARN_ON_ONCE(!rt_se->on_list);
- __delist_rt_entity(rt_se, array);
+ __delist_rt_entity(rt_se, array, last_prio);
}
rt_se->on_rq = 0;
- dec_rt_tasks(rt_se, rt_rq);
+ return dec_rt_tasks(rt_se, rt_rq, last_prio);
+}
+
+static inline void dec_rq_nr_running(struct sched_rt_entity *rt_se,
+ unsigned int rt, unsigned int rr)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ rt_rq->rt_nr_running -= rt;
+ rt_rq->rr_nr_running -= rr;
+}
+
+static inline void add_rq_nr_running(struct sched_rt_entity *rt_se,
+ unsigned int rt, unsigned int rr)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ rt_rq->rt_nr_running += rt;
+ rt_rq->rr_nr_running += rr;
+}
+
+static inline bool on_top_rt_rq(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se->parent)
+ return false;
+#endif
+ return true;
}
/*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
+ * To optimize the enqueue and dequeue of rt_se, this function uses a
+ * bottom-up removal approach. When removing an rt_se at a certain
+ * level, if the highest priority of the rq associated with that rt_se
+ * has not changed, there is no need to keep removing rt_se at higher
+ * levels. In that case, only the total number of removed tasks needs
+ * to be recorded, and the rt_nr_running count of the higher-level rqs
+ * is decreased accordingly.
+ *
+ * For enqueue operations, even if an rt_se at a certain level is
+ * already on its rq, the priority of the higher-level rq still needs
+ * to be checked. If the priority of the higher-level rq is lower than
+ * that of the rt_se being added, the entity must be removed, because
+ * updating the highest priority of the rq on insertion repositions
+ * the rq within its parent rq.
+ *
+ * Conversely, for dequeue operations, if an rt_se at a certain level
+ * is not on its rq, the walk can stop immediately to avoid unnecessary
+ * checks and handling.
+ *
+ * For enqueue operations, the return value is the last rt_se that was
+ * removed; for dequeue operations, it is the last rt_se that was
+ * either removed or had its rt_nr_running updated.
*/
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
+static struct sched_rt_entity *dequeue_rt_stack(struct sched_rt_entity *rt_se,
+ unsigned int flags, int for_enqueue)
{
- struct sched_rt_entity *back = NULL;
- unsigned int rt_nr_running;
+ struct sched_rt_entity *last = rt_se;
+ struct sched_rt_entity *origin = rt_se;
+ unsigned int del_rt_nr = 0;
+ unsigned int del_rr_nr = 0;
+ int prio_changed = rt_se_prio(rt_se);
+ int sub_on_rq = 1;
for_each_sched_rt_entity(rt_se) {
- rt_se->back = back;
- back = rt_se;
- }
+ if (on_rt_rq(rt_se)) {
+ if (sub_on_rq) {
+ /*
+ * The number of tasks removed at the lower levels also needs to be
+ * subtracted from the rq of the current rt_se, since that rq must no
+ * longer account for the removed tasks.
+ */
+ dec_rq_nr_running(rt_se, del_rt_nr, del_rr_nr);
+ if ((prio_changed != MAX_RT_PRIO) ||
+ (rt_se_prio(rt_se) > rt_se_prio(origin))) {
+ /*
+ * If the removal of the lower-level rt_se causes the
+ * highest priority of the current rq to change, or if the
+ * priority of the current rq is lower than that of the rt_se
+ * being added, then the current rt_se also needs to be removed
+ * from its parent rq, and the number of deleted tasks
+ * should be accumulated.
+ */
+ if (prio_changed == MAX_RT_PRIO)
+ prio_changed = rt_se_prio(rt_se);
+ del_rt_nr += rt_se_nr_running(rt_se);
+ del_rr_nr += rt_se_rr_nr_running(rt_se);
+ prio_changed = __dequeue_rt_entity(rt_se,
+ prio_changed, flags);
+ last = rt_se;
+ } else if (!for_enqueue) {
+ /* For dequeue, 'last' may be an rt_se whose rt_nr_running was merely updated. */
+ last = rt_se;
+ }
+ } else {
+ /*
+ * This branch can only be reached for enqueue, since dequeue breaks
+ * out of the loop as soon as an rt_se is not on its rq.
+ * If the sub-level entity is not on the rq and the current rt_se's
+ * priority is lower than the one being added, the current rt_se
+ * needs to be removed as well.
+ */
+ prio_changed = rt_se_prio(rt_se);
+ if (prio_changed > rt_se_prio(origin)) {
+ del_rt_nr += rt_se_nr_running(rt_se);
+ del_rr_nr += rt_se_rr_nr_running(rt_se);
+ prio_changed = __dequeue_rt_entity(rt_se,
+ prio_changed, flags);
+ last = rt_se;
+ } else {
+ prio_changed = MAX_RT_PRIO;
+ }
+ }
- rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
+ /*
+ * If the current rt_se is on the top rt_rq, then the count of the
+ * already deleted entities, plus the count of the rt_rq where the
+ * current rt_se is located, needs to be subtracted from the top rt_rq.
+ */
+ if (on_top_rt_rq(rt_se)) {
+ dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+ del_rt_nr + rt_rq_of_se(rt_se)->rt_nr_running);
+ }
+ sub_on_rq = 1;
+ } else if (for_enqueue) {
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
- for (rt_se = back; rt_se; rt_se = rt_se->back) {
- if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se, flags);
+ /*
+ * For an enqueue operation, if an rt_se at a certain level is found
+ * not to be on its rq, the previously accumulated counts are reset to zero.
+ */
+ prio_changed = MAX_RT_PRIO;
+ sub_on_rq = 0;
+ del_rt_nr = 0;
+ del_rr_nr = 0;
+
+ /*
+ * If the current group is being throttled, then there is no need to check
+ * higher levels since enqueueing will not affect higher-level nodes.
+ */
+ if (group_rq && rt_rq_throttled(group_rq))
+ break;
+
+ if (on_top_rt_rq(rt_se))
+ dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+ rt_rq_of_se(rt_se)->rt_nr_running);
+ } else {
+ last = rt_se;
+ break;
+ }
}
- dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
+ return last;
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ struct sched_rt_entity *last;
+ unsigned int add_rt_nr = 0;
+ unsigned int add_rr_nr = 0;
+ int enqueue = 1;
+ int prio_change = 1;
update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
- dequeue_rt_stack(rt_se, flags);
- for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, flags);
+ last = dequeue_rt_stack(rt_se, flags, 1);
+
+ for_each_sched_rt_entity(rt_se) {
+ if (enqueue || !on_rt_rq(rt_se) || (prio_change == 1)) {
+ prio_change = __enqueue_rt_entity(rt_se, flags);
+ if (prio_change >= 0) {
+ add_rt_nr = rt_se_nr_running(rt_se);
+ add_rr_nr = rt_se_rr_nr_running(rt_se);
+ } else {
+ add_rt_nr = add_rr_nr = 0;
+ }
+ } else {
+ add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+ }
+
+ if (rt_se == last)
+ enqueue = 0;
+ }
+
enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ struct sched_rt_entity *last;
+ unsigned int add_rt_nr = 0;
+ unsigned int add_rr_nr = 0;
+ int prio_change = 1;
update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
- dequeue_rt_stack(rt_se, flags);
+ last = dequeue_rt_stack(rt_se, flags, 0);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ if (rt_rq && rt_rq->rt_nr_running) {
+ if (on_rt_rq(rt_se)) {
+ add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+ } else {
+ prio_change = __enqueue_rt_entity(rt_se, flags);
+ if (prio_change == 0) {
+ /*
+ * If enqueue is successful and the priority of the rq has
+ * not changed, then the parent node only needs to add the
+ * count of the current rt_se. Otherwise, the parent rt_se
+ * will also need to be enqueued.
+ */
+ add_rt_nr = rt_se_nr_running(rt_se);
+ add_rr_nr = rt_se_rr_nr_running(rt_se);
+ }
+ }
+ } else {
+ add_rt_nr = add_rr_nr = 0;
+ }
- if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, flags);
+ /*
+ * 'last' is the last rt_se that was removed or had its count
+ * modified, so the entities above it do not need to be updated.
+ */
+ if (rt_se == last)
+ break;
}
+
enqueue_top_rt_rq(&rq->rt);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef20c61004eb..821d65106d13 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2879,6 +2879,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
--
2.45.2
Hi all,
I would like to ask everyone for your opinions or thoughts on the RT group scheduling
optimization. At this stage, is it ready to be merged into the corresponding branch?
Thanks.
--
Best Regards,
Xavier
Just a reminder, does anyone have any comments or feedback on the patch?
--
Best Regards,
Xavier