kernel/sched/debug.c | 1 +- kernel/sched/fair.c | 49 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/pelt.c | 2 +- kernel/sched/sched.h | 7 ++++-- 4 files changed, 51 insertions(+), 8 deletions(-)
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00
sched/eevdf: More PELT vs DELAYED_DEQUEUE
Vincent and Dietmar noted that while commit fc1892becd56 fixes the
entity runnable stats, it does not adjust the cfs_rq runnable stats,
which are based off of h_nr_running.
Track h_nr_delayed such that we can discount those and adjust the
signal.
Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net
---
kernel/sched/debug.c | 1 +-
kernel/sched/fair.c | 49 ++++++++++++++++++++++++++++++++++++++-----
kernel/sched/pelt.c | 2 +-
kernel/sched/sched.h | 7 ++++--
4 files changed, 51 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de1dc52..35974ac 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -844,6 +844,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
cfs_rq->idle_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 922d690..0bc5e62 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5456,9 +5456,31 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_delayed++;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_delayed--;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
}
@@ -5488,7 +5510,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->next == se)
cfs_rq->next = NULL;
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 1;
+ set_delayed(se);
return false;
}
}
@@ -5907,7 +5929,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock);
@@ -5940,6 +5962,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;
@@ -5963,6 +5986,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_delayed -= delayed_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -5985,6 +6009,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_delayed -= delayed_delta;
}
/* At this point se is NULL and we are at root level*/
@@ -6010,7 +6035,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
+ long task_delta, idle_task_delta, delayed_delta;
long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6046,6 +6071,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -6060,6 +6086,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_delayed += delayed_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6077,6 +6104,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_delayed += delayed_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6930,7 +6958,7 @@ requeue_delayed_entity(struct sched_entity *se)
}
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 0;
+ clear_delayed(se);
}
/*
@@ -6944,6 +6972,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
+ int h_nr_delayed = 0;
int task_new = !(flags & ENQUEUE_WAKEUP);
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
@@ -6970,6 +6999,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+ if (task_new)
+ h_nr_delayed = !!se->sched_delayed;
+
for_each_sched_entity(se) {
if (se->on_rq) {
if (se->sched_delayed)
@@ -6992,6 +7024,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_delayed += h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
@@ -7015,6 +7048,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_delayed += h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
@@ -7077,6 +7111,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
struct task_struct *p = NULL;
int idle_h_nr_running = 0;
int h_nr_running = 0;
+ int h_nr_delayed = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
@@ -7084,6 +7119,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
p = task_of(se);
h_nr_running = 1;
idle_h_nr_running = task_has_idle_policy(p);
+ if (!task_sleep && !task_delayed)
+ h_nr_delayed = !!se->sched_delayed;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -7101,6 +7138,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
@@ -7139,6 +7177,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_delayed -= h_nr_delayed;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = h_nr_running;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index fa52906..21e3ff5 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running,
+ cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3744f16..d91360b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -603,6 +603,7 @@ struct cfs_rq {
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ unsigned int h_nr_delayed;
s64 avg_vruntime;
u64 avg_load;
@@ -813,8 +814,10 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se))
- se->runnable_weight = se->my_q->h_nr_running;
+ if (!entity_is_task(se)) {
+ struct cfs_rq *cfs_rq = se->my_q;
+ se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+ }
}
static inline long se_runnable(struct sched_entity *se)
Hello Peter, On 9/10/2024 1:39 PM, tip-bot2 for Peter Zijlstra wrote: > The following commit has been merged into the sched/core branch of tip: > > Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9 > Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9 > Author: Peter Zijlstra <peterz@infradead.org> > AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00 > Committer: Peter Zijlstra <peterz@infradead.org> > CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00 > > sched/eevdf: More PELT vs DELAYED_DEQUEUE > > Vincent and Dietmar noted that while commit fc1892becd56 fixes the > entity runnable stats, it does not adjust the cfs_rq runnable stats, > which are based off of h_nr_running. > > Track h_nr_delayed such that we can discount those and adjust the > signal. > > Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE") > Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com> > Reported-by: Vincent Guittot <vincent.guittot@linaro.org> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net I've been testing this fix for a while now to see if it helps the regressions reported around EEVDF complete. The issue with negative "h_nr_delayed" reported by Luis previously seems to have been fixed as a result of commit 75b6499024a6 ("sched/fair: Properly deactivate sched_delayed task upon class change") I've been running stress-ng for a while and haven't seen any cases of negative "h_nr_delayed". I'd also added the following WARN_ON() to see if there are any delayed tasks on the cfs_rq before switching to idle in some of my previous experiments and I did not see any splat during my benchmark runs. 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 621696269584..c19a31fa46c9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -457,6 +457,9 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { + /* All delayed tasks must be picked off before switching to idle */ + SCHED_WARN_ON(rq->cfs.h_nr_delayed); + update_idle_core(rq); scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); -- If you are including this back, feel free to add: Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > [..snip..] -- Thanks and Regards, Prateek
Hi, On 11/27/24 04:17, K Prateek Nayak wrote: > Hello Peter, > > On 9/10/2024 1:39 PM, tip-bot2 for Peter Zijlstra wrote: >> The following commit has been merged into the sched/core branch of tip: >> >> Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9 >> Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9 >> Author: Peter Zijlstra <peterz@infradead.org> >> AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00 >> Committer: Peter Zijlstra <peterz@infradead.org> >> CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00 >> >> sched/eevdf: More PELT vs DELAYED_DEQUEUE >> >> Vincent and Dietmar noted that while commit fc1892becd56 fixes the >> entity runnable stats, it does not adjust the cfs_rq runnable stats, >> which are based off of h_nr_running. >> >> Track h_nr_delayed such that we can discount those and adjust the >> signal. >> >> Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE") >> Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com> >> Reported-by: Vincent Guittot <vincent.guittot@linaro.org> >> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> >> Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net > > I've been testing this fix for a while now to see if it helps the > regressions reported around EEVDF complete. The issue with negative > "h_nr_delayed" reported by Luis previously seem to have been fixed as a > result of commit 75b6499024a6 ("sched/fair: Properly deactivate > sched_delayed task upon class change") I recall having 75b6499024a6 in my testing tree and somehow still noticing unbalanced accounting for h_nr_delayed, where it would be decremented twice eventually, leading to negative numbers. I might have to give it another go if we're considering including the change as-is, just to make sure. Since our setups are slightly different, we could be exercising some slightly different paths. Did this patch help with the regressions you noticed though? 
> > I've been running stress-ng for a while and haven't seen any cases of > negative "h_nr_delayed". I'd also added the following WARN_ON() to see > if there are any delayed tasks on the cfs_rq before switching to idle in > some of my previous experiments and I did not see any splat during my > benchmark runs. > > diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c > index 621696269584..c19a31fa46c9 100644 > --- a/kernel/sched/idle.c > +++ b/kernel/sched/idle.c > @@ -457,6 +457,9 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t > > static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) > { > + /* All delayed tasks must be picked off before switching to idle */ > + SCHED_WARN_ON(rq->cfs.h_nr_delayed); > + > update_idle_core(rq); > scx_update_idle(rq, true); > schedstat_inc(rq->sched_goidle); > -- > > If you are including this back, feel free to add: > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > >> [..snip..] >
(+ Saravana, Samuel) Hello Luis, On 11/27/2024 3:04 PM, Luis Machado wrote: > Hi, > > On 11/27/24 04:17, K Prateek Nayak wrote: >> Hello Peter, >> >> On 9/10/2024 1:39 PM, tip-bot2 for Peter Zijlstra wrote: >>> The following commit has been merged into the sched/core branch of tip: >>> >>> Commit-ID: 2e05f6c71d36f8ae1410a1cf3f12848cc17916e9 >>> Gitweb: https://git.kernel.org/tip/2e05f6c71d36f8ae1410a1cf3f12848cc17916e9 >>> Author: Peter Zijlstra <peterz@infradead.org> >>> AuthorDate: Fri, 06 Sep 2024 12:45:25 +02:00 >>> Committer: Peter Zijlstra <peterz@infradead.org> >>> CommitterDate: Tue, 10 Sep 2024 09:51:15 +02:00 >>> >>> sched/eevdf: More PELT vs DELAYED_DEQUEUE >>> >>> Vincent and Dietmar noted that while commit fc1892becd56 fixes the >>> entity runnable stats, it does not adjust the cfs_rq runnable stats, >>> which are based off of h_nr_running. >>> >>> Track h_nr_delayed such that we can discount those and adjust the >>> signal. >>> >>> Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE") >>> Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com> >>> Reported-by: Vincent Guittot <vincent.guittot@linaro.org> >>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> >>> Link: https://lkml.kernel.org/r/20240906104525.GG4928@noisy.programming.kicks-ass.net >> >> I've been testing this fix for a while now to see if it helps the >> regressions reported around EEVDF complete. The issue with negative >> "h_nr_delayed" reported by Luis previously seem to have been fixed as a >> result of commit 75b6499024a6 ("sched/fair: Properly deactivate >> sched_delayed task upon class change") > > I recall having 75b6499024a6 in my testing tree and somehow still noticing > unbalanced accounting for h_nr_delayed, where it would be decremented > twice eventually, leading to negative numbers. > > I might have to give it another go if we're considering including the change > as-is, just to make sure. 
Since our setups are slightly different, we could > be exercising some slightly different paths. That would be great! Thank you :) Now that I see, you did have Valentin's patches in your tree during testing https://lore.kernel.org/lkml/6df12fde-1e0d-445f-8f8a-736d11f9ee41@arm.com/ Perhaps it could be the fixup commit 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()") or the fact that my benchmark didn't stress this path enough to break you as you mentioned. I would have still expected it to hit that SCHED_WARN_ON() I had added in set_next_task_idle() if something went sideways. > > Did this patch help with the regressions you noticed though? I believe it was Saravana who was seeing anomalies in PELT ramp-up with DELAY_DEQUEUE. My test setup is currently not equipped to catch it but Saravana was interested in these fixes being backported to v6.12 LTS in https://lore.kernel.org/lkml/CAGETcx_1pZCtWiBbDmUcxEw3abF5dr=XdFCkH9zXWK75g7457w@mail.gmail.com/ I believe tracking h_nr_delayed and disregarding delayed tasks in certain places is a necessary fix. None of the benchmarks in my test setup seem to mind running without it but I'm also doing most of my testing with performance governor and the PELT anomalies seem to affect more from a PM perspective and not load balancing perspective. 
>> >> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c >> index 621696269584..c19a31fa46c9 100644 >> --- a/kernel/sched/idle.c >> +++ b/kernel/sched/idle.c >> @@ -457,6 +457,9 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t >> >> static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) >> { >> + /* All delayed tasks must be picked off before switching to idle */ >> + SCHED_WARN_ON(rq->cfs.h_nr_delayed); >> + >> update_idle_core(rq); >> scx_update_idle(rq, true); >> schedstat_inc(rq->sched_goidle); >> -- >> >> If you are including this back, feel free to add: >> >> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> >> >>> [..snip..] >> > -- Thanks and Regards, Prateek
© 2016 - 2024 Red Hat, Inc.