It turns out that zero_vruntime tracking is broken when there is but a single
task running. Current update paths are through __{en,de}queue_entity(), and
when there is but a single task, pick_next_task() will always return that one
task, and put_prev_set_next_task() will end up in neither function.
This can cause entity_key() to grow indefinitely large and cause overflows,
leading to much pain and suffering.
Furthermore, doing update_zero_vruntime() from __{de,en}queue_entity(), which
are called from {set_next,put_prev}_entity() has problems because:
- set_next_entity() calls __dequeue_entity() before it does cfs_rq->curr = se.
This means the avg_vruntime() will see the removal but not current, missing
the entity for accounting.
- put_prev_entity() calls __enqueue_entity() before it does cfs_rq->curr =
NULL. This means the avg_vruntime() will see the addition *and* current,
leading to double accounting.
Both cases are incorrect/inconsistent.
Noting that avg_vruntime is already called on each {en,de}queue, remove the
explicit avg_vruntime() calls (which removes an extra 64bit division for each
{en,de}queue) and have avg_vruntime() update zero_vruntime itself.
Additionally, have the tick call avg_vruntime() -- discarding the result, but
for the side-effect of updating zero_vruntime.
While there, optimize avg_vruntime() by noting that the average of one value is
rather trivial to compute.
Test case:
# taskset -c -p 1 $$
# taskset -c 2 bash -c 'while :; do :; done&'
# cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>"
PRE:
.zero_vruntime : 31316.407903
>R bash 487 50787.345112 E 50789.145972 2.800000 50780.298364 16 120 0.000000 0.000000 0.000000 /
.zero_vruntime : 382548.253179
>R bash 487 427275.204288 E 427276.003584 2.800000 427268.157540 23 120 0.000000 0.000000 0.000000 /
POST:
.zero_vruntime : 17259.709467
>R bash 526 17259.709467 E 17262.509467 2.800000 16915.031624 9 120 0.000000 0.000000 0.000000 /
.zero_vruntime : 18702.723356
>R bash 526 18702.723356 E 18705.523356 2.800000 18358.045513 9 120 0.000000 0.000000 0.000000 /
Fixes: 79f3f9bedd14 ("sched/eevdf: Fix min_vruntime vs avg_vruntime")
Reported-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
---
kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 57 insertions(+), 27 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -589,6 +589,21 @@ static inline bool entity_before(const s
return vruntime_cmp(a->deadline, "<", b->deadline);
}
+/*
+ * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
+ * and this value should be no more than two lag bounds. Which puts it in the
+ * general order of:
+ *
+ * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
+ *
+ * which is around 44 bits in size (on 64bit); that is 20 for
+ * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
+ * however many msec the actual slice+tick ends up being.
+ *
+ * (disregarding the actual divide-by-weight part makes for the worst case
+ * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
+ * being the zero-lag point).
+ */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
@@ -676,39 +691,61 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq
}
static inline
-void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
+ cfs_rq->zero_vruntime += delta;
}
/*
- * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
+ *
+ * Called in:
+ * - place_entity() -- before enqueue
+ * - update_entity_lag() -- before dequeue
+ * - entity_tick()
+ *
+ * This means it is one entry 'behind' but that puts it close enough to where
+ * the bound on entity_key() is at most two lag bounds.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
- long load = cfs_rq->sum_weight;
+ long weight = cfs_rq->sum_weight;
+ s64 delta = 0;
- if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ if (curr && !curr->on_rq)
+ curr = NULL;
- avg += entity_key(cfs_rq, curr) * weight;
- load += weight;
- }
+ if (weight) {
+ s64 runtime = cfs_rq->sum_w_vruntime;
+
+ if (curr) {
+ unsigned long w = scale_load_down(curr->load.weight);
+
+ runtime += entity_key(cfs_rq, curr) * w;
+ weight += w;
+ }
- if (load) {
/* sign flips effective floor / ceiling */
- if (avg < 0)
- avg -= (load - 1);
- avg = div_s64(avg, load);
+ if (runtime < 0)
+ runtime -= (weight - 1);
+
+ delta = div_s64(runtime, weight);
+ } else if (curr) {
+ /*
+ * When there is but one element, it is the average.
+ */
+ delta = curr->vruntime - cfs_rq->zero_vruntime;
}
- return cfs_rq->zero_vruntime + avg;
+ update_zero_vruntime(cfs_rq, delta);
+
+ return cfs_rq->zero_vruntime;
}
/*
@@ -777,16 +814,6 @@ int entity_eligible(struct cfs_rq *cfs_r
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static void update_zero_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
-
- sum_w_vruntime_update(cfs_rq, delta);
-
- cfs_rq->zero_vruntime = vruntime;
-}
-
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
{
struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -856,7 +883,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntim
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sum_w_vruntime_add(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +894,6 @@ static void __dequeue_entity(struct cfs_
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
sum_w_vruntime_sub(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -5524,6 +5549,11 @@ entity_tick(struct cfs_rq *cfs_rq, struc
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
+ /*
+ * Pulls along cfs_rq::zero_vruntime.
+ */
+ avg_vruntime(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
On Wed, Feb 18, 2026 at 11:58 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> It turns out that zero_vruntime tracking is broken when there is but a single
> task running. Current update paths are through __{en,de}queue_entity(), and
> when there is but a single task, pick_next_task() will always return that one
> task, and put_prev_set_next_task() will end up in neither function.
>
> This can cause entity_key() to grow indefinitely large and cause overflows,
> leading to much pain and suffering.
>
> Furtermore, doing update_zero_vruntime() from __{de,en}queue_entity(), which
> are called from {set_next,put_prev}_entity() has problems because:
>
> - set_next_entity() calls __dequeue_entity() before it does cfs_rq->curr = se.
> This means the avg_vruntime() will see the removal but not current, missing
> the entity for accounting.
>
> - put_prev_entity() calls __enqueue_entity() before it does cfs_rq->curr =
> NULL. This means the avg_vruntime() will see the addition *and* current,
> leading to double accounting.
>
> Both cases are incorrect/inconsistent.
>
> Noting that avg_vruntime is already called on each {en,de}queue, remove the
> explicit avg_vruntime() calls (which removes an extra 64bit division for each
> {en,de}queue) and have avg_vruntime() update zero_vruntime itself.
>
> Additionally, have the tick call avg_vruntime() -- discarding the result, but
> for the side-effect of updating zero_vruntime.
Hey all,
So in stress testing with my full proxy-exec series, I was
occasionally tripping over the situation where __pick_eevdf() returns
null which quickly crashes.
Initially I was thinking the bug was in my out of tree patches, but I
later found I could trip it with upstream as well, and I believe I
have bisected it down to this patch. Though reproduction often takes
3-4 hours, and I usually quit testing after 5 hours, so it's possible
I have some false negatives on the problem and it could have arisen
earlier.
From a little bit of debugging (done with the full proxy exec series,
I need to re-debug with vanilla), usual symptom is that we run into a
situation where !entity_eligible(cfs_rq, curr), so curr gets set to
null (though in one case, I saw cfs_rq->curr start as null), and then
we never set best, and thus the `if (!best || ...) best = curr;`
assignment doesn't save us and we return null, and crash.
I still need to dig more into the eligibility values and also to dump
the rq to see why nothing is being found. I am running with
CONFIG_SCHED_PROXY_EXEC enabled, so there may yet be some collision
here between this change and the already upstream portions of Proxy
Exec (I'll have to do more testing to see if it reproduces without
that option enabled).
The backtrace is usually due to stress-ng stress-ng-yield test:
[ 3775.898617] BUG: kernel NULL pointer dereference, address: 0000000000000059
[ 3775.903089] #PF: supervisor read access in kernel mode
[ 3775.906068] #PF: error_code(0x0000) - not-present page
[ 3775.909102] PGD 0 P4D 0
[ 3775.910656] Oops: Oops: 0000 [#1] SMP NOPTI
[ 3775.913371] CPU: 36 UID: 0 PID: 269131 Comm: stress-ng-yield
Tainted: G W 7.0.0-rc5-00001-g42a93b71138f #5
PREEMPT(full)
[ 3775.920304] Tainted: [W]=WARN
[ 3775.922100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.17.0-debian-1.17.0-1 04/01/2014
[ 3775.927852] RIP: 0010:pick_task_fair+0x6f/0xb0
[ 3775.930466] Code: 85 ff 74 52 48 8b 47 48 48 85 c0 74 d6 80 78 58
00 74 d0 48 89 3c 24 e8 8f 9b ff ff 48 8b 3c 24 be 01 00 00 00 e8 51
74 ff ff <80> 78 59 00 74
c3 ba 21 00 00 00 48 89 c6 48 89 df e8 5b f1 ff ff
[ 3775.941027] RSP: 0018:ffffc9003827fde0 EFLAGS: 00010086
[ 3775.943949] RAX: 0000000000000000 RBX: ffff8881b972bc40 RCX: 0000000000000803
[ 3775.948179] RDX: 00000041acc1002a RSI: 000000b0cef5382a RDI: 000040138cc6cd49
[ 3775.952149] RBP: ffffc9003827fef8 R08: 0000000000000400 R09: 0000000000000002
[ 3775.956548] R10: 0000000000000024 R11: ffff8881b04a4d40 R12: ffff8881b04a4280
[ 3775.960480] R13: ffff8881b04a4280 R14: ffffffff82ce70a8 R15: ffff8881b972bc40
[ 3775.964713] FS: 00007f6ecb7a6b00(0000) GS:ffff888235beb000(0000)
knlGS:0000000000000000
[ 3775.969468] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3775.972960] CR2: 0000000000000059 CR3: 000000019c32a003 CR4: 0000000000370ef0
[ 3775.977008] Call Trace:
[ 3775.978581] <TASK>
[ 3775.979841] pick_next_task_fair+0x3c/0x8e0
[ 3775.982408] ? lock_is_held_type+0xcd/0x130
[ 3775.984833] __schedule+0x20f/0x14d0
[ 3775.987287] ? do_sched_yield+0xa2/0xe0
[ 3775.989365] schedule+0x3d/0x130
[ 3775.991376] __do_sys_sched_yield+0xe/0x20
[ 3775.993889] do_syscall_64+0xf3/0x680
[ 3775.996229] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 3776.000459] RIP: 0033:0x7f6ecc0e18c7
[ 3776.002757] Code: 73 01 c3 48 8b 0d 49 d5 0e 00 f7 d8 64 89 01 48
83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 18 00 00
00 0f 05 <48> 3d 01 f0 ff
ff 73 01 c3 48 8b 0d 19 d5 0e 00 f7 d8 64 89 01 48
I'll continue digging next week on this, but wanted to share in case
anyone else sees something obvious first.
thanks
-john
On Fri, Mar 27, 2026 at 10:44:28PM -0700, John Stultz wrote:
> On Wed, Feb 18, 2026 at 11:58 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > It turns out that zero_vruntime tracking is broken when there is but a single
> > task running. Current update paths are through __{en,de}queue_entity(), and
> > when there is but a single task, pick_next_task() will always return that one
> > task, and put_prev_set_next_task() will end up in neither function.
> >
> > This can cause entity_key() to grow indefinitely large and cause overflows,
> > leading to much pain and suffering.
> >
> > Furtermore, doing update_zero_vruntime() from __{de,en}queue_entity(), which
> > are called from {set_next,put_prev}_entity() has problems because:
> >
> > - set_next_entity() calls __dequeue_entity() before it does cfs_rq->curr = se.
> > This means the avg_vruntime() will see the removal but not current, missing
> > the entity for accounting.
> >
> > - put_prev_entity() calls __enqueue_entity() before it does cfs_rq->curr =
> > NULL. This means the avg_vruntime() will see the addition *and* current,
> > leading to double accounting.
> >
> > Both cases are incorrect/inconsistent.
> >
> > Noting that avg_vruntime is already called on each {en,de}queue, remove the
> > explicit avg_vruntime() calls (which removes an extra 64bit division for each
> > {en,de}queue) and have avg_vruntime() update zero_vruntime itself.
> >
> > Additionally, have the tick call avg_vruntime() -- discarding the result, but
> > for the side-effect of updating zero_vruntime.
>
> Hey all,
>
> So in stress testing with my full proxy-exec series, I was
> occasionally tripping over the situation where __pick_eevdf() returns
> null which quickly crashes.
> The backtrace is usually due to stress-ng stress-ng-yield test:
Suppose we have 2 runnable tasks, both doing yield. Then one will be
eligible and one will not be, because the average position must be in
between these two entities.
Therefore, the runnable task will be eligible, and be promoted a full
slice (all the tasks do is yield after all). This causes it to jump over
the other task and now the other task is eligible and it is no longer.
So we schedule.
Since we are runnable, there is no dequeue or enqueue. All we have is
the __enqueue_entity() and __dequeue_entity() from put_prev_task() /
set_next_task(). But per the fingered commit, those two no longer move
zero_vruntime ahead.
All that moves zero_vruntime is tick and full dequeue or enqueue.
This means, that if the two tasks playing leapfrog can reach the
critical speed to reach the overflow point inside one tick's worth of
time, we're up a creek.
If this is indeed the case, then the below should cure things.
This also means that running a HZ=100 config will increase the chances
of hitting this vs HZ=1000.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9298f49f842c..c7daaf941b26 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9307,6 +9307,7 @@ static void yield_task_fair(struct rq *rq)
if (entity_eligible(cfs_rq, se)) {
se->vruntime = se->deadline;
se->deadline += calc_delta_fair(se->slice, se);
+ avg_vruntime(cfs_rq);
}
}
On Mon, Mar 30, 2026 at 3:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
> This means, that if the two tasks playing leapfrog can reach the
> critical speed to reach the overflow point inside one tick's worth of
> time, we're up a creek.
>
> If this is indeed the case, then the below should cure things.
>
> This also means that running a HZ=100 config will increase the chances
> of hitting this vs HZ=1000.
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9298f49f842c..c7daaf941b26 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9307,6 +9307,7 @@ static void yield_task_fair(struct rq *rq)
> if (entity_eligible(cfs_rq, se)) {
> se->vruntime = se->deadline;
> se->deadline += calc_delta_fair(se->slice, se);
> + avg_vruntime(cfs_rq);
> }
> }
I just tested with this and similar to Prateek, I also still tripped the issue.
I'll give your new patch a spin here in a second.
thanks
-john
On Mon, Mar 30, 2026 at 12:40:45PM -0700, John Stultz wrote:
> On Mon, Mar 30, 2026 at 3:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > This means, that if the two tasks playing leapfrog can reach the
> > critical speed to reach the overflow point inside one tick's worth of
> > time, we're up a creek.
> >
> > If this is indeed the case, then the below should cure things.
> >
> > This also means that running a HZ=100 config will increase the chances
> > of hitting this vs HZ=1000.
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9298f49f842c..c7daaf941b26 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9307,6 +9307,7 @@ static void yield_task_fair(struct rq *rq)
> > if (entity_eligible(cfs_rq, se)) {
> > se->vruntime = se->deadline;
> > se->deadline += calc_delta_fair(se->slice, se);
> > + avg_vruntime(cfs_rq);
> > }
> > }
>
> I just tested with this and similar to Prateek, I also still tripped the issue.
>
> I'll give your new patch a spin here in a second.
Stick both on please :-) AFAICT they're both real, just not convinced
they're what you're hitting.
On Mon, Mar 30, 2026 at 12:43 PM Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Mar 30, 2026 at 12:40:45PM -0700, John Stultz wrote:
> > On Mon, Mar 30, 2026 at 3:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > This means, that if the two tasks playing leapfrog can reach the
> > > critical speed to reach the overflow point inside one tick's worth of
> > > time, we're up a creek.
> > >
> > > If this is indeed the case, then the below should cure things.
> > >
> > > This also means that running a HZ=100 config will increase the chances
> > > of hitting this vs HZ=1000.
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 9298f49f842c..c7daaf941b26 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -9307,6 +9307,7 @@ static void yield_task_fair(struct rq *rq)
> > > if (entity_eligible(cfs_rq, se)) {
> > > se->vruntime = se->deadline;
> > > se->deadline += calc_delta_fair(se->slice, se);
> > > + avg_vruntime(cfs_rq);
> > > }
> > > }
> >
> > I just tested with this and similar to Prateek, I also still tripped the issue.
> >
> > I'll give your new patch a spin here in a second.
>
> Stick both on please :-) AFAICT they're both real, just not convinced
> they're what you're hitting.
Sadly I'm still hitting it with both. This time the stack trace was
different, and it came up through do_nanosleep() from stress-ng-exit
instead of yield.
I'll re-add my debug trace_printks (I dropped them while testing your
patches in case they changed the timing of things) and work to
understand more here.
thanks
-john
[ 6777.071789] BUG: kernel NULL pointer dereference, address: 0000000000000051
[ 6777.076712] #PF: supervisor read access in kernel mode
[ 6777.079767] #PF: error_code(0x0000) - not-present page
[ 6777.082787] PGD 0 P4D 0
[ 6777.084361] Oops: Oops: 0000 [#1] SMP NOPTI
[ 6777.086812] CPU: 37 UID: 0 PID: 531349 Comm: stress-ng-exit-
Tainted: G W 7.0.0-rc1-00001-gb3d99f43c72b-dirty #18
PREEMPT(full)
[ 6777.094026] Tainted: [W]=WARN
[ 6777.095771] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.17.0-debian-1.17.0-1 04/01/2014
[ 6777.100689] RIP: 0010:pick_task_fair+0x6f/0xb0
[ 6777.103239] Code: 85 ff 74 52 48 8b 47 48 48 85 c0 74 d6 80 78 50
00 74 d0 48 89 3c 24 e8 8f e0 ff ff 48 8b 3c 24 be 01 00 00 00 e8 31
77 ff ff <80> 78 51 00 74 c3 ba 2
1 00 00 00 48 89 c6 48 89 df e8 db f1 ff ff
[ 6777.113447] RSP: 0018:ffffc9000f7dbcf0 EFLAGS: 00010082
[ 6777.116283] RAX: 0000000000000000 RBX: ffff8881b976bbc0 RCX: 0000000000000800
[ 6777.119791] RDX: 000000000a071800 RSI: 000000000b719000 RDI: 00004fc5ab7864c9
[ 6777.123608] RBP: ffffc9000f7dbdf0 R08: 0000000000000400 R09: 0000000000000002
[ 6777.127785] R10: 0000000000000025 R11: 0000000000000000 R12: ffff88810adc4200
[ 6777.131937] R13: ffff88810adc4200 R14: ffffffff82ce5b28 R15: ffff8881b976bbc0
[ 6777.135994] FS: 00007fc1c37866c0(0000) GS:ffff888235c2b000(0000)
knlGS:0000000000000000
[ 6777.140449] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6777.143756] CR2: 0000000000000051 CR3: 000000014160f005 CR4: 0000000000370ef0
[ 6777.147819] Call Trace:
[ 6777.149379] <TASK>
[ 6777.150653] pick_next_task_fair+0x3c/0x8c0
[ 6777.153115] __schedule+0x1e8/0x1200
[ 6777.155241] ? do_nanosleep+0x1a/0x170
[ 6777.157336] schedule+0x3d/0x130
[ 6777.159150] do_nanosleep+0x88/0x170
[ 6777.161161] ? find_held_lock+0x2b/0x80
[ 6777.163201] hrtimer_nanosleep+0xba/0x1f0
[ 6777.165481] ? __pfx_hrtimer_wakeup+0x10/0x10
[ 6777.167990] common_nsleep+0x34/0x60
[ 6777.169957] __x64_sys_clock_nanosleep+0xde/0x150
[ 6777.172443] do_syscall_64+0xf3/0x680
[ 6777.174409] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 6777.177176] RIP: 0033:0x7fc1cc92d9ee
[ 6777.179009] Code: 08 0f 85 f5 4b ff ff 49 89 fb 48 89 f0 48 89 d7
48 89 ce 4c 89 c2 4d 89 ca 4c 8b 44 24 08 4c 8b 4c 24 10 4c 89 5c 24
08 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 80 00 00 00 00 48 83
ec 08
Hello Peter, On 3/30/2026 3:40 PM, Peter Zijlstra wrote: > This means, that if the two tasks playing leapfrog can reach the > critical speed to reach the overflow point inside one tick's worth of > time, we're up a creek. > > If this is indeed the case, then the below should cure things. I have been running with this for four hours now and haven't seen any splats or crashes on my setup. I could reliably trigger the warning from __sum_w_vruntime_add() within an hour previously so it is safe to say I was hitting exactly this. Feel free to include: Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> -- Thanks and Regards, Prateek
On Mon, Mar 30, 2026 at 08:07:06PM +0530, K Prateek Nayak wrote: > Hello Peter, > > On 3/30/2026 3:40 PM, Peter Zijlstra wrote: > > This means, that if the two tasks playing leapfrog can reach the > > critical speed to reach the overflow point inside one tick's worth of > > time, we're up a creek. > > > > If this is indeed the case, then the below should cure things. > > I have been running with this for four hours now and haven't seen > any splats or crashes on my setup. I could reliably trigger the > warning from __sum_w_vruntime_add() within an hour previously so > it is safe to say I was hitting exactly this. > > Feel free to include: > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Ha!, excellent. Thanks!
Hello Peter, On 3/30/2026 8:10 PM, Peter Zijlstra wrote: > On Mon, Mar 30, 2026 at 08:07:06PM +0530, K Prateek Nayak wrote: >> Hello Peter, >> >> On 3/30/2026 3:40 PM, Peter Zijlstra wrote: >>> This means, that if the two tasks playing leapfrog can reach the >>> critical speed to reach the overflow point inside one tick's worth of >>> time, we're up a creek. >>> >>> If this is indeed the case, then the below should cure things. >> >> I have been running with this for four hours now and haven't seen >> any splats or crashes on my setup. I could reliably trigger the >> warning from __sum_w_vruntime_add() within an hour previously so >> it is safe to say I was hitting exactly this. >> >> Feel free to include: >> >> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > > Ha!, excellent. Thanks! Turns out I spoke too soon and it did eventually run into that problem again and then eventually crashed in pick_task_fair() later so there is definitely something amiss still :-( I'll throw in some debug traces and get back tomorrow. -- Thanks and Regards, Prateek
On Mon, Mar 30, 2026 at 09:20:01PM +0530, K Prateek Nayak wrote:
> Hello Peter,
>
> On 3/30/2026 8:10 PM, Peter Zijlstra wrote:
> > On Mon, Mar 30, 2026 at 08:07:06PM +0530, K Prateek Nayak wrote:
> >> Hello Peter,
> >>
> >> On 3/30/2026 3:40 PM, Peter Zijlstra wrote:
> >>> This means, that if the two tasks playing leapfrog can reach the
> >>> critical speed to reach the overflow point inside one tick's worth of
> >>> time, we're up a creek.
> >>>
> >>> If this is indeed the case, then the below should cure things.
> >>
> >> I have been running with this for four hours now and haven't seen
> >> any splats or crashes on my setup. I could reliably trigger the
> >> warning from __sum_w_vruntime_add() within an hour previously so
> >> it is safe to say I was hitting exactly this.
> >>
> >> Feel free to include:
> >>
> >> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> >
> > Ha!, excellent. Thanks!
>
> Turns out I spoke too soon and it did eventually run into that
> problem again and then eventually crashed in pick_task_fair()
> later so there is definitely something amiss still :-(
>
> I'll throw in some debug traces and get back tomorrow.
Are there cgroups involved?
I'm thinking that if you have two groups, and the tick always hits the
one group, the other group can go a while without ever getting updated.
But if there's no cgroups, this can't be it.
Anyway, something like the below would rule this out I suppose.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf948db905ed..19b75af31a5a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1304,6 +1304,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
+ if (resched)
+ avg_vruntime(cfs_rq);
if (entity_is_task(curr)) {
/*
@@ -5593,11 +5595,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
- /*
- * Pulls along cfs_rq::zero_vruntime.
- */
- avg_vruntime(cfs_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
Hello Peter,
On 3/31/2026 12:41 AM, Peter Zijlstra wrote:
>> Turns out I spoke too soon and it did eventually run into that
>> problem again and then eventually crashed in pick_task_fair()
>> later so there is definitely something amiss still :-(
>>
>> I'll throw in some debug traces and get back tomorrow.
>
> Are there cgroups involved?
Indeed there are.
>
> I'm thinking that if you have two groups, and the tick always hits the
> one group, the other group can go a while without ever getting updated.
Ack! That could be but I only have one cgroup on top of root cgroup as
far as cpu controllers are concerned so the sched_yield() catching up
the avg_vruntime() should have worked. Either way, I have more data:
When I hit the overflow warning, I have:
se: entity_key(-83106064385) weight(90891264) overflow(-7553615238018032640)
cfs_rq: zero_vruntime(138430453113448575) sum_w_vruntime(0) sum_weight(0)
cfs_rq->curr: entity_key(0) vruntime(138430453113448575) deadline(138430500540426854)
Post avg_vruntime():
se: entity_key(-83106064385) weight(90891264) overflow(-7553615238018032640)
cfs_rq: zero_vruntime(138430453113448575) sum_w_vruntime(0) sum_weight(0)
cfs_rq->curr: entity_key(0) vruntime(138430453113448575) deadline(138430500540426854)
so running avg_vruntime() doesn't make a difference and it seems to be a
genuine case of place_entity() putting the newly woken entity pretty
far back in the timeline. (I forgot to print weights!)
Now, the funny part is, if I leave the system undisturbed, I get a few
of the above warning and nothing interesting but as soon as I do a:
grep bits /sys/kernel/debug/sched/debug
Boom! Pick fails very consistently (Because of copy-pasta this too
doesn't contain weights):
NULL Pick!
cfs_rq: zero_vruntime(89029406877992895) sum_w_vruntime(-135049248768) sum_weight(1048576)
cfs_rq->curr: entity_key(149162) vruntime(89029406878142057) deadline(89029406976268435)
queued se: entity_key(-123294) vruntime(89029406877869601) deadline(89029406880669601)
after avg_vruntime()!
cfs_rq: zero_vruntime(89029406877868114) sum_w_vruntime(-4206886912) sum_weight(1048576)
cfs_rq->curr: entity_key(273943) vruntime(89029406878142057) deadline(89029406976268435)
queued se: entity_key(1487) vruntime(89029406877869601) deadline(89029406880669601)
NULL Pick!
The above doesn't recover after an avg_vruntime(). Btw I'm running:
nice -n 19 stress-ng --yield 32 -t 1000000s&
while true; do perf bench sched messaging -p -t -l 100000 -g 16; done
Nice 19 is to get a large deadline and keep catching up to that deadline
at every yield to see if that makes any difference.
>
> But if there's no cgroups, this can't be it.
>
> Anyway, something like the below would rule this out I suppose.
I'll add that in and see if it makes a difference. I'll add in
weights and look at place_entity() to see if we have anything
interesting going on there.
Thank you for taking a look.
--
Thanks and Regards,
Prateek
On Tue, Mar 31, 2026 at 06:08:27AM +0530, K Prateek Nayak wrote: > The above doesn't recover after a avg_vruntime(). Btw I'm running: > > nice -n 19 stress-ng --yield 32 -t 1000000s& > while true; do perf bench sched messaging -p -t -l 100000 -g 16; done And you're running that on a 16 cpu machine / vm ?
On Tue, Mar 31, 2026 at 09:08:23AM +0200, Peter Zijlstra wrote: > On Tue, Mar 31, 2026 at 06:08:27AM +0530, K Prateek Nayak wrote: > > > The above doesn't recover after a avg_vruntime(). Btw I'm running: > > > > nice -n 19 stress-ng --yield 32 -t 1000000s& > > while true; do perf bench sched messaging -p -t -l 100000 -g 16; done > > And you're running that on a 16 cpu machine / vm ? W00t, it went b00m. Ok, let me go add some tracing.
On 3/31/2026 12:44 PM, Peter Zijlstra wrote: > On Tue, Mar 31, 2026 at 09:08:23AM +0200, Peter Zijlstra wrote: >> On Tue, Mar 31, 2026 at 06:08:27AM +0530, K Prateek Nayak wrote: >> >>> The above doesn't recover after a avg_vruntime(). Btw I'm running: >>> >>> nice -n 19 stress-ng --yield 32 -t 1000000s& >>> while true; do perf bench sched messaging -p -t -l 100000 -g 16; done >> >> And you're running that on a 16 cpu machine / vm ? > > W00t, it went b00m. Ok, let me go add some tracing. I could only repro it on baremetal after few hours but good to know it exploded effortlessly on your end! Was this a 16vCPU VM with the same recipe? -- Thanks and Regards, Prateek
On Tue, Mar 31, 2026 at 02:19:54PM +0530, K Prateek Nayak wrote: > On 3/31/2026 12:44 PM, Peter Zijlstra wrote: > > On Tue, Mar 31, 2026 at 09:08:23AM +0200, Peter Zijlstra wrote: > >> On Tue, Mar 31, 2026 at 06:08:27AM +0530, K Prateek Nayak wrote: > >> > >>> The above doesn't recover after a avg_vruntime(). Btw I'm running: > >>> > >>> nice -n 19 stress-ng --yield 32 -t 1000000s& > >>> while true; do perf bench sched messaging -p -t -l 100000 -g 16; done > >> > >> And you're running that on a 16 cpu machine / vm ? > > > > W00t, it went b00m. Ok, let me go add some tracing. > > I could only repro it on baremetal after few hours but good to know it > exploded effortlessly on your end! Was this a 16vCPU VM with the same > recipe? Yep. It almost insta triggers. Trying to make sense of the traces now.
On Tue, Mar 31, 2026 at 11:29:09AM +0200, Peter Zijlstra wrote:
> On Tue, Mar 31, 2026 at 02:19:54PM +0530, K Prateek Nayak wrote:
> > On 3/31/2026 12:44 PM, Peter Zijlstra wrote:
> > > On Tue, Mar 31, 2026 at 09:08:23AM +0200, Peter Zijlstra wrote:
> > >> On Tue, Mar 31, 2026 at 06:08:27AM +0530, K Prateek Nayak wrote:
> > >>
> > >>> The above doesn't recover after a avg_vruntime(). Btw I'm running:
> > >>>
> > >>> nice -n 19 stress-ng --yield 32 -t 1000000s&
> > >>> while true; do perf bench sched messaging -p -t -l 100000 -g 16; done
> > >>
> > >> And you're running that on a 16 cpu machine / vm ?
> > >
> > > W00t, it went b00m. Ok, let me go add some tracing.
> >
> > I could only repro it on baremetal after few hours but good to know it
> > exploded effortlessly on your end! Was this a 16vCPU VM with the same
> > recipe?
>
> Yep. It almost insta triggers. Trying to make sense of the traces now.
So the thing I'm seeing is that avg_vruntime() is behind of where it
should be, not much, but every time it goes *boom* it is just far enough
behind that no entity is eligible.
sched-messaging-2192 [039] d..2. 77.136100: pick_task_fair: cfs_rq(39:ff4a5bc7bebeb680): sum_w_vruntime(194325882) sum_weight(5120) zero_vruntime(105210161141318) avg_vruntime(105210161179272)
sched-messaging-2192 [039] d..2. 77.136100: pick_task_fair: T se(ff4a5bc79040c940): vruntime(105210161556539) deadline(105210164099443) weight(1048576) -- sched-messaging:2340
sched-messaging-2192 [039] d..2. 77.136101: pick_task_fair: T se(ff4a5bc794ce98c0): vruntime(105210161435669) deadline(105210164235669) weight(1048576) -- sched-messaging:2212
sched-messaging-2192 [039] d..2. 77.136101: pick_task_fair: T se(ff4a5bc7952d3100): vruntime(105210161580240) deadline(105210164380240) weight(1048576) -- sched-messaging:2381
sched-messaging-2192 [039] d..2. 77.136102: pick_task_fair: T se(ff4a5bc794c318c0): vruntime(105210161818264) deadline(105210164518004) weight(1048576) -- sched-messaging:2306
sched-messaging-2192 [039] d..2. 77.136103: pick_task_fair: T se(ff4a5bc796b4b100): vruntime(105210161831546) deadline(105210164631546) weight(1048576) -- sched-messaging:2551
sched-messaging-2192 [039] d..2. 77.136104: pick_task_fair: min_lag(-652274) max_lag(0) limit(38000000)
sched-messaging-2192 [039] d..2. 77.136104: pick_task_fair: picked NULL!!
If we compute the avg_vruntime() manually, then we get a
sum_w_vruntime contribution for each task:
(105210161556539-105210161141318)*1024
425186304
(105210161435669-105210161141318)*1024
301415424
(105210161580240-105210161141318)*1024
449456128
(105210161818264-105210161141318)*1024
693192704
(105210161831546-105210161141318)*1024
706793472
Which combined is:
425186304+301415424+449456128+693192704+706793472
2576044032
NOTE: this is different (more) from sum_w_vruntime(194325882).
So divided, and added to zero gives:
2576044032/5120
503133.60000000000000000000
105210161141318+503133.60000000000000000000
105210161644451.60000000000000000000
Which is where avg_vruntime() *should* be, except it ends up being at:
avg_vruntime(105210161179272), which then results in no eligible entities.
Note that with the computed avg, the first 3 entities would be eligible.
This suggests I go build a parallel infrastructure to double check when
and where this goes sideways.
... various attempts later ....
sched-messaging-1021 [009] d..2. 34.483159: update_curr: T<=> se(ff37d0bcd52718c0): vruntime(56921690782736, E) deadline(56921693563331) weight(1048576) -- sched-messaging:1021
sched-messaging-1021 [009] d..2. 34.483160: __avg_vruntime: cfs_rq(9:ff37d0bcfe46b680): delta(-48327) sum_w_vruntime(811471242) zero_vruntime(56921691575188)
sched-messaging-1021 [009] d..2. 34.483160: pick_task_fair: cfs_rq(9:ff37d0bcfe46b680): sum_w_vruntime(811471242) sum_weight(6159) zero_vruntime(56921691575188) avg_vruntime(56921691706941)
sched-messaging-1021 [009] d..2. 34.483160: pick_task_fair: T< se(ff37d0bcd5c6c940): vruntime(56921691276707, E) deadline(56921694076707) weight(1048576) -- sched-messaging:1276
sched-messaging-1021 [009] d..2. 34.483161: pick_task_fair: T se(ff37d0bcd56f98c0): vruntime(56921691917863) deadline(56921694079320) weight(1048576) -- sched-messaging:1201
sched-messaging-1021 [009] d..2. 34.483162: pick_task_fair: T se(ff37d0bcd5344940): vruntime(56921691340323, E) deadline(56921694140323) weight(1048576) -- sched-messaging:1036
sched-messaging-1021 [009] d..2. 34.483163: pick_task_fair: T se(ff37d0bcd56dc940): vruntime(56921691637185, E) deadline(56921694403038) weight(1048576) -- sched-messaging:1179
sched-messaging-1021 [009] d..2. 34.483164: pick_task_fair: T se(ff37d0bcd43eb100): vruntime(56921691629067, E) deadline(56921694429067) weight(1048576) -- sched-messaging:786
sched-messaging-1021 [009] d..2. 34.483164: pick_task_fair: T se(ff37d0bcd5d80080): vruntime(56921691810771) deadline(56921694610771) weight(1048576) -- sched-messaging:1291
sched-messaging-1021 [009] d..2. 34.483165: pick_task_fair: T se(ff37d0bcd027b100): vruntime(56921734696810) deadline(56921917287562) weight(15360) -- stress-ng-yield:693
sched-messaging-1021 [009] d..2. 34.483165: pick_task_fair: min_lag(-42989869) max_lag(430234) limit(38000000)
sched-messaging-1021 [009] d..2. 34.483166: pick_task_fair: swv(811471242)
sched-messaging-1021 [009] d..2. 34.483167: __dequeue_entity: cfs_rq(9:ff37d0bcfe46b680): sum_w_vruntime(1117115786) zero_vruntime(56921691575188)
set_next_task(1276):
swv -= key * weight
811471242 - (56921691276707-56921691575188)*1024
1117115786
OK
sched-messaging-1276 [009] d.h2. 34.483168: update_curr: T<=> se(ff37d0bcd5c6c940): vruntime(56921691285759, E) deadline(56921694076707) weight(1048576) -- sched-messaging:1276
sched-messaging-1276 [009] d.h2. 34.483169: __avg_vruntime: cfs_rq(9:ff37d0bcfe46b680): delta(22156) sum_w_vruntime(319064896) zero_vruntime(56921691597344)
swv -= sw * delta
1117115786 - 5135 * 22156
1003344726
WTF!?!
zv += delta
56921691575188 + 22156
56921691597344
OK
sched-messaging-1276 [009] d.h2. 34.483169: place_entity: T< se(ff37d0bcd52718c0): vruntime(56921690673139, E) deadline(56921693473139) weight(1048576) -- sched-messaging:1021
sched-messaging-1276 [009] d.h2. 34.483170: __enqueue_entity: cfs_rq(9:ff37d0bcfe46b680): sum_w_vruntime(-627321024) zero_vruntime(56921691597344)
swv += key * weight
Should be:
1003344726 + (56921690673139 - 56921691597344) * 1024
56958806 [*]
But is:
319064896 + (56921690673139 - 56921691597344) * 1024
-627321024
Consistent, but wrong
sched-messaging-1276 [009] d..2. 34.483173: update_curr: T<=> se(ff37d0bcd5c6c940): vruntime(56921691289762, E) deadline(56921694076707) weight(1048576) -- sched-messaging:1276
sched-messaging-1276 [009] d..2. 34.483173: __avg_vruntime: cfs_rq(9:ff37d0bcfe46b680): delta(571) sum_w_vruntime(180635073) zero_vruntime(56921691466161)
This would be dequeue(1276) update_entity_lag(), but the numbers make no sense...
swv -= sw * delta
-627321024 - 6159 * 571
-630837813 != 180635073
zv += delta
56921691597344 + 571
56921691597915 != 56921691466161
Also, the actual delta would be (zero_vruntime - prev zero_vruntime):
56921691466161-56921691597344
-131183
At which point we can construct the swv value from where we left of [*]
56958806 - -131183 * 6159
864914903
But the actual state makes no frigging sense....
sched-messaging-1276 [009] d..2. 34.483174: pick_task_fair: cfs_rq(9:ff37d0bcfe46b680): sum_w_vruntime(180635073) sum_weight(6159) zero_vruntime(56921691466161) avg_vruntime(56921691495489)
sched-messaging-1276 [009] d..2. 34.483174: pick_task_fair: T< se(ff37d0bcd52718c0): vruntime(56921690673139, E) deadline(56921693473139) weight(1048576) -- sched-messaging:1021
sched-messaging-1276 [009] d..2. 34.483175: pick_task_fair: T se(ff37d0bcd56f98c0): vruntime(56921691917863) deadline(56921694079320) weight(1048576) -- sched-messaging:1201
sched-messaging-1276 [009] d..2. 34.483175: pick_task_fair: T se(ff37d0bcd5344940): vruntime(56921691340323, E) deadline(56921694140323) weight(1048576) -- sched-messaging:1036
sched-messaging-1276 [009] d..2. 34.483176: pick_task_fair: T se(ff37d0bcd56dc940): vruntime(56921691637185) deadline(56921694403038) weight(1048576) -- sched-messaging:1179
sched-messaging-1276 [009] d..2. 34.483177: pick_task_fair: T se(ff37d0bcd43eb100): vruntime(56921691629067) deadline(56921694429067) weight(1048576) -- sched-messaging:786
sched-messaging-1276 [009] d..2. 34.483177: pick_task_fair: T se(ff37d0bcd5d80080): vruntime(56921691810771) deadline(56921694610771) weight(1048576) -- sched-messaging:1291
sched-messaging-1276 [009] d..2. 34.483178: pick_task_fair: T se(ff37d0bcd027b100): vruntime(56921734696810) deadline(56921917287562) weight(15360) -- stress-ng-yield:693
sched-messaging-1276 [009] d..2. 34.483178: pick_task_fair: min_lag(-43201321) max_lag(822350) limit(38000000)
sched-messaging-1276 [009] d..2. 34.483178: pick_task_fair: swv(864914903)
sched-messaging-1276 [009] d..2. 34.483179: pick_task_fair: FAIL
Generated with the below patch on top of -rc6.
---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf948db905ed..5462aeac1c45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -678,6 +678,11 @@ sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->sum_w_vruntime += key * weight;
cfs_rq->sum_weight += weight;
+
+ trace_printk("cfs_rq(%d:%px): sum_w_vruntime(%Ld) zero_vruntime(%Ld)\n",
+ rq_of(cfs_rq)->cpu, cfs_rq,
+ cfs_rq->sum_w_vruntime,
+ cfs_rq->zero_vruntime);
}
static void
@@ -688,6 +693,11 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->sum_w_vruntime -= key * weight;
cfs_rq->sum_weight -= weight;
+
+ trace_printk("cfs_rq(%d:%px): sum_w_vruntime(%Ld) zero_vruntime(%Ld)\n",
+ rq_of(cfs_rq)->cpu, cfs_rq,
+ cfs_rq->sum_w_vruntime,
+ cfs_rq->zero_vruntime);
}
static inline
@@ -698,6 +708,12 @@ void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
*/
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
cfs_rq->zero_vruntime += delta;
+
+ trace_printk("cfs_rq(%d:%px): delta(%Ld) sum_w_vruntime(%Ld) zero_vruntime(%Ld)\n",
+ rq_of(cfs_rq)->cpu, cfs_rq,
+ delta,
+ cfs_rq->sum_w_vruntime,
+ cfs_rq->zero_vruntime);
}
/*
@@ -712,7 +728,7 @@ void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
* This means it is one entry 'behind' but that puts it close enough to where
* the bound on entity_key() is at most two lag bounds.
*/
-u64 avg_vruntime(struct cfs_rq *cfs_rq)
+static u64 __avg_vruntime(struct cfs_rq *cfs_rq, bool update)
{
struct sched_entity *curr = cfs_rq->curr;
long weight = cfs_rq->sum_weight;
@@ -743,9 +759,17 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
delta = curr->vruntime - cfs_rq->zero_vruntime;
}
- update_zero_vruntime(cfs_rq, delta);
+ if (update) {
+ update_zero_vruntime(cfs_rq, delta);
+ return cfs_rq->zero_vruntime;
+ }
- return cfs_rq->zero_vruntime;
+ return cfs_rq->zero_vruntime + delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ return __avg_vruntime(cfs_rq, true);
}
static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
@@ -1078,11 +1102,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
return best;
}
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
- return __pick_eevdf(cfs_rq, true);
-}
-
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -1279,6 +1298,8 @@ s64 update_curr_common(struct rq *rq)
return update_se(rq, &rq->donor->se);
}
+static void print_se(struct cfs_rq *cfs_rq, struct sched_entity *se, bool pick);
+
/*
* Update the current task's runtime statistics.
*/
@@ -1304,6 +1325,10 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
+ if (resched)
+ avg_vruntime(cfs_rq);
+
+ print_se(cfs_rq, curr, true);
if (entity_is_task(curr)) {
/*
@@ -3849,6 +3874,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
bool rel_vprot = false;
u64 vprot;
+ print_se(cfs_rq, se, true);
+
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
@@ -3896,6 +3923,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
}
+
+ print_se(cfs_rq, se, true);
}
static void reweight_task_fair(struct rq *rq, struct task_struct *p,
@@ -5251,6 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
+ print_se(cfs_rq, se, true);
return;
}
@@ -5266,6 +5296,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* EEVDF: vd_i = ve_i + r_i/w_i
*/
se->deadline = se->vruntime + vslice;
+ print_se(cfs_rq, se, true);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5529,31 +5560,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
-
-/*
- * Pick the next process, keeping these things in mind, in this order:
- * 1) keep things fair between processes/task groups
- * 2) pick the "next" process, since someone really wants that to run
- * 3) pick the "last" process, for cache locality
- * 4) do not run the "skip" process, if something else is available
- */
-static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se;
-
- se = pick_eevdf(cfs_rq);
- if (se->sched_delayed) {
- dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
- /*
- * Must not reference @se again, see __block_task().
- */
- return NULL;
- }
- return se;
-}
-
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
@@ -8942,6 +8948,123 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
resched_curr_lazy(rq);
}
+static __always_inline
+void print_se(struct cfs_rq *cfs_rq, struct sched_entity *se, bool pick)
+{
+ bool curr = (se == cfs_rq->curr);
+ bool el = entity_eligible(cfs_rq, se);
+ bool prot = protect_slice(se);
+ bool task = false;
+ char *comm = NULL;
+ int pid = -1;
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ task = true;
+ comm = p->comm;
+ pid = p->pid;
+ }
+
+ trace_printk("%c%c%c%c se(%px): vruntime(%Ld%s) deadline(%Ld) weight(%ld) -- %s:%d\n",
+ task ? 'T' : '@',
+ pick ? '<' : ' ',
+ curr && prot ? '=' : ' ',
+ curr ? '>' : ' ',
+ se, se->vruntime, el ? ", E" : "",
+ se->deadline, se->load.weight,
+ comm, pid);
+}
+
+static struct sched_entity *pick_debug(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *pick = __pick_eevdf(cfs_rq, true);
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 min_lag = 0, max_lag = 0;
+ u64 runtime, weight, z_vruntime, avg;
+ u64 swv = 0;
+
+ s64 limit = 10*(sysctl_sched_base_slice + TICK_NSEC);
+
+ if (curr && !curr->on_rq)
+ curr = NULL;
+
+ runtime = cfs_rq->sum_w_vruntime;
+ weight = cfs_rq->sum_weight;
+ z_vruntime = cfs_rq->zero_vruntime;
+ barrier();
+ avg = __avg_vruntime(cfs_rq, false);
+
+ trace_printk("cfs_rq(%d:%px): sum_w_vruntime(%Ld) sum_weight(%Ld) zero_vruntime(%Ld) avg_vruntime(%Ld)\n",
+ rq_of(cfs_rq)->cpu, cfs_rq,
+ runtime, weight,
+ z_vruntime, avg);
+
+ for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+ node; node = rb_next(node)) {
+ struct sched_entity *se = __node_2_se(node);
+ if (se == curr)
+ curr = NULL;
+ print_se(cfs_rq, se, pick == se);
+
+ swv += (se->vruntime - z_vruntime) * scale_load_down(se->load.weight);
+
+ s64 vlag = avg - se->vruntime;
+ min_lag = min(min_lag, vlag);
+ max_lag = max(max_lag, vlag);
+ }
+
+ if (curr) {
+ print_se(cfs_rq, curr, pick == curr);
+
+ s64 vlag = avg - curr->vruntime;
+ min_lag = min(min_lag, vlag);
+ max_lag = max(max_lag, vlag);
+ }
+
+ trace_printk(" min_lag(%Ld) max_lag(%Ld) limit(%Ld)\n", min_lag, max_lag, limit);
+ trace_printk(" swv(%Ld)\n", swv);
+
+ if (swv != runtime) {
+ trace_printk("FAIL\n");
+ tracing_off();
+ printk("FAIL FAIL FAIL!!!\n");
+ }
+
+// WARN_ON_ONCE(min_lag < -limit || max_lag > limit);
+
+ if (!pick) {
+ trace_printk("picked NULL!!\n");
+ tracing_off();
+ printk("FAIL FAIL FAIL!!!\n");
+ return __pick_first_entity(cfs_rq);
+ }
+
+ return pick;
+}
+
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
+static struct sched_entity *
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se;
+
+ se = pick_debug(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ /*
+ * Must not reference @se again, see __block_task().
+ */
+ return NULL;
+ }
+ return se;
+}
+
static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
struct sched_entity *se;
@@ -9129,6 +9252,7 @@ static void yield_task_fair(struct rq *rq)
if (entity_eligible(cfs_rq, se)) {
se->vruntime = se->deadline;
se->deadline += calc_delta_fair(se->slice, se);
+ avg_vruntime(cfs_rq);
}
}
On Tue, Mar 31, 2026 at 02:20:35PM +0200, Peter Zijlstra wrote:
> WTF!?!
I'm thinking this might help... I'll try once I'm back home again.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b24f40f05019..15bf45b6f912 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -902,6 +902,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ u64 avruntime;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -925,6 +926,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
right_vruntime = last->vruntime;
zero_vruntime = cfs_rq->zero_vruntime;
+ avruntime = avg_vruntime(cfs_rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -934,7 +936,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
- SPLIT_NS(avg_vruntime(cfs_rq)));
+ SPLIT_NS(avruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
On Tue, Mar 31, 2026 at 9:14 AM Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Mar 31, 2026 at 02:20:35PM +0200, Peter Zijlstra wrote:
>
> I'm thinking this might help... I'll try once I'm back home again.
>
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index b24f40f05019..15bf45b6f912 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -902,6 +902,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
> void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> {
> s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
> + u64 avruntime;
> struct sched_entity *last, *first, *root;
> struct rq *rq = cpu_rq(cpu);
> unsigned long flags;
> @@ -925,6 +926,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> if (last)
> right_vruntime = last->vruntime;
> zero_vruntime = cfs_rq->zero_vruntime;
> + avruntime = avg_vruntime(cfs_rq);
> raw_spin_rq_unlock_irqrestore(rq, flags);
>
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
> @@ -934,7 +936,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
> SPLIT_NS(zero_vruntime));
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
> - SPLIT_NS(avg_vruntime(cfs_rq)));
> + SPLIT_NS(avruntime));
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
> SPLIT_NS(right_vruntime));
> spread = right_vruntime - left_vruntime;
>
This on top of your two previous changes has run for 5 hours for me
now, which is usually where I'd call things "good" when bisecting.
I'm going to leave it overnight, but tentatively:
Tested-by: John Stultz <jstultz@google.com>
thanks
-john
On 3/31/2026 9:44 PM, Peter Zijlstra wrote: > @@ -925,6 +926,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) > if (last) > right_vruntime = last->vruntime; > zero_vruntime = cfs_rq->zero_vruntime; > + avruntime = avg_vruntime(cfs_rq); > raw_spin_rq_unlock_irqrestore(rq, flags); Ah! Didn't notice we dropped the lock as soon as we are done reading the necessary values. Makes sense why reading the debugfs under heavy load crashed. Ran 100 loops of reading the debug file while running stress-ng + sched-messaging and I haven't see any crashes yet so feel free to include: Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > > SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", -- Thanks and Regards, Prateek
On 3/31/2026 6:08 AM, K Prateek Nayak wrote:
>> I'm thinking that if you have two groups, and the tick always hits the
>> one group, the other group can go a while without ever getting updated.
>
> Ack! That could be but I only have one cgroup on top of root cgroup as
> far as cpu controllers are concerned so the sched_yield() catching up
> the avg_vruntime() should have worked. Either way, I have more data:
>
> When I hit the overflow warning, I have:
>
> se: entity_key(-83106064385) weight(90891264) overflow(-7553615238018032640)
> cfs_rq: zero_vruntime(138430453113448575) sum_w_vruntime(0) sum_weight(0)
> cfs_rq->curr: entity_key(0) vruntime(138430453113448575) deadline(138430500540426854)
> Post avg_vruntime():
> se: entity_key(-83106064385) weight(90891264) overflow(-7553615238018032640)
> cfs_rq: zero_vruntime(138430453113448575) sum_w_vruntime(0) sum_weight(0)
> cfs_rq->curr: entity_key(0) vruntime(138430453113448575) deadline(138430500540426854)
>
> so running avg_vruntime() doesn't make a difference and it seems to be a
> genuine case of place_entity() putting the newly woken entity pretty
> far back in the timeline. (I forgot to print weights!)
>
> Now, the funny part is, if I leave the system undisturbed, I get a few
> of the above warning and nothing interesting but as soon as I do a:
>
> grep bits /sys/kernel/debug/sched/debug
>
> Boom! Pick fails very consistently (Because of copy-pasta this too
> doesn't contain weights):
>
> NULL Pick!
> cfs_rq: zero_vruntime(89029406877992895) sum_w_vruntime(-135049248768) sum_weight(1048576)
> cfs_rq->curr: entity_key(149162) vruntime(89029406878142057) deadline(89029406976268435)
> queued se: entity_key(-123294) vruntime(89029406877869601) deadline(89029406880669601)
>
> after avg_vruntime()!
> cfs_rq: zero_vruntime(89029406877868114) sum_w_vruntime(-4206886912) sum_weight(1048576)
> cfs_rq->curr: entity_key(273943) vruntime(89029406878142057) deadline(89029406976268435)
> queued se: entity_key(1487) vruntime(89029406877869601) deadline(89029406880669601)
>
> NULL Pick!
>
> The above doesn't recover after an avg_vruntime(). Btw I'm running:
>
> nice -n 19 stress-ng --yield 32 -t 1000000s&
> while true; do perf bench sched messaging -p -t -l 100000 -g 16; done
>
> Nice 19 is to get a large deadline and keep catching up to that deadline
> at every yield to see if that makes any difference.
>
>>
>> But if there's no cgroups, this can't be it.
>>
>> Anyway, something like the below would rule this out I suppose.
>
> I'll add that in and see if it makes a difference. I'll add in
> weights and look at place_entity() to see if we have anything
> interesting going on there.
Still trips the issue :-( This time I have logs with weights.
For the warning:
se: entity_key(-72358759771) weight(90891264) warning_mul(-6576779137058540544) vlag(39009) delayed?(0)
cfs_rq: zero_vruntime(18695504496613622) sum_w_vruntime(0) sum_weight(0)
cfs_rq->curr: entity_key(0) vruntime(18695504496613622) deadline(18695540588878716) weight(49)
Post avg_vruntime():
se: entity_key(-72358759771) weight(90891264) overflow?(-6576779137058540544)
cfs_rq: zero_vruntime(18695504496613622) sum_w_vruntime(0) sum_weight(0)
cfs_rq->curr: entity_key(0) vruntime(18695504496613622) deadline(18695540588878716) weight(49)
And the NULL pick while reading debugfs (probably something in the initial
task wakeup path that trips it?):
NULL Pick!
cfs_rq: zero_vruntime(21126236598445952) sum_w_vruntime(-1074569456640) sum_weight(15360)
cfs_rq->curr: entity_key(69958950) vruntime(21126236668404902) deadline(21126236859551568) weight(15360)
queued se: entity_key(32498584) vruntime(21126236630944536) deadline(21126236822091202) weight(15360)
After avg_vruntime():
cfs_rq: zero_vruntime(21126236598445952) sum_w_vruntime(-1074569456640) sum_weight(15360)
cfs_rq->curr: entity_key(69958950) vruntime(21126236668404902) deadline(21126236859551568) weight(15360)
queued se: entity_key(32498584) vruntime(21126236630944536) deadline(21126236822091202) weight(15360)
NULL Pick!
Updated zero_vruntime is behind that of either of the queued entities.
Now that I have a reliable trigger for the crash, I'll just start
tracing everything before I run grep (although I suspect something may
have gone bad a long time ago but we can be hopeful)
--
Thanks and Regards,
Prateek
On Fri, Mar 27, 2026 at 10:44:28PM -0700, John Stultz wrote: > The backtrace is usually due to stress-ng stress-ng-yield test: What actual stress-ng parameters are that, and what kind of VM are you running?
On Mon, Mar 30, 2026 at 2:43 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Fri, Mar 27, 2026 at 10:44:28PM -0700, John Stultz wrote: > > > The backtrace is usually due to stress-ng stress-ng-yield test: > > What actual stress-ng parameters are that, and what kind of VM are you > running? Just running in a loop: stress-ng --class scheduler --all 1 --timeout 300 I usually also run with locktorture using the following boot args: torture.random_shuffle=1 locktorture.writer_fifo=1 locktorture.torture_type=mutex_lock locktorture.nested_locks=8 locktorture.rt_boost=1 locktorture.rt_boost_factor=50 locktorture.stutter=0 As well as cyclictest -t -p99 and my prio-inversion-demo in a loop in the background, but I suspect they aren't necessary here. This all on a nested VM w/ 64 vCPUs (host has 96 vCPUs). Over the weekend I did run with CONFIG_SCHED_PROXY_EXEC disabled, and still tripped the problem. I've added some trace_printks back in and am working to get more details. thanks -john
On Fri, 27 Mar 2026 22:44:28 -0700 John Stultz <jstultz@google.com> wrote: > I'll continue digging next week on this, but wanted to share in case > anyone else sees something obvious first. FYI, you can add a trace_printk() around all the areas that assign cfs_rq->curr, and if you use the persistent ring buffer you can retrieve the output of all the events that lead up to the crash. Add to the kernel command line: reserve_mem=100M:12M:trace trace_instance=boot_mapped^traceprintk^traceoff@trace And before starting your testing: echo 1 > /sys/kernel/tracing/instances/boot_mapped/tracing_on If all goes well, after the crash you should see the output in: cat /sys/kernel/tracing/instances/boot_mapped/trace -- Steve
On Sat, Mar 28, 2026 at 10:03 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Fri, 27 Mar 2026 22:44:28 -0700 > John Stultz <jstultz@google.com> wrote: > > > I'll continue digging next week on this, but wanted to share in case > > anyone else sees something obvious first. > > FYI, you can add a trace_printk() around all the areas that assign > cfs_rq->curr, and if you use the persistent ring buffer you can retrieve > the output of all the events that lead up to the crash. > > Add to the kernel command line: > > reserve_mem=100M:12M:trace trace_instance=boot_mapped^traceprintk^traceoff@trace > > And before starting your testing: > > echo 1 > /sys/kernel/tracing/instances/boot_mapped/tracing_on > > If all goes well, after the crash you should see the output in: > > cat /sys/kernel/tracing/instances/boot_mapped/trace Nice. I've actually been using ftrace_dump_on_oops as I log the VM serial console to a file and don't have to remember to go fetch it out the next time. But this is helpful when its on real hardware (dumping the trace over serial console can take awhile and I've had cases where the hardware watchdogs reboot the device midway when the trace is too large). thanks -john
On Mon, 30 Mar 2026 10:58:54 -0700 John Stultz <jstultz@google.com> wrote: > Nice. I've actually been using ftrace_dump_on_oops as I log the VM > serial console to a file and don't have to remember to go fetch it out > the next time. But this is helpful when its on real hardware (dumping > the trace over serial console can take awhile and I've had cases where > the hardware watchdogs reboot the device midway when the trace is too > large). > I use it on VMs all the time. It works nicely with qemu. In the near future we will have a backup buffer implementation too, where you don't need to remember to start it! reserve_mem=100M:12M:trace trace_instance=boot_mapped^traceprintk^@trace trace_instance=backup=boot_mapped Where on early boot up, a "backup" instance is created and copies the persistent ring buffer to a read-only temporary buffer that doesn't get overwritten. Then you can keep the persistent ring buffer always on, and if there's a crash, just look at the "backup" instance to see what happened. -- Steve
On 19.02.26 08:58, Peter Zijlstra wrote:
> It turns out that zero_vruntime tracking is broken when there is but a single
> task running. Current update paths are through __{en,de}queue_entity(), and
> when there is but a single task, pick_next_task() will always return that one
> task, and put_prev_set_next_task() will end up in neither function.
Tried hard but I don't get the last clause.
[...]
> While there, optimize avg_vruntime() by noting that the average of one value is
> rather trivial to compute.
>
nitpick:
> Test case:
> # taskset -c -p 1 $$
> # taskset -c 2 bash -c 'while :; do :; done&'
> # cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>"
^
|
Works
only w/o this comma for me.
[...]
On Mon, Feb 23, 2026 at 02:09:52PM +0100, Dietmar Eggemann wrote:
> On 19.02.26 08:58, Peter Zijlstra wrote:
> > It turns out that zero_vruntime tracking is broken when there is but a single
> > task running. Current update paths are through __{en,de}queue_entity(), and
> > when there is but a single task, pick_next_task() will always return that one
> > task, and put_prev_set_next_task() will end up in neither function.
>
> Tried hard but I don't get the last clause.
When prev==next, then put_prev_set_next_task() bails out and we'll never
hit __enqueue_entity()/__dequeue_entity().
> [...]
>
> > While there, optimize avg_vruntime() by noting that the average of one value is
> > rather trivial to compute.
> >
>
> nitpick:
>
> > Test case:
> > # taskset -c -p 1 $$
> > # taskset -c 2 bash -c 'while :; do :; done&'
> > # cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>"
> ^
> |
> Works
> only w/o this comma for me.
Hmm, weird, for me:
cat /sys/kernel/debug/sched/debug | grep ^cpu#2
cpu#2, 2500.000 MHz
cpu#20, 2500.000 MHz
cpu#21, 2500.000 MHz
cpu#22, 2500.000 MHz
cpu#23, 2500.000 MHz
cpu#24, 2500.000 MHz
cpu#25, 2500.000 MHz
cpu#26, 2500.000 MHz
cpu#27, 2500.000 MHz
cpu#28, 2500.000 MHz
cpu#29, 2500.000 MHz
And that ',' was added because otherwise it would match the full 20
range of CPUs too, which was not intended ;-)
On 23.02.26 15:15, Peter Zijlstra wrote:
> On Mon, Feb 23, 2026 at 02:09:52PM +0100, Dietmar Eggemann wrote:
>> On 19.02.26 08:58, Peter Zijlstra wrote:
>>> It turns out that zero_vruntime tracking is broken when there is but a single
>>> task running. Current update paths are through __{en,de}queue_entity(), and
>>> when there is but a single task, pick_next_task() will always return that one
>>> task, and put_prev_set_next_task() will end up in neither function.
>>
>> Tried hard but I don't get the last clause.
>
> When prev==next, then put_prev_set_next_task() bails out and we'll never
> hit __enqueue_entity()/__dequeue_entity().
Ah, I see. But IMHO put_prev_set_next_task() is never called for the
testcase below (CPU hog is prev and next in put_prev_set_next_task())
But (prev != p) in pick_next_task_fair() is avoided ?
pick_next_task_fair()
if (prev != p) {
while (!is_same_group())
put_prev_entity()
set_next_entity()
put_prev_entity()
set_next_entity()
}
[...]
>>> Test case:
>>> # taskset -c -p 1 $$
>>> # taskset -c 2 bash -c 'while :; do :; done&'
>>> # cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>"
>> ^
>> |
>> Works
>> only w/o this comma for me.
>
> Hmm, weird, for me:
>
> cat /sys/kernel/debug/sched/debug | grep ^cpu#2
> cpu#2, 2500.000 MHz
> cpu#20, 2500.000 MHz
> cpu#21, 2500.000 MHz
> cpu#22, 2500.000 MHz
> cpu#23, 2500.000 MHz
> cpu#24, 2500.000 MHz
> cpu#25, 2500.000 MHz
> cpu#26, 2500.000 MHz
> cpu#27, 2500.000 MHz
> cpu#28, 2500.000 MHz
> cpu#29, 2500.000 MHz
>
> And that ',' was added because otherwise it would match the full 20
> range of CPUs too, which was not intended ;-)
Ah, the ', X Mhz' thing is X86 specific.
awk '/^cpu#/ {P=0} /^cpu#2(,|$)/ {P=1} {if (P) print $0}'
works on Arm64 as well.
On Tue, Feb 24, 2026 at 09:53:06AM +0100, Dietmar Eggemann wrote:
> On 23.02.26 15:15, Peter Zijlstra wrote:
> > On Mon, Feb 23, 2026 at 02:09:52PM +0100, Dietmar Eggemann wrote:
> >> On 19.02.26 08:58, Peter Zijlstra wrote:
> >>> It turns out that zero_vruntime tracking is broken when there is but a single
> >>> task running. Current update paths are through __{en,de}queue_entity(), and
> >>> when there is but a single task, pick_next_task() will always return that one
> >>> task, and put_prev_set_next_task() will end up in neither function.
> >>
> >> Tried hard but I don't get the last clause.
> >
> > When prev==next, then put_prev_set_next_task() bails out and we'll never
> > hit __enqueue_entity()/__dequeue_entity().
>
> Ah, I see. But IMHO put_prev_set_next_task() is never called for the
> testcase below (CPU hog is prev and next in put_prev_set_next_task())
>
> But (prev != p) in pick_next_task_fair() is avoided ?
>
> pick_next_task_fair()
>
> if (prev != p) {
>
> while (!is_same_group())
> put_prev_entity()
> set_next_entity()
>
> put_prev_entity()
> set_next_entity()
> }
>
Ah yes, pick_next_task_fair() open codes that. Also, I might have a
patch to 'fix' all that, but I've not gotten around to posting that.
There's still a few wobblies in that part of the pile :/
Look at the top 3 patches in queue/sched/flat if you're up for it :-)
On Thu, 19 Feb 2026 at 09:10, Peter Zijlstra <peterz@infradead.org> wrote:
>
> It turns out that zero_vruntime tracking is broken when there is but a single
> task running. Current update paths are through __{en,de}queue_entity(), and
> when there is but a single task, pick_next_task() will always return that one
> task, and put_prev_set_next_task() will end up in neither function.
>
> This can cause entity_key() to grow indefinitely large and cause overflows,
> leading to much pain and suffering.
>
> Furthermore, doing update_zero_vruntime() from __{de,en}queue_entity(), which
> are called from {set_next,put_prev}_entity() has problems because:
>
> - set_next_entity() calls __dequeue_entity() before it does cfs_rq->curr = se.
> This means the avg_vruntime() will see the removal but not current, missing
> the entity for accounting.
>
> - put_prev_entity() calls __enqueue_entity() before it does cfs_rq->curr =
> NULL. This means the avg_vruntime() will see the addition *and* current,
> leading to double accounting.
>
> Both cases are incorrect/inconsistent.
>
> Noting that avg_vruntime is already called on each {en,de}queue, remove the
> explicit avg_vruntime() calls (which removes an extra 64bit division for each
> {en,de}queue) and have avg_vruntime() update zero_vruntime itself.
>
> Additionally, have the tick call avg_vruntime() -- discarding the result, but
> for the side-effect of updating zero_vruntime.
>
> While there, optimize avg_vruntime() by noting that the average of one value is
> rather trivial to compute.
>
> Test case:
> # taskset -c -p 1 $$
> # taskset -c 2 bash -c 'while :; do :; done&'
> # cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>"
>
> PRE:
> .zero_vruntime : 31316.407903
> >R bash 487 50787.345112 E 50789.145972 2.800000 50780.298364 16 120 0.000000 0.000000 0.000000 /
> .zero_vruntime : 382548.253179
> >R bash 487 427275.204288 E 427276.003584 2.800000 427268.157540 23 120 0.000000 0.000000 0.000000 /
>
> POST:
> .zero_vruntime : 17259.709467
> >R bash 526 17259.709467 E 17262.509467 2.800000 16915.031624 9 120 0.000000 0.000000 0.000000 /
> .zero_vruntime : 18702.723356
> >R bash 526 18702.723356 E 18705.523356 2.800000 18358.045513 9 120 0.000000 0.000000 0.000000 /
>
> Fixes: 79f3f9bedd14 ("sched/eevdf: Fix min_vruntime vs avg_vruntime")
> Reported-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 57 insertions(+), 27 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -589,6 +589,21 @@ static inline bool entity_before(const s
> return vruntime_cmp(a->deadline, "<", b->deadline);
> }
>
> +/*
> + * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
> + * and this value should be no more than two lag bounds. Which puts it in the
> + * general order of:
> + *
> + * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
> + *
> + * which is around 44 bits in size (on 64bit); that is 20 for
> + * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
> + * however many msec the actual slice+tick ends up being.
> + *
> + * (disregarding the actual divide-by-weight part makes for the worst case
> + * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
> + * being the zero-lag point).
> + */
> static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
> @@ -676,39 +691,61 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq
> }
>
> static inline
> -void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
> +void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
> {
> /*
> - * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
> + * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
> */
> cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
> + cfs_rq->zero_vruntime += delta;
> }
>
> /*
> - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
> + * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
> * For this to be so, the result of this function must have a left bias.
> + *
> + * Called in:
> + * - place_entity() -- before enqueue
> + * - update_entity_lag() -- before dequeue
> + * - entity_tick()
> + *
> + * This means it is one entry 'behind' but that puts it close enough to where
> + * the bound on entity_key() is at most two lag bounds.
> */
> u64 avg_vruntime(struct cfs_rq *cfs_rq)
> {
> struct sched_entity *curr = cfs_rq->curr;
> - s64 avg = cfs_rq->sum_w_vruntime;
> - long load = cfs_rq->sum_weight;
> + long weight = cfs_rq->sum_weight;
> + s64 delta = 0;
>
> - if (curr && curr->on_rq) {
> - unsigned long weight = scale_load_down(curr->load.weight);
> + if (curr && !curr->on_rq)
> + curr = NULL;
>
> - avg += entity_key(cfs_rq, curr) * weight;
> - load += weight;
> - }
> + if (weight) {
> + s64 runtime = cfs_rq->sum_w_vruntime;
> +
> + if (curr) {
> + unsigned long w = scale_load_down(curr->load.weight);
> +
> + runtime += entity_key(cfs_rq, curr) * w;
> + weight += w;
> + }
>
> - if (load) {
> /* sign flips effective floor / ceiling */
> - if (avg < 0)
> - avg -= (load - 1);
> - avg = div_s64(avg, load);
> + if (runtime < 0)
> + runtime -= (weight - 1);
> +
> + delta = div_s64(runtime, weight);
> + } else if (curr) {
> + /*
> + * When there is but one element, it is the average.
> + */
> + delta = curr->vruntime - cfs_rq->zero_vruntime;
> }
>
> - return cfs_rq->zero_vruntime + avg;
> + update_zero_vruntime(cfs_rq, delta);
> +
> + return cfs_rq->zero_vruntime;
> }
>
> /*
> @@ -777,16 +814,6 @@ int entity_eligible(struct cfs_rq *cfs_r
> return vruntime_eligible(cfs_rq, se->vruntime);
> }
>
> -static void update_zero_vruntime(struct cfs_rq *cfs_rq)
> -{
> - u64 vruntime = avg_vruntime(cfs_rq);
> - s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
> -
> - sum_w_vruntime_update(cfs_rq, delta);
> -
> - cfs_rq->zero_vruntime = vruntime;
> -}
> -
> static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
> {
> struct sched_entity *root = __pick_root_entity(cfs_rq);
> @@ -856,7 +883,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntim
> static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> sum_w_vruntime_add(cfs_rq, se);
> - update_zero_vruntime(cfs_rq);
> se->min_vruntime = se->vruntime;
> se->min_slice = se->slice;
> rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
> @@ -868,7 +894,6 @@ static void __dequeue_entity(struct cfs_
> rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
> &min_vruntime_cb);
> sum_w_vruntime_sub(cfs_rq, se);
> - update_zero_vruntime(cfs_rq);
> }
>
> struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
> @@ -5524,6 +5549,11 @@ entity_tick(struct cfs_rq *cfs_rq, struc
> update_load_avg(cfs_rq, curr, UPDATE_TG);
> update_cfs_group(curr);
>
> + /*
> + * Pulls along cfs_rq::zero_vruntime.
> + */
> + avg_vruntime(cfs_rq);
> +
> #ifdef CONFIG_SCHED_HRTICK
> /*
> * queued ticks are scheduled to match the slice, so don't bother
>
>
© 2016 - 2026 Red Hat, Inc.