[PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime

Peter Zijlstra posted 7 patches 1 month, 1 week ago
[PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Peter Zijlstra 1 month, 1 week ago
Due to the zero_vruntime patch, the deltas are now a lot smaller and
measurement with kernel-build and hackbench runs show about 45 bits
used.

This ensures avg_vruntime() tracks the full weight range, reducing
numerical artifacts in reweight and the like.

Also, let's keep the paranoid debug code around for now.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
---
 kernel/sched/debug.c    |   14 ++++++-
 kernel/sched/fair.c     |   91 ++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/features.h |    2 +
 kernel/sched/sched.h    |    3 +
 4 files changed, 90 insertions(+), 20 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,6 +8,7 @@
  */
 #include <linux/debugfs.h>
 #include <linux/nmi.h>
+#include <linux/log2.h>
 #include "sched.h"
 
 /*
@@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m,
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
+	s64 zero_vruntime = -1, sum_w_vruntime = -1;
 	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
+	unsigned int sum_shift;
 	unsigned long flags;
+	u64 sum_weight;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "\n");
@@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, in
 	if (last)
 		right_vruntime = last->vruntime;
 	zero_vruntime = cfs_rq->zero_vruntime;
+	sum_w_vruntime = cfs_rq->sum_w_vruntime;
+	sum_weight = cfs_rq->sum_weight;
+	sum_shift = cfs_rq->sum_shift;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, in
 			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "zero_vruntime",
 			SPLIT_NS(zero_vruntime));
+	SEQ_printf(m, "  .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
+		   sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
+	SEQ_printf(m, "  .%-30s: %Lu\n", "sum_weight",
+		   sum_weight);
+	SEQ_printf(m, "  .%-30s: %u\n", "sum_shift", sum_shift);
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
 			SPLIT_NS(avg_vruntime(cfs_rq)));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -665,15 +665,20 @@ static inline s64 entity_key(struct cfs_
  * Since zero_vruntime closely tracks the per-task service, these
  * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
  * induced in the system due to quantisation.
- *
- * Also, we use scale_load_down() to reduce the size.
- *
- * As measured, the max (key * weight) value was ~44 bits for a kernel build.
  */
-static void
-sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
+{
+#ifdef CONFIG_64BIT
+	if (cfs_rq->sum_shift)
+		w = max(2UL, w >> cfs_rq->sum_shift);
+#endif
+	return w;
+}
+
+static inline void
+__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long weight = scale_load_down(se->load.weight);
+	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
 	s64 key = entity_key(cfs_rq, se);
 
 	cfs_rq->sum_w_vruntime += key * weight;
@@ -681,9 +686,59 @@ sum_w_vruntime_add(struct cfs_rq *cfs_rq
 }
 
 static void
+sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight;
+	s64 key, tmp;
+
+again:
+	weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+	key = entity_key(cfs_rq, se);
+
+	if (check_mul_overflow(key, weight, &key))
+		goto overflow;
+
+	if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
+		goto overflow;
+
+	cfs_rq->sum_w_vruntime = tmp;
+	cfs_rq->sum_weight += weight;
+	return;
+
+overflow:
+	/*
+	 * There's gotta be a limit -- if we're still failing at this point
+	 * there's really nothing much to be done about things.
+	 */
+	BUG_ON(cfs_rq->sum_shift >= 10);
+	cfs_rq->sum_shift++;
+
+	/*
+	 * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
+	 */
+	cfs_rq->sum_w_vruntime = 0;
+	cfs_rq->sum_weight = 0;
+
+	for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+	     node; node = rb_next(node))
+		__sum_w_vruntime_add(cfs_rq, __node_2_se(node));
+
+	goto again;
+}
+
+static void
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (sched_feat(PARANOID_AVG))
+		return sum_w_vruntime_add_paranoid(cfs_rq, se);
+
+	__sum_w_vruntime_add(cfs_rq, se);
+}
+
+static void
 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long weight = scale_load_down(se->load.weight);
+	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
 	s64 key = entity_key(cfs_rq, se);
 
 	cfs_rq->sum_w_vruntime -= key * weight;
@@ -725,7 +780,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		s64 runtime = cfs_rq->sum_w_vruntime;
 
 		if (curr) {
-			unsigned long w = scale_load_down(curr->load.weight);
+			unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 			runtime += entity_key(cfs_rq, curr) * w;
 			weight += w;
@@ -735,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		if (runtime < 0)
 			runtime -= (weight - 1);
 
-		delta = div_s64(runtime, weight);
+		delta = div64_long(runtime, weight);
 	} else if (curr) {
 		/*
 		 * When there is but one element, it is the average.
@@ -801,7 +856,7 @@ static int vruntime_eligible(struct cfs_
 	long load = cfs_rq->sum_weight;
 
 	if (curr && curr->on_rq) {
-		unsigned long weight = scale_load_down(curr->load.weight);
+		unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 		avg += entity_key(cfs_rq, curr) * weight;
 		load += weight;
@@ -3871,12 +3926,12 @@ static void reweight_entity(struct cfs_r
 	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
 	 * we need to scale se->vlag when w_i changes.
 	 */
-	se->vlag = div_s64(se->vlag * se->load.weight, weight);
+	se->vlag = div64_long(se->vlag * se->load.weight, weight);
 	if (se->rel_deadline)
-		se->deadline = div_s64(se->deadline * se->load.weight, weight);
+		se->deadline = div64_long(se->deadline * se->load.weight, weight);
 
 	if (rel_vprot)
-		vprot = div_s64(vprot * se->load.weight, weight);
+		vprot = div64_long(vprot * se->load.weight, weight);
 
 	update_load_set(&se->load, weight);
 
@@ -5180,7 +5235,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	 */
 	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
 		struct sched_entity *curr = cfs_rq->curr;
-		unsigned long load;
+		long load;
 
 		lag = se->vlag;
 
@@ -5238,12 +5293,12 @@ place_entity(struct cfs_rq *cfs_rq, stru
 		 */
 		load = cfs_rq->sum_weight;
 		if (curr && curr->on_rq)
-			load += scale_load_down(curr->load.weight);
+			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
 
-		lag *= load + scale_load_down(se->load.weight);
+		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
 		if (WARN_ON_ONCE(!load))
 			load = 1;
-		lag = div_s64(lag, load);
+		lag = div64_long(lag, load);
 	}
 
 	se->vruntime = vruntime - lag;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
 SCHED_FEAT(DELAY_DEQUEUE, true)
 SCHED_FEAT(DELAY_ZERO, true)
 
+SCHED_FEAT(PARANOID_AVG, false)
+
 /*
  * Allow wakeup-time preemption of the current task:
  */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -684,8 +684,9 @@ struct cfs_rq {
 
 	s64			sum_w_vruntime;
 	u64			sum_weight;
-
 	u64			zero_vruntime;
+	unsigned int		sum_shift;
+
 #ifdef CONFIG_SCHED_CORE
 	unsigned int		forceidle_seq;
 	u64			zero_vruntime_fi;
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Vincent Guittot 1 month, 1 week ago
On Thu, 19 Feb 2026 at 09:10, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Due to the zero_vruntime patch, the deltas are now a lot smaller and
> measurement with kernel-build and hackbench runs show about 45 bits
> used.
>
> This ensures avg_vruntime() tracks the full weight range, reducing
> numerical artifacts in reweight and the like.

Instead of paranoid, would it be better to add WARN_ONCE ?

I'm afraid that we will not notice any potential overflow without a
long study of the regression with SCHED_FEAT(PARANOID_AVG, false)

Couldn't we add a cheaper WARN_ONCE (key > 2^50)  in __sum_w_vruntime_add ?

We should always have
key < 110ms (max slice+max tick) * nice_0 (2^20) / weight (2)
key < 2^46

We can use 50 bits to get margin

Weight is always less than 27bits and key*weight gives us 110ms (max
slice+max tick) * nice_0 (2^20) so we should never add more than 2^47
to ->sum_weight

so a WARN_ONCE (cfs_rq->sum_weight > 2^63) should be enough



>
> Also, lets keep the paranoid debug code around fow now.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
> ---
>  kernel/sched/debug.c    |   14 ++++++-
>  kernel/sched/fair.c     |   91 ++++++++++++++++++++++++++++++++++++++----------
>  kernel/sched/features.h |    2 +
>  kernel/sched/sched.h    |    3 +
>  4 files changed, 90 insertions(+), 20 deletions(-)
>
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -8,6 +8,7 @@
>   */
>  #include <linux/debugfs.h>
>  #include <linux/nmi.h>
> +#include <linux/log2.h>
>  #include "sched.h"
>
>  /*
> @@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m,
>
>  void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>  {
> -       s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
> +       s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
> +       s64 zero_vruntime = -1, sum_w_vruntime = -1;
>         struct sched_entity *last, *first, *root;
>         struct rq *rq = cpu_rq(cpu);
> +       unsigned int sum_shift;
>         unsigned long flags;
> +       u64 sum_weight;
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>         SEQ_printf(m, "\n");
> @@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, in
>         if (last)
>                 right_vruntime = last->vruntime;
>         zero_vruntime = cfs_rq->zero_vruntime;
> +       sum_w_vruntime = cfs_rq->sum_w_vruntime;
> +       sum_weight = cfs_rq->sum_weight;
> +       sum_shift = cfs_rq->sum_shift;
>         raw_spin_rq_unlock_irqrestore(rq, flags);
>
>         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
> @@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, in
>                         SPLIT_NS(left_vruntime));
>         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "zero_vruntime",
>                         SPLIT_NS(zero_vruntime));
> +       SEQ_printf(m, "  .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
> +                  sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
> +       SEQ_printf(m, "  .%-30s: %Lu\n", "sum_weight",
> +                  sum_weight);
> +       SEQ_printf(m, "  .%-30s: %u\n", "sum_shift", sum_shift);
>         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
>                         SPLIT_NS(avg_vruntime(cfs_rq)));
>         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -665,15 +665,20 @@ static inline s64 entity_key(struct cfs_
>   * Since zero_vruntime closely tracks the per-task service, these
>   * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
>   * induced in the system due to quantisation.
> - *
> - * Also, we use scale_load_down() to reduce the size.
> - *
> - * As measured, the max (key * weight) value was ~44 bits for a kernel build.
>   */
> -static void
> -sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
> +{
> +#ifdef CONFIG_64BIT
> +       if (cfs_rq->sum_shift)
> +               w = max(2UL, w >> cfs_rq->sum_shift);
> +#endif
> +       return w;
> +}
> +
> +static inline void
> +__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -       unsigned long weight = scale_load_down(se->load.weight);
> +       unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
>         s64 key = entity_key(cfs_rq, se);
>
>         cfs_rq->sum_w_vruntime += key * weight;
> @@ -681,9 +686,59 @@ sum_w_vruntime_add(struct cfs_rq *cfs_rq
>  }
>
>  static void
> +sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +       unsigned long weight;
> +       s64 key, tmp;
> +
> +again:
> +       weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> +       key = entity_key(cfs_rq, se);
> +
> +       if (check_mul_overflow(key, weight, &key))
> +               goto overflow;
> +
> +       if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
> +               goto overflow;
> +
> +       cfs_rq->sum_w_vruntime = tmp;
> +       cfs_rq->sum_weight += weight;
> +       return;
> +
> +overflow:
> +       /*
> +        * There's gotta be a limit -- if we're still failing at this point
> +        * there's really nothing much to be done about things.
> +        */
> +       BUG_ON(cfs_rq->sum_shift >= 10);
> +       cfs_rq->sum_shift++;
> +
> +       /*
> +        * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
> +        */
> +       cfs_rq->sum_w_vruntime = 0;
> +       cfs_rq->sum_weight = 0;
> +
> +       for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
> +            node; node = rb_next(node))
> +               __sum_w_vruntime_add(cfs_rq, __node_2_se(node));
> +
> +       goto again;
> +}
> +
> +static void
> +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +       if (sched_feat(PARANOID_AVG))
> +               return sum_w_vruntime_add_paranoid(cfs_rq, se);
> +
> +       __sum_w_vruntime_add(cfs_rq, se);
> +}
> +
> +static void
>  sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -       unsigned long weight = scale_load_down(se->load.weight);
> +       unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
>         s64 key = entity_key(cfs_rq, se);
>
>         cfs_rq->sum_w_vruntime -= key * weight;
> @@ -725,7 +780,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
>                 s64 runtime = cfs_rq->sum_w_vruntime;
>
>                 if (curr) {
> -                       unsigned long w = scale_load_down(curr->load.weight);
> +                       unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
>
>                         runtime += entity_key(cfs_rq, curr) * w;
>                         weight += w;
> @@ -735,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
>                 if (runtime < 0)
>                         runtime -= (weight - 1);
>
> -               delta = div_s64(runtime, weight);
> +               delta = div64_long(runtime, weight);
>         } else if (curr) {
>                 /*
>                  * When there is but one element, it is the average.
> @@ -801,7 +856,7 @@ static int vruntime_eligible(struct cfs_
>         long load = cfs_rq->sum_weight;
>
>         if (curr && curr->on_rq) {
> -               unsigned long weight = scale_load_down(curr->load.weight);
> +               unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
>
>                 avg += entity_key(cfs_rq, curr) * weight;
>                 load += weight;
> @@ -3871,12 +3926,12 @@ static void reweight_entity(struct cfs_r
>          * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
>          * we need to scale se->vlag when w_i changes.
>          */
> -       se->vlag = div_s64(se->vlag * se->load.weight, weight);
> +       se->vlag = div64_long(se->vlag * se->load.weight, weight);
>         if (se->rel_deadline)
> -               se->deadline = div_s64(se->deadline * se->load.weight, weight);
> +               se->deadline = div64_long(se->deadline * se->load.weight, weight);
>
>         if (rel_vprot)
> -               vprot = div_s64(vprot * se->load.weight, weight);
> +               vprot = div64_long(vprot * se->load.weight, weight);
>
>         update_load_set(&se->load, weight);
>
> @@ -5180,7 +5235,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
>          */
>         if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
>                 struct sched_entity *curr = cfs_rq->curr;
> -               unsigned long load;
> +               long load;
>
>                 lag = se->vlag;
>
> @@ -5238,12 +5293,12 @@ place_entity(struct cfs_rq *cfs_rq, stru
>                  */
>                 load = cfs_rq->sum_weight;
>                 if (curr && curr->on_rq)
> -                       load += scale_load_down(curr->load.weight);
> +                       load += avg_vruntime_weight(cfs_rq, curr->load.weight);
>
> -               lag *= load + scale_load_down(se->load.weight);
> +               lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
>                 if (WARN_ON_ONCE(!load))
>                         load = 1;
> -               lag = div_s64(lag, load);
> +               lag = div64_long(lag, load);
>         }
>
>         se->vruntime = vruntime - lag;
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
>  SCHED_FEAT(DELAY_DEQUEUE, true)
>  SCHED_FEAT(DELAY_ZERO, true)
>
> +SCHED_FEAT(PARANOID_AVG, false)
> +
>  /*
>   * Allow wakeup-time preemption of the current task:
>   */
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -684,8 +684,9 @@ struct cfs_rq {
>
>         s64                     sum_w_vruntime;
>         u64                     sum_weight;
> -
>         u64                     zero_vruntime;
> +       unsigned int            sum_shift;
> +
>  #ifdef CONFIG_SCHED_CORE
>         unsigned int            forceidle_seq;
>         u64                     zero_vruntime_fi;
>
>
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Peter Zijlstra 1 month, 1 week ago
On Mon, Feb 23, 2026 at 11:56:33AM +0100, Vincent Guittot wrote:
> On Thu, 19 Feb 2026 at 09:10, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Due to the zero_vruntime patch, the deltas are now a lot smaller and
> > measurement with kernel-build and hackbench runs show about 45 bits
> > used.
> >
> > This ensures avg_vruntime() tracks the full weight range, reducing
> > numerical artifacts in reweight and the like.
> 
> Instead of paranoid, would it be better to add WARN_ONCE ?
> 
> I'm afraid that we will not notice any potential overflow without a
> long study of the regression with SCHED_FEAT(PARANOID_AVG, false)
> 
> Couldn't we add a cheaper WARN_ONCE (key > 2^50)  in __sum_w_vruntime_add ?
> 
> We should always have
> key < 110ms (max slice+max tick) * nice_0 (2^20) / weight (2)
> key < 2^46
> 
> We can use 50 bits to get margin
> 
> Weight is always less than 27bits and key*weight gives us 110ms (max
> slice+max tick) * nice_0 (2^20) so we should never add more than 2^47
> to ->sum_weight
> 
> so a WARN_ONCE (cfs_rq->sum_weight > 2^63) should be enough

Ha, I was >< close to pushing out these patches when I saw this.

The thing is signed, so bit 63 is the sign bit, but I suppose we can
test bit 62 like so:

Let me go build and boot that.

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -679,9 +679,13 @@ static inline void
 __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
-	s64 key = entity_key(cfs_rq, se);
+	s64 w_vruntime, key = entity_key(cfs_rq, se);
 
-	cfs_rq->sum_w_vruntime += key * weight;
+	w_vruntime = key * weight;
+
+	WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
+
+	cfs_rq->sum_w_vruntime += w_vruntime;
 	cfs_rq->sum_weight += weight;
 }
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by K Prateek Nayak 3 days, 11 hours ago
On 2/23/2026 5:21 PM, Peter Zijlstra wrote:
>> We should always have
>> key < 110ms (max slice+max tick) * nice_0 (2^20) / weight (2)
>> key < 2^46
>>
>> We can use 50 bits to get margin
>>
>> Weight is always less than 27bits and key*weight gives us 110ms (max
>> slice+max tick) * nice_0 (2^20) so we should never add more than 2^47
>> to ->sum_weight
>>
>> so a WARN_ONCE (cfs_rq->sum_weight > 2^63) should be enough
> 
> Ha, I was >< close to pushing out these patches when I saw this.
> 
> The thing is signed, so bit 63 is the sign bit, but I suppose we can
> test bit 62 like so:
> 
> Let me go build and boot that.
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -679,9 +679,13 @@ static inline void
>  __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> -	s64 key = entity_key(cfs_rq, se);
> +	s64 w_vruntime, key = entity_key(cfs_rq, se);
>  
> -	cfs_rq->sum_w_vruntime += key * weight;
> +	w_vruntime = key * weight;
> +
> +	WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));

I was trying to reproduce the crash that John mentioned on the Patch 1
and although I couldn't reproduce that crash (yet), I tripped this when
running stress-ng yield test (32 copies x 256 children + sched messaging
16 groups) on my dual socket system (2 x 64C/128T):

    ------------[ cut here ]------------
    (w_vruntime >> 63) != (w_vruntime >> 62)
    WARNING: kernel/sched/fair.c:692 at __enqueue_entity+0x382/0x3a0, CPU#5: stress-ng/5062
    Modules linked in: ...
    CPU: 5 UID: 1000 PID: 5062 Comm: stress-ng Not tainted 7.0.0-rc5-topo-test+ #40 PREEMPT(full)
    Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
    RIP: 0010:__enqueue_entity+0x382/0x3a0
    Code: 4c 89 4b 48 4c 89 4b 50 e9 61 fe ff ff 83 f9 3f 0f 87 b8 27 e5 ff 49 d3 ec b8 02 00 00 00 49 39 c4 4c 0f 42 e0 e9 16 ff ff ff <0f> 0b e9 d8 fc ff ff 0f 0b e9 e1 fe ff ff 0f 0b 66 66 2e 0f 1f 84
    RSP: 0018:ffffcf6b8ea88c18 EFLAGS: 00010002
    RAX: bf38ba3b09dc2400 RBX: ffff8d546f832680 RCX: ffffffffffffffff
    RDX: fffffffffffffffe RSI: ffff8d1587818080 RDI: ffff8d546f832680
    RBP: ffff8d1587818080 R08: 000000000000002d R09: 00000000ffffffff
    R10: 0000000000000001 R11: ffffcf6b8ea88ff8 R12: 00000000056ae400
    R13: ffff8d1587818080 R14: 0000000000000001 R15: ffff8d546f832680
    FS:  00007f8c742c9740(0000) GS:ffff8d54b0c82000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00007f2e09381358 CR3: 00000040db039002 CR4: 0000000000f70ef0
    PKRU: 55555554
    Call Trace:
     <IRQ>
     enqueue_task_fair+0x1a3/0xe50
     ? srso_alias_return_thunk+0x5/0xfbef5
     ? place_entity+0x21/0x160
     enqueue_task+0x88/0x1b0
     ttwu_do_activate+0x74/0x1c0
     try_to_wake_up+0x277/0x840
	 ...
     asm_sysvec_call_function_single+0x1a/0x20
    RIP: 0010:do_sched_yield+0x73/0xa0
    Code: 89 df 48 8b 80 e8 02 00 00 48 8b 40 18 e8 75 a9 fd 00 65 ff 05 9e 94 fc 02 66 90 48 8d 7b 48 e8 d3 96 fd 00 fb 0f 1f 44 00 00 <65> ff 0d 86 94 fc 02 5b e9 10 12 fd 00 83 83 70 0d 00 00 01 eb bb
    RSP: 0018:ffffcf6b9b7bbd78 EFLAGS: 00000282
    RAX: ffffffffbbbb4560 RBX: ffff8d546f832580 RCX: 0000000000000000
    RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8d546f8325c8
    RBP: ffffcf6b9b7bbf38 R08: 0000000000000000 R09: 0000000000000000
    R10: 0000000000000000 R11: 0000000000000000 R12: ffff8d15d1070000
    R13: 0000000000000018 R14: 0000000000000000 R15: 0000000000000018
     ? __pfx_yield_task_fair+0x10/0x10
     ? do_sched_yield+0x6d/0xa0
     __do_sys_sched_yield+0xe/0x20
	 ...

Since this wasn't supposed to trip, I'm assuming we are somehow in the
wrap around territory again :-(

I don't see anything particularly interesting in the sched/debug
entry after the fact:

cfs_rq[5]:/user.slice
  .left_deadline                 : 26249498461.397509
  .left_vruntime                 : 26249498270.250843
  .zero_vruntime                 : 26249456859.395628
  .sum_w_vruntime                : 2547538312135680 (51 bits)
  .sum_weight                    : 61440
  .sum_shift                     : 0
  .avg_vruntime                  : 26249498338.158417
  .right_vruntime                : 26249498381.633124
  .spread                        : 111.382281
  .nr_queued                     : 5
  .h_nr_runnable                 : 5
  .h_nr_queued                   : 5
  .h_nr_idle                     : 0
  ...

I still haven't figured out how this happens I'll start running with
some debug prints next.

On a tangential note, now that we only yield on eligibility, would
something like below make sense?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 226509231e67..55ab1f58d703 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9265,9 +9265,10 @@ static void yield_task_fair(struct rq *rq)
 	struct sched_entity *se = &curr->se;
 
 	/*
-	 * Are we the only task in the tree?
+	 * Single task is always eligible on the cfs_rq.
+	 * Don't pull the vruntime needlessly.
 	 */
-	if (unlikely(rq->nr_running == 1))
+	if (unlikely(cfs_rq->nr_queued == 1))
 		return;
 
 	clear_buddies(cfs_rq, se);
-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by K Prateek Nayak 13 hours ago
On 3/30/2026 1:25 PM, K Prateek Nayak wrote:
>     ------------[ cut here ]------------
>     (w_vruntime >> 63) != (w_vruntime >> 62)
>     WARNING: kernel/sched/fair.c:692 at __enqueue_entity+0x382/0x3a0, CPU#5: stress-ng/5062

Back to this: I still see this with latest set of changes on
queue:sched/urgent but it doesn't go kaboom. Nonetheless, it suggests we
are closing in on the s64 limitations of "sum_w_vruntime" which isn't
very comforting.

Here is one scenario where it was triggered when running:

    stress-ng --yield=32 -t 10000000s&
    while true; do perf bench sched messaging -p -t -l 100000 -g 16; done

on a 256CPUs machine after about an hour into the run:

    __enqeue_entity: entity_key(-141245081754) weight(90891264) overflow_mul(5608800059305154560) vlag(57498) delayed?(0)
    cfs_rq: zero_vruntime(3809707759657809) sum_w_vruntime(0) sum_weight(0) nr_queued(1)
    cfs_rq->curr: entity_key(0) vruntime(3809707759657809) deadline(3809723966988476) weight(37)

The above comes from __enqueue_entity() after a place_entity(). Breaking
this down:

    vlag_initial = 57498
    vlag = (57498 * (37 + 90891264)) / 37 = 141,245,081,754

    vruntime = 3809707759657809 - 141245081754 = 3,809,566,514,576,055
    entity_key(se, cfs_rq) = -141,245,081,754

Now, multiplying the entity_key with its own weight results to
5,608,800,059,305,154,560 (same as what overflow_mul() suggests) but
in Python, without overflow, this would be: -12,837,944,014,404,397,056

Now, the fact that it doesn't crash suggests to me the later
avg_vruntime() calculation would restore normality and the
sum_w_vruntime turns to -57498 (vlag_initial) * 90891264 (weight) =
-5,226,065,897,472 (assuming curr's vruntime is still the same) which
only requires 43 bits.

I also added the following at the bottom of dequeue_entity():

    WARN_ON_ONCE(!cfs_rq->nr_queued && cfs_rq->sum_w_vruntime)

which was never triggered when the cfs_rq goes idle so it isn't like we
didn't account sum_w_vruntime properly. There was just a momentary
overflow so we are fine but will it always be that way?

One way to avoid the warning entirely would be to pull the zero_vruntime
close to avg_vruntime if we are enqueuing a very heavy entity.

The correct way to do this would be to compute the actual avg_vruntime()
and move the zero_vruntime to that point (but that requires at least one
multiply + divide + update_zero_vruntime()).

One seemingly cheap way by which I've been able to avoid the warning is
with:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 226509231e67..bc708bb8b5d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5329,6 +5329,7 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	u64 vslice, vruntime = avg_vruntime(cfs_rq);
+	bool update_zero = false;
 	s64 lag = 0;
 
 	if (!se->custom_slice)
@@ -5406,6 +5407,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
+		/*
+		 * If the entity_key() * sum_weight of all the enqueued entities
+		 * is more than the sum_w_vruntime, move the zero_vruntime
+		 * point to the vruntime of the entity which prevents using
+		 * more bits than necessary for sum_w_vruntime until the
+		 * next avg_vruntime().
+		 *
+		 * XXX: Cheap enough check?
+		 */
+		if (abs(lag) > abs(cfs_rq->sum_w_vruntime))
+			update_zero = true;
 		if (WARN_ON_ONCE(!load))
 			load = 1;
 		lag = div64_long(lag, load);
@@ -5413,6 +5425,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	se->vruntime = vruntime - lag;
 
+	if (update_zero)
+		update_zero_vruntime(cfs_rq, -lag);
+
 	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;
---

But I'm sure it'll make people nervous since we basically move the
zero_vruntime to se->vruntime. It isn't too bad if:

    abs(sum_w_vruntime - (lag * load)) < abs(lag * se->load.weight)

but we already know that the latter overflows so is there any other
cheaper indicator that we can use to detect the necessity to adjust the
avg_vruntime beforehand at place_entity()?

-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Peter Zijlstra 8 hours ago
On Thu, Apr 02, 2026 at 10:58:18AM +0530, K Prateek Nayak wrote:
> On 3/30/2026 1:25 PM, K Prateek Nayak wrote:
> >     ------------[ cut here ]------------
> >     (w_vruntime >> 63) != (w_vruntime >> 62)
> >     WARNING: kernel/sched/fair.c:692 at __enqueue_entity+0x382/0x3a0, CPU#5: stress-ng/5062
> 
> Back to this: I still see this with latest set of changes on
> queue:sched/urgent but it doesn't go kaboom. Nonetheless, it suggests we
> are closing in on the s64 limitations of "sum_w_vruntime" which isn't
> very comforting.

Yeah, we are pushing 64bit pretty hard :/ And if all we would care about
was x86_64 I'd have long since used the fact that imul has a 128bit
result and idiv actually divides 128bit. But even among 64bit
architectures that is somewhat rare :/

> Here is one scenario where it was triggered when running:
> 
>     stress-ng --yield=32 -t 10000000s&
>     while true; do perf bench sched messaging -p -t -l 100000 -g 16; done
> 
> on a 256CPUs machine after about an hour into the run:
> 
>     __enqeue_entity: entity_key(-141245081754) weight(90891264) overflow_mul(5608800059305154560) vlag(57498) delayed?(0)
>     cfs_rq: zero_vruntime(3809707759657809) sum_w_vruntime(0) sum_weight(0) nr_queued(1)
>     cfs_rq->curr: entity_key(0) vruntime(3809707759657809) deadline(3809723966988476) weight(37)
> 
> The above comes from __enqueue_entity() after a place_entity(). Breaking
> this down:
> 
>     vlag_initial = 57498
>     vlag = (57498 * (37 + 90891264)) / 37 = 141,245,081,754
> 
>     vruntime = 3809707759657809 - 141245081754 = 3,809,566,514,576,055
>     entity_key(se, cfs_rq) = -141,245,081,754
> 
> Now, multiplying the entity_key with its own weight results to
> 5,608,800,059,305,154,560 (same as what overflow_mul() suggests) but
in Python, without overflow, this would be: -12,837,944,014,404,397,056

Oh gawd, this is a 'fun' case.

> One way to avoid the warning entirely would be to pull the zero_vruntime
> close to avg_vruntime if we are enqueuing a very heavy entity.
> 
> The correct way to do this would be to compute the actual avg_vruntime()
> and move the zero_vruntime to that point (but that requires at least one
> multiply + divide + update_zero_vruntime()).
> 
> One seemingly cheap way by which I've been able to avoid the warning is
> with:
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 226509231e67..bc708bb8b5d0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5329,6 +5329,7 @@ static void
>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
>  	u64 vslice, vruntime = avg_vruntime(cfs_rq);
> +	bool update_zero = false;
>  	s64 lag = 0;
>  
>  	if (!se->custom_slice)
> @@ -5406,6 +5407,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
>  
>  		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
> +		/*
> +		 * If the entity_key() * sum_weight of all the enqueued entities
> +		 * is more than the sum_w_vruntime, move the zero_vruntime
> +		 * point to the vruntime of the entity which prevents using
> +		 * more bits than necessary for sum_w_vruntime until the
> +		 * next avg_vruntime().
> +		 *
> +		 * XXX: Cheap enough check?
> +		 */
> +		if (abs(lag) > abs(cfs_rq->sum_w_vruntime))
> +			update_zero = true;
>  		if (WARN_ON_ONCE(!load))
>  			load = 1;
>  		lag = div64_long(lag, load);
> @@ -5413,6 +5425,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  
>  	se->vruntime = vruntime - lag;
>  
> +	if (update_zero)
> +		update_zero_vruntime(cfs_rq, -lag);
> +
>  	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
>  		se->deadline += se->vruntime;
>  		se->rel_deadline = 0;
> ---
> 
> But I'm sure it'll make people nervous since we basically move the
> zero_vruntime to se->vruntime. It isn't too bad if:
> 
>     abs(sum_w_vruntime - (lag * load)) < abs(lag * se->load.weight)
> 
> but we already know that the latter overflows so is there any other
> cheaper indicator that we can use to detect the necessity to adjust the
> avg_vruntime beforehand at place_entity()?

So in general I think it would be fine to move zero_vruntime to the
heaviest entity in the tree. And if there are multiple equal heaviest
weights, any one of them should be fine.

Per necessity heavy entities are more tightly clustered -- the lag is
inversely proportional to weight, and the spread is proportional to the
lag bound.

I suspect something simple like comparing the entity weight against the
sum_weight might be enough. If the pre-existing tree is, in aggregate,
heavier than the new element, the avg will not move very drastically.
However, if the new element is (significantly) heavier than the tree,
the avg will move significantly (as demonstrated here).

That is, something like the below... But with a comment ofc :-)

Does that make sense?


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9298f49f842c..7fbd9538fe30 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5329,6 +5329,7 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	u64 vslice, vruntime = avg_vruntime(cfs_rq);
+	bool update_zero = false;
 	s64 lag = 0;
 
 	if (!se->custom_slice)
@@ -5345,7 +5346,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
 		struct sched_entity *curr = cfs_rq->curr;
-		long load;
+		long load, weight;
 
 		lag = se->vlag;
 
@@ -5405,14 +5406,21 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		if (curr && curr->on_rq)
 			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
 
-		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
+		weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+		lag *= load + weight;
 		if (WARN_ON_ONCE(!load))
 			load = 1;
 		lag = div64_long(lag, load);
+
+		if (weight > load)
+			update_zero = true;
 	}
 
 	se->vruntime = vruntime - lag;
 
+	if (update_zero)
+		update_zero_vruntime(cfs_rq, -lag);
+
 	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by K Prateek Nayak 8 hours ago
Hello Peter,

On 4/2/2026 3:52 PM, Peter Zijlstra wrote:
> On Thu, Apr 02, 2026 at 10:58:18AM +0530, K Prateek Nayak wrote:
>> On 3/30/2026 1:25 PM, K Prateek Nayak wrote:
>>>     ------------[ cut here ]------------
>>>     (w_vruntime >> 63) != (w_vruntime >> 62)
>>>     WARNING: kernel/sched/fair.c:692 at __enqueue_entity+0x382/0x3a0, CPU#5: stress-ng/5062
>>
>> Back to this: I still see this with latest set of changes on
>> queue:sched/urgent but it doesn't go kaboom. Nonetheless, it suggests we
>> are closing in on the s64 limitations of "sum_w_vruntime" which isn't
>> very comforting.
> 
> Yeah, we are pushing 64bit pretty hard :/ And if all we would care about
> was x86_64 I'd have long since used the fact that imul has a 128bit
> result and idiv actually divides 128bit. But even among 64bit
> architectures that is somewhat rare :/

Guess we have to make do with what is more abundant. We haven't crashed
and burnt yet so it should be a fun debug for future us when we get
there :-)

> 
>> Here is one scenario where it was triggered when running:
>>
>>     stress-ng --yield=32 -t 10000000s&
>>     while true; do perf bench sched messaging -p -t -l 100000 -g 16; done
>>
>> on a 256CPUs machine after about an hour into the run:
>>
>>     __enqeue_entity: entity_key(-141245081754) weight(90891264) overflow_mul(5608800059305154560) vlag(57498) delayed?(0)
>>     cfs_rq: zero_vruntime(3809707759657809) sum_w_vruntime(0) sum_weight(0) nr_queued(1)
>>     cfs_rq->curr: entity_key(0) vruntime(3809707759657809) deadline(3809723966988476) weight(37)
>>
>> The above comes from __enqueue_entity() after a place_entity(). Breaking
>> this down:
>>
>>     vlag_initial = 57498
>>     vlag = (57498 * (37 + 90891264)) / 37 = 141,245,081,754
>>
>>     vruntime = 3809707759657809 - 141245081754 = 3,809,566,514,576,055
>>     entity_key(se, cfs_rq) = -141,245,081,754
>>
>> Now, multiplying the entity_key with its own weight results to
>> 5,608,800,059,305,154,560 (same as what overflow_mul() suggests) but
>> in Python, without overflow, this would be: -12,837,944,014,404,397,056
> 
> Oh gawd, this is a 'fun' case.
> 
>> One way to avoid the warning entirely would be to pull the zero_vruntime
>> close to avg_vruntime if we are enqueuing a very heavy entity.
>>
>> The correct way to do this would be to compute the actual avg_vruntime()
>> and move the zero_vruntime to that point (but that requires at least one
>> multiply + divide + update_zero_vruntime()).
>>
>> One seemingly cheap way by which I've been able to avoid the warning is
>> with:
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 226509231e67..bc708bb8b5d0 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5329,6 +5329,7 @@ static void
>>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>  {
>>  	u64 vslice, vruntime = avg_vruntime(cfs_rq);
>> +	bool update_zero = false;
>>  	s64 lag = 0;
>>  
>>  	if (!se->custom_slice)
>> @@ -5406,6 +5407,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>  			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
>>  
>>  		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
>> +		/*
>> +		 * If the entity_key() * sum_weight of all the enqueued entities
>> +		 * is more than the sum_w_vruntime, move the zero_vruntime
>> +		 * point to the vruntime of the entity which prevents using
>> +		 * more bits than necessary for sum_w_vruntime until the
>> +		 * next avg_vruntime().
>> +		 *
>> +		 * XXX: Cheap enough check?
>> +		 */
>> +		if (abs(lag) > abs(cfs_rq->sum_w_vruntime))
>> +			update_zero = true;
>>  		if (WARN_ON_ONCE(!load))
>>  			load = 1;
>>  		lag = div64_long(lag, load);
>> @@ -5413,6 +5425,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>  
>>  	se->vruntime = vruntime - lag;
>>  
>> +	if (update_zero)
>> +		update_zero_vruntime(cfs_rq, -lag);
>> +
>>  	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
>>  		se->deadline += se->vruntime;
>>  		se->rel_deadline = 0;
>> ---
>>
>> But I'm sure it'll make people nervous since we basically move the
>> zero_vruntime to se->vruntime. It isn't too bad if:
>>
>>     abs(sum_w_vruntime - (lag * load)) < abs(lag * se->load.weight)
>>
>> but we already know that the latter overflows so is there any other
>> cheaper indicator that we can use to detect the necessity to adjust the
>> avg_vruntime beforehand at place_entity()?
> 
> So in general I think it would be fine to move zero_vruntime to the
> heaviest entity in the tree. And if there are multiple equal heaviest
> weights, any one of them should be fine.
> 
> Per necessity heavy entities are more tightly clustered -- the lag is
> inversely proportional to weight, and the spread is proportional to the
> lag bound.
> 
> I suspect something simple like comparing the entity weight against the
> sum_weight might be enough. If the pre-existing tree is, in aggregate,
> heavier than the new element, the avg will not move very drastically.
> However, if the new element is (significantly) heavier than the tree,
> the avg will move significantly (as demonstrated here).
> 
> That is, something like the below... But with a comment ofc :-)
> 
> Does that make sense?

Let me go queue an overnight test to see if I trip that warning or
not. I initially did think this might work but then convinced myself
that testing the spread with "sum_w_vruntime" might prove to be
better but we'll know for sure tomorrow ;-)

-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Peter Zijlstra 3 days, 9 hours ago
On Mon, Mar 30, 2026 at 01:25:59PM +0530, K Prateek Nayak wrote:

> On a tangential note, now that we only yield on eligibility, would
> something like below make sense?
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 226509231e67..55ab1f58d703 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9265,9 +9265,10 @@ static void yield_task_fair(struct rq *rq)
>  	struct sched_entity *se = &curr->se;
>  
>  	/*
> -	 * Are we the only task in the tree?
> +	 * Single task is always eligible on the cfs_rq.
> +	 * Don't pull the vruntime needlessly.
>  	 */
> -	if (unlikely(rq->nr_running == 1))
> +	if (unlikely(cfs_rq->nr_queued == 1))
>  		return;
>  
>  	clear_buddies(cfs_rq, se);

Right, with the addition of sched_ext this could actually make a
difference.
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Vincent Guittot 1 month, 1 week ago
On Mon, 23 Feb 2026 at 12:51, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Feb 23, 2026 at 11:56:33AM +0100, Vincent Guittot wrote:
> > On Thu, 19 Feb 2026 at 09:10, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Due to the zero_vruntime patch, the deltas are now a lot smaller and
> > > measurement with kernel-build and hackbench runs show about 45 bits
> > > used.
> > >
> > > This ensures avg_vruntime() tracks the full weight range, reducing
> > > numerical artifacts in reweight and the like.
> >
> > Instead of paranoid, would it be better to add WARN_ONCE ?
> >
> > I'm afraid that we will not notice any potential overflow without a
> > long study of the regression with SCHED_FEAT(PARANOID_AVG, false)
> >
> > Couldn't we add a cheaper WARN_ONCE (key > 2^50)  in __sum_w_vruntime_add ?
> >
> > We should always have
> > key < 110ms (max slice+max tick) * nice_0 (2^20) / weight (2)
> > key < 2^46
> >
> > We can use 50 bits to get margin
> >
> > Weight is always less than 27bits and key*weight gives us 110ms (max
> > slice+max tick) * nice_0 (2^20) so we should never add more than 2^47
> > to ->sum_weight
> >
> > so a WARN_ONCE (cfs_rq->sum_weight > 2^63) should be enough
>
> Ha, I was >< close to pushing out these patches when I saw this.
>
> The thing is signed, so bit 63 is the sign bit, but I suppose we can
> test bit 62 like so:

Ah yes, I forgot that it's a signed value

>
> Let me go build and boot that.
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -679,9 +679,13 @@ static inline void
>  __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>         unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> -       s64 key = entity_key(cfs_rq, se);
> +       s64 w_vruntime, key = entity_key(cfs_rq, se);
>
> -       cfs_rq->sum_w_vruntime += key * weight;
> +       w_vruntime = key * weight;
> +
> +       WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));

yes looks good

> +
> +       cfs_rq->sum_w_vruntime += w_vruntime;
>         cfs_rq->sum_weight += weight;
>  }
>
Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime
Posted by Peter Zijlstra 1 month, 1 week ago
On Mon, Feb 23, 2026 at 12:51:00PM +0100, Peter Zijlstra wrote:

> Let me go build and boot that.

Seems to not explode; had it run a few things and such. Must be good.

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -679,9 +679,13 @@ static inline void
>  __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> -	s64 key = entity_key(cfs_rq, se);
> +	s64 w_vruntime, key = entity_key(cfs_rq, se);
>  
> -	cfs_rq->sum_w_vruntime += key * weight;
> +	w_vruntime = key * weight;
> +
> +	WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
> +
> +	cfs_rq->sum_w_vruntime += w_vruntime;
>  	cfs_rq->sum_weight += weight;
>  }
>