[PATCH 03/15] sched/fair: Add lag based placement

Peter Zijlstra posted 15 patches 2 years, 8 months ago
[PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 2 years, 8 months ago
With the introduction of avg_vruntime, it is possible to approximate
lag (the entire purpose of introducing it in fact). Use this to do lag
based placement over sleep+wake.

Specifically, the FAIR_SLEEPERS thing places things too far to the
left and messes up the deadline aspect of EEVDF.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h   |    3 
 kernel/sched/core.c     |    1 
 kernel/sched/fair.c     |  162 +++++++++++++++++++++++++++++++++++++-----------
 kernel/sched/features.h |    8 ++
 4 files changed, 138 insertions(+), 36 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -555,8 +555,9 @@ struct sched_entity {
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-	u64				vruntime;
 	u64				prev_sum_exec_runtime;
+	u64				vruntime;
+	s64				vlag;
 
 	u64				nr_migrations;
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4463,6 +4463,7 @@ static void __sched_fork(unsigned long c
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.vlag			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 	return cfs_rq->min_vruntime + avg;
 }
 
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	SCHED_WARN_ON(!se->on_rq);
+	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+}
+
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	u64 min_vruntime = cfs_rq->min_vruntime;
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq,
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
+	unsigned long old_weight = se->load.weight;
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_r
 
 	update_load_set(&se->load, weight);
 
+	if (!se->on_rq) {
+		/*
+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+		 * we need to scale se->vlag when w_i changes.
+		 */
+		se->vlag = div_s64(se->vlag * old_weight, weight);
+	}
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
@@ -4853,49 +4872,119 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 lag = 0;
 
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
+	/*
+	 * Due to how V is constructed as the weighted average of entities,
+	 * adding tasks with positive lag, or removing tasks with negative lag
+	 * will move 'time' backwards, this can screw around with the lag of
+	 * other tasks.
+	 *
+	 * EEVDF: placement strategy #1 / #2
+	 */
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+		struct sched_entity *curr = cfs_rq->curr;
+		unsigned long load;
 
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
+		lag = se->vlag;
 
 		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
+		 * If we want to place a task and preserve lag, we have to
+		 * consider the effect of the new entity on the weighted
+		 * average and compensate for this, otherwise lag can quickly
+		 * evaporate.
+		 *
+		 * Lag is defined as:
+		 *
+		 *   lag_i = S - s_i = w_i * (V - v_i)
+		 *
+		 * To avoid the 'w_i' term all over the place, we only track
+		 * the virtual lag:
+		 *
+		 *   vl_i = V - v_i <=> v_i = V - vl_i
+		 *
+		 * And we take V to be the weighted average of all v:
+		 *
+		 *   V = (\Sum w_j*v_j) / W
+		 *
+		 * Where W is: \Sum w_j
+		 *
+		 * Then, the weighted average after adding an entity with lag
+		 * vl_i is given by:
+		 *
+		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
+		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+		 *      = V - w_i*vl_i / (W + w_i)
+		 *
+		 * And the actual lag after adding an entity with vl_i is:
+		 *
+		 *   vl'_i = V' - v_i
+		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+		 *         = vl_i - w_i*vl_i / (W + w_i)
+		 *
+		 * Which is strictly less than vl_i. So in order to preserve lag
+		 * we should inflate the lag before placement such that the
+		 * effective lag after placement comes out right.
+		 *
+		 * As such, invert the above relation for vl'_i to get the vl_i
+		 * we need to use such that the lag after placement is the lag
+		 * we computed before dequeue.
+		 *
+		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
+		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+		 *
+		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+		 *                   = W*vl_i
+		 *
+		 *   vl_i = (W + w_i)*vl'_i / W
 		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
+		load = cfs_rq->avg_load;
+		if (curr && curr->on_rq)
+			load += curr->load.weight;
 
-		vruntime -= thresh;
+		lag *= load + se->load.weight;
+		if (WARN_ON_ONCE(!load))
+			load = 1;
+		lag = div_s64(lag, load);
+
+		vruntime -= lag;
 	}
 
-	/*
-	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.
-	 * However, min_vruntime can advance much faster than real time, with
-	 * the extreme being when an entity with the minimal weight always runs
-	 * on the cfs_rq. If the waking entity slept for a long time, its
-	 * vruntime difference from min_vruntime may overflow s64 and their
-	 * comparison may get inversed, so ignore the entity's original
-	 * vruntime in that case.
-	 * The maximal vruntime speedup is given by the ratio of normal to
-	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
-	 * When placing a migrated waking entity, its exec_start has been set
-	 * from a different rq. In order to take into account a possible
-	 * divergence between new and prev rq's clocks task because of irq and
-	 * stolen time, we take an additional margin.
-	 * So, cutting off on the sleep time of
-	 *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
-	 * should be safe.
-	 */
-	if (entity_is_long_sleeper(se))
-		se->vruntime = vruntime;
-	else
-		se->vruntime = max_vruntime(se->vruntime, vruntime);
+	if (sched_feat(FAIR_SLEEPERS)) {
+
+		/* sleeps up to a single latency don't count. */
+		if (!initial) {
+			unsigned long thresh;
+
+			if (se_is_idle(se))
+				thresh = sysctl_sched_min_granularity;
+			else
+				thresh = sysctl_sched_latency;
+
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
+			vruntime -= thresh;
+		}
+
+		/*
+		 * Pull vruntime of the entity being placed to the base level of
+		 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
+		 * slept for a long time, don't even try to compare its vruntime with
+		 * the base as it may be too far off and the comparison may get
+		 * inversed due to s64 overflow.
+		 */
+		if (!entity_is_long_sleeper(se))
+			vruntime = max_vruntime(se->vruntime, vruntime);
+	}
+
+	se->vruntime = vruntime;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5066,6 +5155,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 
 	clear_buddies(cfs_rq, se);
 
+	if (flags & DEQUEUE_SLEEP)
+		update_entity_lag(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,12 +1,20 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+
 /*
  * Only give sleepers 50% of their service deficit. This allows
  * them to run sooner, but does not allow tons of sleepers to
  * rip the spread apart.
  */
+SCHED_FEAT(FAIR_SLEEPERS, false)
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 
 /*
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
+ */
+SCHED_FEAT(PLACE_LAG, true)
+
+/*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since its likely going to consume data we
  * touched, increases cache locality.
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Breno Leitao 1 year ago
Hello Peter,

On Wed, May 31, 2023 at 01:58:42PM +0200, Peter Zijlstra wrote:
>
>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>  {
<snip>
> -		vruntime -= thresh;
> +		lag *= load + se->load.weight;
> +		if (WARN_ON_ONCE(!load))

I have 6.13 running on some hosts, and in some cases, where the system
is getting some OOMs, I see the following stack:

          WARNING: CPU: 29 PID: 593474 at kernel/sched/fair.c:5250 place_entity+0x199/0x1b0

           Call Trace:
            <TASK>
            ? __warn+0xd1/0x1b0
            ? place_entity+0x199/0x1b0
            ? report_bug+0x140/0x1c0
            ? handle_bug+0x5e/0x90
            ? exc_invalid_op+0x16/0x40
            ? asm_exc_invalid_op+0x16/0x20
            ? place_entity+0x199/0x1b0
            reweight_entity+0x188/0x200
            enqueue_task_fair.llvm.15448040313737105663+0x28c/0x560
            enqueue_task+0x30/0x120
            ttwu_do_activate+0x99/0x230
            try_to_wake_up+0x25a/0x4a0
            ? hrtimer_dummy_timeout+0x10/0x10
            hrtimer_wakeup+0x25/0x30
            __hrtimer_run_queues+0xf1/0x250
            hrtimer_interrupt+0xfb/0x220
            __sysvec_apic_timer_interrupt+0x47/0x140
            sysvec_apic_timer_interrupt+0x35/0x80
            asm_sysvec_apic_timer_interrupt+0x16/0x20

I am sorry for not decoding the stack, but I am having a hard time
decoding the stack properly. The values I got was misleading, and I am
working to understand what is happening.

Anyway, I don't have a reproducer and this problem doesn't happen
frequent enough. I have 1K hosts with 6.13 and I saw it 5 times in the
last week.

Also, this is happening in 6.13.1.

Thanks
--breno
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 1 year ago
On Fri, Feb 07, 2025 at 02:07:18AM -0800, Breno Leitao wrote:
> Hello Peter,
> 
> On Wed, May 31, 2023 at 01:58:42PM +0200, Peter Zijlstra wrote:
> >
> >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >  {
> <snip>
> > -		vruntime -= thresh;
> > +		lag *= load + se->load.weight;
> > +		if (WARN_ON_ONCE(!load))
> 
> I have 6.13 running on some hosts, and in some cases, where the system
> is getting some OOMs, I see the following stack:
> 
>           WARNING: CPU: 29 PID: 593474 at kernel/sched/fair.c:5250 place_entity+0x199/0x1b0
> 
>            Call Trace:
>             <TASK>
>             ? place_entity+0x199/0x1b0
>             reweight_entity+0x188/0x200
>             enqueue_task_fair.llvm.15448040313737105663+0x28c/0x560
>             enqueue_task+0x30/0x120
>             ttwu_do_activate+0x99/0x230
>             try_to_wake_up+0x25a/0x4a0
>             ? hrtimer_dummy_timeout+0x10/0x10
>             hrtimer_wakeup+0x25/0x30
>             __hrtimer_run_queues+0xf1/0x250
>             hrtimer_interrupt+0xfb/0x220
>             __sysvec_apic_timer_interrupt+0x47/0x140
>             sysvec_apic_timer_interrupt+0x35/0x80
>             asm_sysvec_apic_timer_interrupt+0x16/0x20
> 
> I am sorry for not decoding the stack, but I am having a hard time
> decoding the stack properly. The values I got was misleading, and I am
> working to understand what is happening.
> 
> Anyway, I don't have a reproducer and this problem doesn't happen
> frequent enough. I have 1K hosts with 6.13 and I saw it 5 times in the
> last week.

Weird. Would you mind trying with the below patch on top?

---
Subject: sched/fair: Adhere to place_entity() constraints
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 Jan 2025 15:39:49 +0100

Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") relies on commit 4423af84b297
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
trip a WARN in place_entity().

What happens is that the lag of the very last entity is 0 per
definition -- the average of one element matches the value of that
element. Therefore place_entity() will match the condition skipping
the lag adjustment:

  if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {

Without the 'se->vlag' condition -- it will attempt to adjust the zero
lag even though we're inserting into an empty tree.

Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
don't because they didn't get updated.

Additionally, move update_load_add() after placement() as is
consistent with other place_entity() users -- this change is
non-functional, place_entity() does not use cfs_rq->load.

Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250128143949.GD7145@noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3781,6 +3781,7 @@ static void reweight_entity(struct cfs_r
 		update_entity_lag(cfs_rq, se);
 		se->deadline -= se->vruntime;
 		se->rel_deadline = 1;
+		cfs_rq->nr_queued--;
 		if (!curr)
 			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3807,10 +3808,11 @@ static void reweight_entity(struct cfs_r
 
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
-		update_load_add(&cfs_rq->load, se->load.weight);
 		place_entity(cfs_rq, se, 0);
+		update_load_add(&cfs_rq->load, se->load.weight);
 		if (!curr)
 			__enqueue_entity(cfs_rq, se);
+		cfs_rq->nr_queued++;
 
 		/*
 		 * The entity's vruntime has been adjusted, so let's check
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Breno Leitao 11 months, 4 weeks ago
On Fri, Feb 07, 2025 at 12:11:41PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2025 at 02:07:18AM -0800, Breno Leitao wrote:
> > Hello Peter,
> > 
> > On Wed, May 31, 2023 at 01:58:42PM +0200, Peter Zijlstra wrote:
> > >
> > >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >  {
> > <snip>
> > > -		vruntime -= thresh;
> > > +		lag *= load + se->load.weight;
> > > +		if (WARN_ON_ONCE(!load))
> > 
> > I have 6.13 running on some hosts, and in some cases, where the system
> > is getting some OOMs, I see the following stack:
> > 
> >           WARNING: CPU: 29 PID: 593474 at kernel/sched/fair.c:5250 place_entity+0x199/0x1b0
> > 
> >            Call Trace:
> >             <TASK>
> >             ? place_entity+0x199/0x1b0
> >             reweight_entity+0x188/0x200
> >             enqueue_task_fair.llvm.15448040313737105663+0x28c/0x560
> >             enqueue_task+0x30/0x120
> >             ttwu_do_activate+0x99/0x230
> >             try_to_wake_up+0x25a/0x4a0
> >             ? hrtimer_dummy_timeout+0x10/0x10
> >             hrtimer_wakeup+0x25/0x30
> >             __hrtimer_run_queues+0xf1/0x250
> >             hrtimer_interrupt+0xfb/0x220
> >             __sysvec_apic_timer_interrupt+0x47/0x140
> >             sysvec_apic_timer_interrupt+0x35/0x80
> >             asm_sysvec_apic_timer_interrupt+0x16/0x20
> > 
> > I am sorry for not decoding the stack, but I am having a hard time
> > decoding the stack properly. The values I got was misleading, and I am
> > working to understand what is happening.
> > 
> > Anyway, I don't have a reproducer and this problem doesn't happen
> > frequent enough. I have 1K hosts with 6.13 and I saw it 5 times in the
> > last week.
> 
> Weird. Would you mind trying with the below patch on top?

Tested for a 3 days in ~1k hosts and the warning is gone.

Tested-by: Breno Leitao <leitao@debian.org>

Thanks!
--breno
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Breno Leitao 1 year ago
On Fri, Feb 07, 2025 at 12:11:41PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2025 at 02:07:18AM -0800, Breno Leitao wrote:
> > Hello Peter,
> > 
> > On Wed, May 31, 2023 at 01:58:42PM +0200, Peter Zijlstra wrote:
> > >
> > >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >  {
> > <snip>
> > > -		vruntime -= thresh;
> > > +		lag *= load + se->load.weight;
> > > +		if (WARN_ON_ONCE(!load))
> > 
> > I have 6.13 running on some hosts, and in some cases, where the system
> > is getting some OOMs, I see the following stack:
> > 
> >           WARNING: CPU: 29 PID: 593474 at kernel/sched/fair.c:5250 place_entity+0x199/0x1b0
> > 
> >            Call Trace:
> >             <TASK>
> >             ? place_entity+0x199/0x1b0
> >             reweight_entity+0x188/0x200
> >             enqueue_task_fair.llvm.15448040313737105663+0x28c/0x560
> >             enqueue_task+0x30/0x120
> >             ttwu_do_activate+0x99/0x230
> >             try_to_wake_up+0x25a/0x4a0
> >             ? hrtimer_dummy_timeout+0x10/0x10
> >             hrtimer_wakeup+0x25/0x30
> >             __hrtimer_run_queues+0xf1/0x250
> >             hrtimer_interrupt+0xfb/0x220
> >             __sysvec_apic_timer_interrupt+0x47/0x140
> >             sysvec_apic_timer_interrupt+0x35/0x80
> >             asm_sysvec_apic_timer_interrupt+0x16/0x20
> > 
> > I am sorry for not decoding the stack, but I am having a hard time
> > decoding the stack properly. The values I got was misleading, and I am
> > working to understand what is happening.
> > 
> > Anyway, I don't have a reproducer and this problem doesn't happen
> > frequent enough. I have 1K hosts with 6.13 and I saw it 5 times in the
> > last week.
> 
> Weird. Would you mind trying with the below patch on top?

I tried to get this patch on top of latest 6.13 stable (6.13.1), and it
seems it misses some dependencies in stable. Field cfs_rq->nr_queued
don't exist in 6.13, it was added/renamed later by 736c55a02c477a
("sched/fair: Rename cfs_rq.nr_running into nr_queued").

Is it safe to just s/nr_queued/nr_running in our patch?

Thanks
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Vincent Guittot 1 year ago
On Fri, 7 Feb 2025 at 14:38, Breno Leitao <leitao@debian.org> wrote:
>
> On Fri, Feb 07, 2025 at 12:11:41PM +0100, Peter Zijlstra wrote:
> > On Fri, Feb 07, 2025 at 02:07:18AM -0800, Breno Leitao wrote:
> > > Hello Peter,
> > >
> > > On Wed, May 31, 2023 at 01:58:42PM +0200, Peter Zijlstra wrote:
> > > >
> > > >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > > >  {
> > > <snip>
> > > > -         vruntime -= thresh;
> > > > +         lag *= load + se->load.weight;
> > > > +         if (WARN_ON_ONCE(!load))
> > >
> > > I have 6.13 running on some hosts, and in some cases, where the system
> > > is getting some OOMs, I see the following stack:
> > >
> > >           WARNING: CPU: 29 PID: 593474 at kernel/sched/fair.c:5250 place_entity+0x199/0x1b0
> > >
> > >            Call Trace:
> > >             <TASK>
> > >             ? place_entity+0x199/0x1b0
> > >             reweight_entity+0x188/0x200
> > >             enqueue_task_fair.llvm.15448040313737105663+0x28c/0x560
> > >             enqueue_task+0x30/0x120
> > >             ttwu_do_activate+0x99/0x230
> > >             try_to_wake_up+0x25a/0x4a0
> > >             ? hrtimer_dummy_timeout+0x10/0x10
> > >             hrtimer_wakeup+0x25/0x30
> > >             __hrtimer_run_queues+0xf1/0x250
> > >             hrtimer_interrupt+0xfb/0x220
> > >             __sysvec_apic_timer_interrupt+0x47/0x140
> > >             sysvec_apic_timer_interrupt+0x35/0x80
> > >             asm_sysvec_apic_timer_interrupt+0x16/0x20
> > >
> > > I am sorry for not decoding the stack, but I am having a hard time
> > > decoding the stack properly. The values I got was misleading, and I am
> > > working to understand what is happening.
> > >
> > > Anyway, I don't have a reproducer and this problem doesn't happen
> > > frequent enough. I have 1K hosts with 6.13 and I saw it 5 times in the
> > > last week.
> >
> > Weird. Would you mind trying with the below patch on top?
>
> I tried to get this patch on top of latest 6.13 stable (6.13.1), and it
> seems it misses some dependencies in stable. Field cfs_rq->nr_queued
> don't exist in 6.13, it was added/renamed later by 736c55a02c477a
> ("sched/fair: Rename cfs_rq.nr_running into nr_queued").
>
> Is it safe to just s/nr_queued/nr_running in our patch?

Yes, I was about to mention that nr_queued appeared in v6.14-rc1 and
it's still nr_running in v6.13

>
> Thanks
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Breno Leitao 1 year ago
Hello Peter,

On Fri, Feb 07, 2025 at 12:11:41PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2025 at 02:07:18AM -0800, Breno Leitao wrote:
> > Hello Peter,
> > 
> > On Wed, May 31, 2023 at 01:58:42PM +0200, Peter Zijlstra wrote:
> > >
> > >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > >  {
> > <snip>
> > > -		vruntime -= thresh;
> > > +		lag *= load + se->load.weight;
> > > +		if (WARN_ON_ONCE(!load))
> > 
> > I have 6.13 running on some hosts, and in some cases, where the system
> > is getting some OOMs, I see the following stack:
> > 
> >           WARNING: CPU: 29 PID: 593474 at kernel/sched/fair.c:5250 place_entity+0x199/0x1b0
> > 
> >            Call Trace:
> >             <TASK>
> >             ? place_entity+0x199/0x1b0
> >             reweight_entity+0x188/0x200
> >             enqueue_task_fair.llvm.15448040313737105663+0x28c/0x560
> >             enqueue_task+0x30/0x120
> >             ttwu_do_activate+0x99/0x230
> >             try_to_wake_up+0x25a/0x4a0
> >             ? hrtimer_dummy_timeout+0x10/0x10
> >             hrtimer_wakeup+0x25/0x30
> >             __hrtimer_run_queues+0xf1/0x250
> >             hrtimer_interrupt+0xfb/0x220
> >             __sysvec_apic_timer_interrupt+0x47/0x140
> >             sysvec_apic_timer_interrupt+0x35/0x80
> >             asm_sysvec_apic_timer_interrupt+0x16/0x20
> > 
> > I am sorry for not decoding the stack, but I am having a hard time
> > decoding the stack properly. The values I got was misleading, and I am
> > working to understand what is happening.
> > 
> > Anyway, I don't have a reproducer and this problem doesn't happen
> > frequent enough. I have 1K hosts with 6.13 and I saw it 5 times in the
> > last week.
> 
> Weird. Would you mind trying with the below patch on top?

Thanks for the quick answer. I will pick it on top of our tree, and
start rolling it to those 1k hosts I have been playing with.

I haven't seen this patch in stable. Is it queue up for the next
submission for stable?
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 1 year ago
On Fri, Feb 07, 2025 at 04:25:02AM -0800, Breno Leitao wrote:

> I haven't seen this patch in stable. Is it queue up for the next
> submission for stable?

It's not yet queued up. I meant to do that this week, but 'stuff' kept
getting in between. I'll try and get it pushed into tip/sched/urgent
this weekend somewhere -- but it is unlikely to make -rc2 at this pace.
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Breno Leitao 1 year ago
On Fri, Feb 07, 2025 at 01:30:20PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2025 at 04:25:02AM -0800, Breno Leitao wrote:
> 
> > I haven't seen this patch in stable. Is it queue up for the next
> > submission for stable?
> 
> It's not yet queued, up. I meant to do that this week, but 'stuff' kept
> getting in between. I'll try and get it pushed into tip/sched/urgent
> this seekend somewhere -- but it is unlikely to make -rc2 at this pace.

That is fine.  I just want to assure that this patch will eventually get
to stable, so, I don't need to carry forward this patch forever in my
production tree.

I will let you know in ~1 week if we are still seeing this warning in our machines.

Thanks for the quick response,
--breno
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Benjamin Segall 2 years, 3 months ago
Peter Zijlstra <peterz@infradead.org> writes:

> @@ -4853,49 +4872,119 @@ static void
>  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>  {
>  	u64 vruntime = avg_vruntime(cfs_rq);
> +	s64 lag = 0;
>  
> -	/* sleeps up to a single latency don't count. */
> -	if (!initial) {
> -		unsigned long thresh;
> +	/*
> +	 * Due to how V is constructed as the weighted average of entities,
> +	 * adding tasks with positive lag, or removing tasks with negative lag
> +	 * will move 'time' backwards, this can screw around with the lag of
> +	 * other tasks.
> +	 *
> +	 * EEVDF: placement strategy #1 / #2
> +	 */

So the big problem with EEVDF #1 compared to #2/#3 and CFS (hacky though
it is) is that it creates a significant perverse incentive to yield or
spin until you see yourself be preempted, rather than just sleep (if you
have any competition on the cpu). If you go to sleep immediately after
doing work and happen to do so near the end of a slice (arguably what
you _want_ to have happen overall), then you have to pay that negative
lag in wakeup latency later, because it is maintained through any amount
of sleep. (#1 or similar is good for reweight/migrate of course)

#2 in theory could be abused by micro-sleeping right before you are
preempted, but that isn't something tasks can really predict, unlike
seeing more "don't go to sleep, just spin, the latency numbers are so
much better" nonsense.
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 2 years, 3 months ago
On Thu, Oct 12, 2023 at 12:15:12PM -0700, Benjamin Segall wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> 
> > @@ -4853,49 +4872,119 @@ static void
> >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >  {
> >  	u64 vruntime = avg_vruntime(cfs_rq);
> > +	s64 lag = 0;
> >  
> > -	/* sleeps up to a single latency don't count. */
> > -	if (!initial) {
> > -		unsigned long thresh;
> > +	/*
> > +	 * Due to how V is constructed as the weighted average of entities,
> > +	 * adding tasks with positive lag, or removing tasks with negative lag
> > +	 * will move 'time' backwards, this can screw around with the lag of
> > +	 * other tasks.
> > +	 *
> > +	 * EEVDF: placement strategy #1 / #2
> > +	 */
> 
> So the big problem with EEVDF #1 compared to #2/#3 and CFS (hacky though
> it is) is that it creates a significant perverse incentive to yield or
> spin until you see yourself be preempted, rather than just sleep (if you
> have any competition on the cpu). If you go to sleep immediately after
> doing work and happen to do so near the end of a slice (arguably what
> you _want_ to have happen overall), then you have to pay that negative
> lag in wakeup latency later, because it is maintained through any amount
> of sleep. (#1 or similar is good for reweight/migrate of course)
> 
> #2 in theory could be abused by micro-sleeping right before you are
> preempted, but that isn't something tasks can really predict, unlike
> seeing more "don't go to sleep, just spin, the latency numbers are so
> much better" nonsense.

For giggles (cyclictest vs hackbench):

$ echo PLACE_LAG > /debug/sched/features
$ ./doit-latency-slice.sh
# Running 'sched/messaging' benchmark:
slice 30000000
# /dev/cpu_dma_latency set to 0us
# Min Latencies: 00051
# Avg Latencies: 00819
# Max Latencies: 172558
slice 3000000
# /dev/cpu_dma_latency set to 0us
# Min Latencies: 00033
# Avg Latencies: 00407
# Max Latencies: 12024
slice 300000
# /dev/cpu_dma_latency set to 0us
# Min Latencies: 00055
# Avg Latencies: 00395
# Max Latencies: 11780


$ echo NO_PLACE_LAG > /debug/sched/features
$ ./doit-latency-slice.sh
# Running 'sched/messaging' benchmark:
slice 30000000
# /dev/cpu_dma_latency set to 0us
# Min Latencies: 00069
# Avg Latencies: 69071
# Max Latencies: 1492250
slice 3000000
# /dev/cpu_dma_latency set to 0us
# Min Latencies: 00062
# Avg Latencies: 10215
# Max Latencies: 21209
slice 300000
# /dev/cpu_dma_latency set to 0us
# Min Latencies: 00055
# Avg Latencies: 00060
# Max Latencies: 03088


IOW, insanely worse latencies in most cases. This is because when
everybody starts at 0-lag, everybody is always eligible, and 'fairness'
goes out the window fast.

Placement strategy #1 only really works when you have well behaving
tasks (eg. conforming to the periodic task model -- not waking up before
its time and all that).
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 2 years, 3 months ago
On Thu, Oct 12, 2023 at 12:15:12PM -0700, Benjamin Segall wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> 
> > @@ -4853,49 +4872,119 @@ static void
> >  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> >  {
> >  	u64 vruntime = avg_vruntime(cfs_rq);
> > +	s64 lag = 0;
> >  
> > -	/* sleeps up to a single latency don't count. */
> > -	if (!initial) {
> > -		unsigned long thresh;
> > +	/*
> > +	 * Due to how V is constructed as the weighted average of entities,
> > +	 * adding tasks with positive lag, or removing tasks with negative lag
> > +	 * will move 'time' backwards, this can screw around with the lag of
> > +	 * other tasks.
> > +	 *
> > +	 * EEVDF: placement strategy #1 / #2
> > +	 */
> 
> So the big problem with EEVDF #1 compared to #2/#3 and CFS (hacky though
> it is) is that it creates a significant perverse incentive to yield or
> spin until you see yourself be preempted, rather than just sleep (if you
> have any competition on the cpu). If you go to sleep immediately after
> doing work and happen to do so near the end of a slice (arguably what
> you _want_ to have happen overall), then you have to pay that negative
> lag in wakeup latency later, because it is maintained through any amount
> of sleep. (#1 or similar is good for reweight/migrate of course)
> 
> #2 in theory could be abused by micro-sleeping right before you are
> preempted, but that isn't something tasks can really predict, unlike
> seeing more "don't go to sleep, just spin, the latency numbers are so
> much better" nonsense.

Right, so I do have this:

  https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf&id=344944e06f11da25b49328825ed15fedd63036d3

That allows tasks to sleep away the lag -- with all the gnarly bits that
sleep time has. And it reliably fixes the above. However, it also
depresses a bunch of other stuff. Never a free lunch etc.

It is so far the least horrible of the things I've tried.
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 2 years, 3 months ago
On Fri, Oct 13, 2023 at 12:34:28AM +0200, Peter Zijlstra wrote:

> Right, so I do have this:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf&id=344944e06f11da25b49328825ed15fedd63036d3
> 
> That allows tasks to sleep away the lag -- with all the gnarly bits that
> sleep time has. And it reliably fixes the above. However, it also
> depresses a bunch of other stuff. Never a free lunch etc.
> 
> It is so far the least horrible of the things I've tried. 

So the below is one I conceptually like more -- except I hate the code,
nor does it work as well as the one linked above.

(Mike, this isn't the same one you saw before -- it's been 'improved')

---

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29daece54a74..7f17295931de 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,7 @@ struct task_struct {
 	unsigned			sched_reset_on_fork:1;
 	unsigned			sched_contributes_to_load:1;
 	unsigned			sched_migrated:1;
+	unsigned			sched_delayed:1;
 
 	/* Force alignment to the next boundary: */
 	unsigned			:0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7771a4d68280..38b2e0488a38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3833,12 +3833,21 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
 
 	rq = __task_rq_lock(p, &rf);
 	if (task_on_rq_queued(p)) {
+		update_rq_clock(rq);
+		if (unlikely(p->sched_delayed)) {
+			p->sched_delayed = 0;
+			/* mustn't run a delayed task */
+			WARN_ON_ONCE(task_on_cpu(rq, p));
+			dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+			if (p->se.vlag > 0)
+				p->se.vlag = 0;
+			enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+		}
 		if (!task_on_cpu(rq, p)) {
 			/*
 			 * When on_rq && !on_cpu the task is preempted, see if
 			 * it should preempt the task that is current now.
 			 */
-			update_rq_clock(rq);
 			wakeup_preempt(rq, p, wake_flags);
 		}
 		ttwu_do_wakeup(p);
@@ -6520,6 +6529,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 # define SM_MASK_PREEMPT	SM_PREEMPT
 #endif
 
+static void __deschedule_task(struct rq *rq, struct task_struct *p)
+{
+	deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+
+	if (p->in_iowait) {
+		atomic_inc(&rq->nr_iowait);
+		delayacct_blkio_start();
+	}
+}
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6604,6 +6623,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 
 	switch_count = &prev->nivcsw;
 
+	WARN_ON_ONCE(prev->sched_delayed);
+
 	/*
 	 * We must load prev->state once (task_struct::state is volatile), such
 	 * that we form a control dependency vs deactivate_task() below.
@@ -6632,17 +6653,39 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 			 *
 			 * After this, schedule() must not care about p->state any more.
 			 */
-			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-
-			if (prev->in_iowait) {
-				atomic_inc(&rq->nr_iowait);
-				delayacct_blkio_start();
-			}
+			if (sched_feat(DELAY_DEQUEUE) &&
+			    prev->sched_class->eligible_task &&
+			    !prev->sched_class->eligible_task(rq, prev))
+				prev->sched_delayed = 1;
+			else
+				__deschedule_task(rq, prev);
 		}
 		switch_count = &prev->nvcsw;
 	}
 
-	next = pick_next_task(rq, prev, &rf);
+	for (struct task_struct *tmp = prev;;) {
+
+		next = pick_next_task(rq, tmp, &rf);
+		if (unlikely(tmp != prev))
+			finish_task(tmp);
+
+		if (likely(!next->sched_delayed))
+			break;
+
+		next->sched_delayed = 0;
+
+		/* ttwu_runnable() */
+		if (WARN_ON_ONCE(!next->__state))
+			break;
+
+		prepare_task(next);
+		smp_wmb();
+		__deschedule_task(rq, next);
+		if (next->se.vlag > 0)
+			next->se.vlag = 0;
+		tmp = next;
+	}
+
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2210e7cc057..3084e21abfe7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8410,6 +8410,16 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
 	return pick_next_task_fair(rq, NULL, NULL);
 }
 
+static bool eligible_task_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	update_curr(cfs_rq);
+
+	return entity_eligible(cfs_rq, se);
+}
+
 /*
  * Account for a descheduled task:
  */
@@ -13006,6 +13016,7 @@ DEFINE_SCHED_CLASS(fair) = {
 
 	.wakeup_preempt		= check_preempt_wakeup_fair,
 
+	.eligible_task		= eligible_task_fair,
 	.pick_next_task		= __pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 	.set_next_task          = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a133b46efedd..0546905f1f8f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -11,6 +11,7 @@ SCHED_FEAT(PREEMPT_SHORT, true)
 SCHED_FEAT(PLACE_SLEEPER, false)
 SCHED_FEAT(GENTLE_SLEEPER, true)
 SCHED_FEAT(EVDF, false)
+SCHED_FEAT(DELAY_DEQUEUE, true)
 
 /*
  * Prefer to schedule the task we woke last (assuming it failed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 245df0c6d344..35d297e1d91b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2222,6 +2222,7 @@ struct sched_class {
 
 	void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
 
+	bool (*eligible_task)(struct rq *rq, struct task_struct *p);
 	struct task_struct *(*pick_next_task)(struct rq *rq);
 
 	void (*put_prev_task)(struct rq *rq, struct task_struct *p);
@@ -2275,7 +2276,7 @@ struct sched_class {
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	WARN_ON_ONCE(rq->curr != prev);
+//	WARN_ON_ONCE(rq->curr != prev);
 	prev->sched_class->put_prev_task(rq, prev);
 }
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Mike Galbraith 2 years, 3 months ago
On Fri, 2023-10-13 at 18:35 +0200, Peter Zijlstra wrote:
> On Fri, Oct 13, 2023 at 12:34:28AM +0200, Peter Zijlstra wrote:
>
> > Right, so I do have this:
> >
> >   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf&id=344944e06f11da25b49328825ed15fedd63036d3
> >
> > That allows tasks to sleep away the lag -- with all the gnarly bits that
> > sleep time has. And it reliably fixes the above. However, it also
> > depresses a bunch of other stuff. Never a free lunch etc.
> >
> > It is so far the least horrible of the things I've tried.
>
> So the below is one I conceptually like more -- except I hate the code,
> nor does it work as well as the one linked above.
>
> (Mike, this isn't the same one you saw before -- it's been 'improved')

Still improves high frequency switchers vs hogs nicely.

tbench vs massive_intr

6.4.16-stable                  avg
2353.57  2311.77  2399.27  2354.87  1.00

6.4.16-eevdf                   avg
2037.93  2014.57  2026.84  2026.44  .86   1.00  DELAY_DEQUEUE    v1
1893.53  1903.45  1851.57  1882.85  .79    .92  NO_DELAY_DEQUEUE
2193.33  2165.35  2201.82  2186.83  .92   1.07  DELAY_DEQUEUE    v2

It's barely visible in mixed youtube vs compute in i7-4790 box,
squinting required, and completely invisible in dinky rpi4.

A clear win along with no harm to the mundane mix is a good start in my
book.  Here's hoping canned benchmark bots don't grumble.

	-Mike
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Abel Wu 2 years, 4 months ago
On 5/31/23 7:58 PM, Peter Zijlstra Wrote:
>   		/*
> -		 * Halve their sleep time's effect, to allow
> -		 * for a gentler effect of sleepers:
> +		 * If we want to place a task and preserve lag, we have to
> +		 * consider the effect of the new entity on the weighted
> +		 * average and compensate for this, otherwise lag can quickly
> +		 * evaporate.
> +		 *
> +		 * Lag is defined as:
> +		 *
> +		 *   lag_i = S - s_i = w_i * (V - v_i)
> +		 *
> +		 * To avoid the 'w_i' term all over the place, we only track
> +		 * the virtual lag:
> +		 *
> +		 *   vl_i = V - v_i <=> v_i = V - vl_i
> +		 *
> +		 * And we take V to be the weighted average of all v:
> +		 *
> +		 *   V = (\Sum w_j*v_j) / W
> +		 *
> +		 * Where W is: \Sum w_j
> +		 *
> +		 * Then, the weighted average after adding an entity with lag
> +		 * vl_i is given by:
> +		 *
> +		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
> +		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
> +		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
> +		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
> +		 *      = V - w_i*vl_i / (W + w_i)
> +		 *
> +		 * And the actual lag after adding an entity with vl_i is:
> +		 *
> +		 *   vl'_i = V' - v_i
> +		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
> +		 *         = vl_i - w_i*vl_i / (W + w_i)
> +		 *
> +		 * Which is strictly less than vl_i. So in order to preserve lag

Maybe a stupid question, but why vl'_i < vl_i? Since vl_i can be negative.

> +		 * we should inflate the lag before placement such that the
> +		 * effective lag after placement comes out right.
> +		 *
> +		 * As such, invert the above relation for vl'_i to get the vl_i
> +		 * we need to use such that the lag after placement is the lag
> +		 * we computed before dequeue.
> +		 *
> +		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
> +		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
> +		 *
> +		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
> +		 *                   = W*vl_i
> +		 *
> +		 *   vl_i = (W + w_i)*vl'_i / W
>   		 */
> -		if (sched_feat(GENTLE_FAIR_SLEEPERS))
> -			thresh >>= 1;
> +		load = cfs_rq->avg_load;
> +		if (curr && curr->on_rq)
> +			load += curr->load.weight;
>   
> -		vruntime -= thresh;
> +		lag *= load + se->load.weight;
> +		if (WARN_ON_ONCE(!load))
> +			load = 1;
> +		lag = div_s64(lag, load);
> +
> +		vruntime -= lag;
>   	}
Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 2 years, 4 months ago
On Wed, Oct 11, 2023 at 08:00:22PM +0800, Abel Wu wrote:
> On 5/31/23 7:58 PM, Peter Zijlstra Wrote:
> >   		/*
> > +		 * If we want to place a task and preserve lag, we have to
> > +		 * consider the effect of the new entity on the weighted
> > +		 * average and compensate for this, otherwise lag can quickly
> > +		 * evaporate.
> > +		 *
> > +		 * Lag is defined as:
> > +		 *
> > +		 *   lag_i = S - s_i = w_i * (V - v_i)
> > +		 *
> > +		 * To avoid the 'w_i' term all over the place, we only track
> > +		 * the virtual lag:
> > +		 *
> > +		 *   vl_i = V - v_i <=> v_i = V - vl_i
> > +		 *
> > +		 * And we take V to be the weighted average of all v:
> > +		 *
> > +		 *   V = (\Sum w_j*v_j) / W
> > +		 *
> > +		 * Where W is: \Sum w_j
> > +		 *
> > +		 * Then, the weighted average after adding an entity with lag
> > +		 * vl_i is given by:
> > +		 *
> > +		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
> > +		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
> > +		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
> > +		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
> > +		 *      = V - w_i*vl_i / (W + w_i)
> > +		 *
> > +		 * And the actual lag after adding an entity with vl_i is:
> > +		 *
> > +		 *   vl'_i = V' - v_i
> > +		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
> > +		 *         = vl_i - w_i*vl_i / (W + w_i)
> > +		 *
> > +		 * Which is strictly less than vl_i. So in order to preserve lag
> 
> Maybe a stupid question, but why vl'_i < vl_i? Since vl_i can be negative.

So the below doesn't care about the sign, it simply inverts this
relation to express vl_i in vl'_i:

> > +		 * we should inflate the lag before placement such that the
> > +		 * effective lag after placement comes out right.
> > +		 *
> > +		 * As such, invert the above relation for vl'_i to get the vl_i
> > +		 * we need to use such that the lag after placement is the lag
> > +		 * we computed before dequeue.
> > +		 *
> > +		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
> > +		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
> > +		 *
> > +		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
> > +		 *                   = W*vl_i
> > +		 *
> > +		 *   vl_i = (W + w_i)*vl'_i / W

And then we obtain the scale factor: (W + w_i)/W, which is >1, right?

As such, that means that vl'_i must be smaller than vl_i in the absolute
sense, irrespective of sign.
Re: Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Abel Wu 2 years, 4 months ago
On 10/11/23 9:24 PM, Peter Zijlstra Wrote:
> On Wed, Oct 11, 2023 at 08:00:22PM +0800, Abel Wu wrote:
>> On 5/31/23 7:58 PM, Peter Zijlstra Wrote:
>>>    		/*
>>> +		 * If we want to place a task and preserve lag, we have to
>>> +		 * consider the effect of the new entity on the weighted
>>> +		 * average and compensate for this, otherwise lag can quickly
>>> +		 * evaporate.
>>> +		 *
>>> +		 * Lag is defined as:
>>> +		 *
>>> +		 *   lag_i = S - s_i = w_i * (V - v_i)
>>> +		 *
>>> +		 * To avoid the 'w_i' term all over the place, we only track
>>> +		 * the virtual lag:
>>> +		 *
>>> +		 *   vl_i = V - v_i <=> v_i = V - vl_i
>>> +		 *
>>> +		 * And we take V to be the weighted average of all v:
>>> +		 *
>>> +		 *   V = (\Sum w_j*v_j) / W
>>> +		 *
>>> +		 * Where W is: \Sum w_j
>>> +		 *
>>> +		 * Then, the weighted average after adding an entity with lag
>>> +		 * vl_i is given by:
>>> +		 *
>>> +		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
>>> +		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
>>> +		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
>>> +		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
>>> +		 *      = V - w_i*vl_i / (W + w_i)
>>> +		 *
>>> +		 * And the actual lag after adding an entity with vl_i is:
>>> +		 *
>>> +		 *   vl'_i = V' - v_i
>>> +		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
>>> +		 *         = vl_i - w_i*vl_i / (W + w_i)
>>> +		 *
>>> +		 * Which is strictly less than vl_i. So in order to preserve lag
>>
>> Maybe a stupid question, but why vl'_i < vl_i? Since vl_i can be negative.
> 
> So the below doesn't care about the sign, it simply inverts this
> relation to express vl_i in vl'_i:
> 
>>> +		 * we should inflate the lag before placement such that the
>>> +		 * effective lag after placement comes out right.
>>> +		 *
>>> +		 * As such, invert the above relation for vl'_i to get the vl_i
>>> +		 * we need to use such that the lag after placement is the lag
>>> +		 * we computed before dequeue.
>>> +		 *
>>> +		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
>>> +		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
>>> +		 *
>>> +		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
>>> +		 *                   = W*vl_i
>>> +		 *
>>> +		 *   vl_i = (W + w_i)*vl'_i / W
> 
> And then we obtain the scale factor: (W + w_i)/W, which is >1, right?

Yeah, I see. But the scale factor is only for the to-be-placed entity.
Say there is an entity k on the tree:

	vl_k	= V - v_k

adding the to-be-placed entity i affects this by:

	define delta := w_i*vl_i / (W + w_i)

	vl'_k	= V' - v_k
		= V - delta - (V - vl_k)
		= vl_k - delta

hence for any entity on the tree, its lag is offsetted by @delta. So
I wonder if we should simply do offsetting rather than scaling.

> 
> As such, that means that vl'_i must be smaller than vl_i in the absolute
> sense, irrespective of sign.
Re: Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Peter Zijlstra 2 years, 3 months ago
On Thu, Oct 12, 2023 at 03:04:47PM +0800, Abel Wu wrote:
> On 10/11/23 9:24 PM, Peter Zijlstra Wrote:

> > > > +		 * we should inflate the lag before placement such that the
> > > > +		 * effective lag after placement comes out right.
> > > > +		 *
> > > > +		 * As such, invert the above relation for vl'_i to get the vl_i
> > > > +		 * we need to use such that the lag after placement is the lag
> > > > +		 * we computed before dequeue.
> > > > +		 *
> > > > +		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
> > > > +		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
> > > > +		 *
> > > > +		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
> > > > +		 *                   = W*vl_i
> > > > +		 *
> > > > +		 *   vl_i = (W + w_i)*vl'_i / W
> > 
> > And then we obtain the scale factor: (W + w_i)/W, which is >1, right?
> 
> Yeah, I see. But the scale factor is only for the to-be-placed entity.
> Say there is an entity k on the tree:
> 
> 	vl_k	= V - v_k
> 
> adding the to-be-placed entity i affects this by:
> 
> 	define delta := w_i*vl_i / (W + w_i)
> 
> 	vl'_k	= V' - v_k
> 		= V - delta - (V - vl_k)
> 		= vl_k - delta
> 
> hence for any entity on the tree, its lag is offsetted by @delta. So
> I wonder if we should simply do offsetting rather than scaling.

I don't see the point, the result is the same and computing delta seems
numerically less stable.
Re: Re: [PATCH 03/15] sched/fair: Add lag based placement
Posted by Abel Wu 2 years, 3 months ago
On 10/13/23 3:37 PM, Peter Zijlstra Wrote:
> On Thu, Oct 12, 2023 at 03:04:47PM +0800, Abel Wu wrote:
>> On 10/11/23 9:24 PM, Peter Zijlstra Wrote:
> 
>>>>> +		 * we should inflate the lag before placement such that the
>>>>> +		 * effective lag after placement comes out right.
>>>>> +		 *
>>>>> +		 * As such, invert the above relation for vl'_i to get the vl_i
>>>>> +		 * we need to use such that the lag after placement is the lag
>>>>> +		 * we computed before dequeue.
>>>>> +		 *
>>>>> +		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
>>>>> +		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
>>>>> +		 *
>>>>> +		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
>>>>> +		 *                   = W*vl_i
>>>>> +		 *
>>>>> +		 *   vl_i = (W + w_i)*vl'_i / W
>>>
>>> And then we obtain the scale factor: (W + w_i)/W, which is >1, right?
>>
>> Yeah, I see. But the scale factor is only for the to-be-placed entity.
>> Say there is an entity k on the tree:
>>
>> 	vl_k	= V - v_k
>>
>> adding the to-be-placed entity i affects this by:
>>
>> 	define delta := w_i*vl_i / (W + w_i)
>>
>> 	vl'_k	= V' - v_k
>> 		= V - delta - (V - vl_k)
>> 		= vl_k - delta
>>
>> hence for any entity on the tree, its lag is offsetted by @delta. So
>> I wonder if we should simply do offsetting rather than scaling.
> 
> I don't see the point, the result is the same and computing delta seems
> numerically less stable.

Right. I was not myself then, please forget what I said..
[tip: sched/core] sched/fair: Add lag based placement
Posted by tip-bot2 for Peter Zijlstra 2 years, 6 months ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     86bfbb7ce4f67a88df2639198169b685668e7349
Gitweb:        https://git.kernel.org/tip/86bfbb7ce4f67a88df2639198169b685668e7349
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 31 May 2023 13:58:42 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 19 Jul 2023 09:43:58 +02:00

sched/fair: Add lag based placement

With the introduction of avg_vruntime, it is possible to approximate
lag (the entire purpose of introducing it in fact). Use this to do lag
based placement over sleep+wake.

Specifically, the FAIR_SLEEPERS thing places things too far to the
left and messes up the deadline aspect of EEVDF.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.794929315@infradead.org
---
 include/linux/sched.h   |   3 +-
 kernel/sched/core.c     |   1 +-
 kernel/sched/fair.c     | 168 ++++++++++++++++++++++++++++++---------
 kernel/sched/features.h |   8 ++-
 4 files changed, 141 insertions(+), 39 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2aab7be..ba1828b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -554,8 +554,9 @@ struct sched_entity {
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-	u64				vruntime;
 	u64				prev_sum_exec_runtime;
+	u64				vruntime;
+	s64				vlag;
 
 	u64				nr_migrations;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 83e3654..84b0d47 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4501,6 +4501,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.vlag			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fc43482..dd12ada 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 	return cfs_rq->min_vruntime + avg;
 }
 
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	SCHED_WARN_ON(!se->on_rq);
+	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+}
+
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	u64 min_vruntime = cfs_rq->min_vruntime;
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
+	unsigned long old_weight = se->load.weight;
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 	update_load_set(&se->load, weight);
 
+	if (!se->on_rq) {
+		/*
+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+		 * we need to scale se->vlag when w_i changes.
+		 */
+		se->vlag = div_s64(se->vlag * old_weight, weight);
+	}
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
@@ -4853,49 +4872,119 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 lag = 0;
 
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
+	/*
+	 * Due to how V is constructed as the weighted average of entities,
+	 * adding tasks with positive lag, or removing tasks with negative lag
+	 * will move 'time' backwards, this can screw around with the lag of
+	 * other tasks.
+	 *
+	 * EEVDF: placement strategy #1 / #2
+	 */
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+		struct sched_entity *curr = cfs_rq->curr;
+		unsigned long load;
 
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
+		lag = se->vlag;
 
 		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
+		 * If we want to place a task and preserve lag, we have to
+		 * consider the effect of the new entity on the weighted
+		 * average and compensate for this, otherwise lag can quickly
+		 * evaporate.
+		 *
+		 * Lag is defined as:
+		 *
+		 *   lag_i = S - s_i = w_i * (V - v_i)
+		 *
+		 * To avoid the 'w_i' term all over the place, we only track
+		 * the virtual lag:
+		 *
+		 *   vl_i = V - v_i <=> v_i = V - vl_i
+		 *
+		 * And we take V to be the weighted average of all v:
+		 *
+		 *   V = (\Sum w_j*v_j) / W
+		 *
+		 * Where W is: \Sum w_j
+		 *
+		 * Then, the weighted average after adding an entity with lag
+		 * vl_i is given by:
+		 *
+		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
+		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+		 *      = V - w_i*vl_i / (W + w_i)
+		 *
+		 * And the actual lag after adding an entity with vl_i is:
+		 *
+		 *   vl'_i = V' - v_i
+		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+		 *         = vl_i - w_i*vl_i / (W + w_i)
+		 *
+		 * Which is strictly less than vl_i. So in order to preserve lag
+		 * we should inflate the lag before placement such that the
+		 * effective lag after placement comes out right.
+		 *
+		 * As such, invert the above relation for vl'_i to get the vl_i
+		 * we need to use such that the lag after placement is the lag
+		 * we computed before dequeue.
+		 *
+		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
+		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+		 *
+		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+		 *                   = W*vl_i
+		 *
+		 *   vl_i = (W + w_i)*vl'_i / W
 		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
-
-	/*
-	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.
-	 * However, min_vruntime can advance much faster than real time, with
-	 * the extreme being when an entity with the minimal weight always runs
-	 * on the cfs_rq. If the waking entity slept for a long time, its
-	 * vruntime difference from min_vruntime may overflow s64 and their
-	 * comparison may get inversed, so ignore the entity's original
-	 * vruntime in that case.
-	 * The maximal vruntime speedup is given by the ratio of normal to
-	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
-	 * When placing a migrated waking entity, its exec_start has been set
-	 * from a different rq. In order to take into account a possible
-	 * divergence between new and prev rq's clocks task because of irq and
-	 * stolen time, we take an additional margin.
-	 * So, cutting off on the sleep time of
-	 *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
-	 * should be safe.
-	 */
-	if (entity_is_long_sleeper(se))
-		se->vruntime = vruntime;
-	else
-		se->vruntime = max_vruntime(se->vruntime, vruntime);
+		load = cfs_rq->avg_load;
+		if (curr && curr->on_rq)
+			load += curr->load.weight;
+
+		lag *= load + se->load.weight;
+		if (WARN_ON_ONCE(!load))
+			load = 1;
+		lag = div_s64(lag, load);
+
+		vruntime -= lag;
+	}
+
+	if (sched_feat(FAIR_SLEEPERS)) {
+
+		/* sleeps up to a single latency don't count. */
+		if (!initial) {
+			unsigned long thresh;
+
+			if (se_is_idle(se))
+				thresh = sysctl_sched_min_granularity;
+			else
+				thresh = sysctl_sched_latency;
+
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
+			vruntime -= thresh;
+		}
+
+		/*
+		 * Pull vruntime of the entity being placed to the base level of
+		 * cfs_rq, to prevent boosting it if placed backwards.  If the entity
+		 * slept for a long time, don't even try to compare its vruntime with
+		 * the base as it may be too far off and the comparison may get
+		 * inversed due to s64 overflow.
+		 */
+		if (!entity_is_long_sleeper(se))
+			vruntime = max_vruntime(se->vruntime, vruntime);
+	}
+
+	se->vruntime = vruntime;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5077,6 +5166,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (flags & DEQUEUE_SLEEP)
+		update_entity_lag(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index fa828b3..7958a10 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,12 +1,20 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+
 /*
  * Only give sleepers 50% of their service deficit. This allows
  * them to run sooner, but does not allow tons of sleepers to
  * rip the spread apart.
  */
+SCHED_FEAT(FAIR_SLEEPERS, false)
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 
 /*
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
+ */
+SCHED_FEAT(PLACE_LAG, true)
+
+/*
  * Prefer to schedule the task we woke last (assuming it failed
 * wakeup-preemption), since it's likely going to consume data we
  * touched, increases cache locality.