[v1] sched/eevdf: Optimize reweight and pick

[PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 2 years, 3 months ago

vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
it gets re-weighted, and the calculations can be simplified based
on the fact that re-weight won't change the w-average of all the
entities. Please check the proofs in comments.

But adjusting vruntime can also cause position change in RB-tree
hence require re-queue to fix up which might be costly. This might
be avoided by deferring adjustment to the time the entity actually
leaves tree (dequeue/pick), but that will negatively affect task
selection and probably not good enough either.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
 kernel/sched/fair.c | 151 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 128 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8767988242ee..b00d09a9b601 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3666,41 +3666,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			   unsigned long weight)
+{
+	unsigned long old_weight = se->load.weight;
+	u64 avruntime = avg_vruntime(cfs_rq);
+	s64 vlag, vslice;
+
+	/*
+	 * VRUNTIME
+	 * ========
+	 *
+	 * COROLLARY #1: The virtual runtime of the entity needs to be
+	 * adjusted if re-weight at !0-lag point.
+	 *
+	 * Proof: For contradiction assume this is not true, so we can
+	 * re-weight without changing vruntime at !0-lag point.
+	 *
+	 *             Weight	VRuntime   Avg-VRuntime
+	 *     before    w          v            V
+	 *      after    w'         v'           V'
+	 *
+	 * Since lag needs to be preserved through re-weight:
+	 *
+	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
+	 *	==>	V' = (V - v)*w/w' + v		(1)
+	 *
+	 * Let W be the total weight of the entities before reweight,
+	 * since V' is the new weighted average of entities:
+	 *
+	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
+	 *
+	 * by using (1) & (2) we obtain:
+	 *
+	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+	 *
+	 * Since we are doing at !0-lag point which means V != v, we
+	 * can simplify (3):
+	 *
+	 *	==>	W / (W + w' - w) = w / w'
+	 *	==>	Ww' = Ww + ww' - ww
+	 *	==>	W * (w' - w) = w * (w' - w)
+	 *	==>	W = w	(re-weight indicates w' != w)
+	 *
+	 * So the cfs_rq contains only one entity, hence vruntime of
+	 * the entity @v should always equal to the cfs_rq's weighted
+	 * average vruntime @V, which means we will always re-weight
+	 * at 0-lag point, thus breach assumption. Proof completed.
+	 *
+	 *
+	 * COROLLARY #2: Re-weight does NOT affect weighted average
+	 * vruntime of all the entities.
+	 *
+	 * Proof: According to corollary #1, Eq. (1) should be:
+	 *
+	 *	(V - v)*w = (V' - v')*w'
+	 *	==>    v' = V' - (V - v)*w/w'		(4)
+	 *
+	 * According to the weighted average formula, we have:
+	 *
+	 *	V' = (WV - wv + w'v') / (W - w + w')
+	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+	 *	   = (WV + w'V' - Vw) / (W - w + w')
+	 *
+	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
+	 *	==>	V' * (W - w) = (W - w) * V	(5)
+	 *
+	 * If the entity is the only one in the cfs_rq, then reweight
+	 * always occurs at 0-lag point, so V won't change. Or else
+	 * there are other entities, hence W != w, then Eq. (5) turns
+	 * into V' = V. So V won't change in either case, proof done.
+	 *
+	 *
+	 * So according to corollary #1 & #2, the effect of re-weight
+	 * on vruntime should be:
+	 *
+	 *	v' = V' - (V - v) * w / w'		(4)
+	 *	   = V  - (V - v) * w / w'
+	 *	   = V  - vl * w / w'
+	 *	   = V  - vl'
+	 */
+	if (avruntime != se->vruntime) {
+		vlag = (s64)(avruntime - se->vruntime);
+		vlag = div_s64(vlag * old_weight, weight);
+		se->vruntime = avruntime - vlag;
+	}
+
+	/*
+	 * DEADLINE
+	 * ========
+	 *
+	 * When the weight changes, the virtual time slope changes and
+	 * we should adjust the relative virtual deadline accordingly.
+	 *
+	 *	d' = v' + (d - v)*w/w'
+	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  + (d - V)*w/w'
+	 */
+	vslice = (s64)(se->deadline - avruntime);
+	vslice = div_s64(vslice * old_weight, weight);
+	se->deadline = avruntime + vslice;
+}
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
-	unsigned long old_weight = se->load.weight;
+	bool curr = cfs_rq->curr == se;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
-		if (cfs_rq->curr == se)
+		if (curr)
 			update_curr(cfs_rq);
 		else
-			avg_vruntime_sub(cfs_rq, se);
+			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	update_load_set(&se->load, weight);
-
 	if (!se->on_rq) {
 		/*
 		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
 		 * we need to scale se->vlag when w_i changes.
 		 */
-		se->vlag = div_s64(se->vlag * old_weight, weight);
+		se->vlag = div_s64(se->vlag * se->load.weight, weight);
 	} else {
-		s64 deadline = se->deadline - se->vruntime;
-		/*
-		 * When the weight changes, the virtual time slope changes and
-		 * we should adjust the relative virtual deadline accordingly.
-		 */
-		deadline = div_s64(deadline * old_weight, weight);
-		se->deadline = se->vruntime + deadline;
-		if (se != cfs_rq->curr)
-			min_deadline_cb_propagate(&se->run_node, NULL);
+		reweight_eevdf(cfs_rq, se, weight);
 	}
 
+	update_load_set(&se->load, weight);
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-		if (cfs_rq->curr != se)
-			avg_vruntime_add(cfs_rq, se);
+		if (!curr) {
+			/*
+			 * The entity's vruntime has been adjusted, so let's check
+			 * whether the rq-wide min_vruntime needs updated too. Since
+			 * the calculations above require stable min_vruntime rather
+			 * than up-to-date one, we do the update at the end of the
+			 * reweight process.
+			 */
+			__enqueue_entity(cfs_rq, se);
+			update_min_vruntime(cfs_rq);
+		}
 	}
 }
 
@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
 
 #ifndef CONFIG_SMP
 	shares = READ_ONCE(gcfs_rq->tg->shares);
-
-	if (likely(se->load.weight == shares))
-		return;
 #else
-	shares   = calc_group_shares(gcfs_rq);
+	shares = calc_group_shares(gcfs_rq);
 #endif
-
-	reweight_entity(cfs_rq_of(se), se, shares);
+	if (unlikely(se->load.weight != shares))
+		reweight_entity(cfs_rq_of(se), se, shares);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-- 
2.37.3

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Tianchen Ding 1 year, 11 months ago

Hi Abel:

I'm not so familiar with eevdf and still learning. Here I've some questions 
about this patch.

1. You did proof that V will not change during reweight (COROLLARY #2). However, 
according to the original paper, the new V will be:
V' = V + lag(j)/(W - w_j) - lag(j)/(W - w_j + w'_j)
So the V' in paper will change (when lag is not zero).
Is the V in Linux code slightly different from the paper?

2. I print some log around reweight_entity(), mainly want to print V by calling 
avg_vruntime(cfs_rq). I found your algorithm only keeps the V unchanged during 
reweight_eevdf(), but not reweight_entity().

In detail:
If curr is true (i.e., cfs_rq->curr == se), we will directly run 
reweight_eevdf(), and the V is not changed.
If curr is false, we will have __dequeue_entity() -> reweight_eevdf() -> 
__enqueue_entity(). The V is finally changed due to dequeue and enqueue. So the 
result of reweight_entity() will be impacted by "cfs_rq->curr == se", is this 
expected?

Thanks!

Re: Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 1 year, 11 months ago

Hi Tianchen,

On 2/29/24 5:24 PM, Tianchen Ding Wrote:
> Hi Abel:
> 
> I'm not so familiar with eevdf and still learning. Here I've some questions about this patch.
> 
> 1. You did proof that V will not change during reweight (COROLLARY #2). However, according to the original paper, the new V will be:
> V' = V + lag(j)/(W - w_j) - lag(j)/(W - w_j + w'_j)
> So the V' in paper will change (when lag is not zero).
> Is the V in Linux code slightly different from the paper?

Good catch. And to the best of my knowledge, the answer is YES. The
above Equation in the paper, which is Eq. (20), is based on the
assumption that:

	"once client 3 leaves, the remaining two clients will
	 proportionally support the eventual loss or gain in the
	 service time"  -- Page 10

	"by updating the virtual time according to Eq. (18,19) we
	 ensure that the sum over the lags of all active clients
	 is always zero"  -- Page 11

But in Peter's implementation, it is the competitors in the new group
that client 3 later joins in who actually support the effect. So when
client 3 leaves competition with !0-lag in Linux, the rq's sum(lag_i)
is no longer zero.

> 
> 2. I print some log around reweight_entity(), mainly want to print V by calling avg_vruntime(cfs_rq). I found your algorithm only keeps the V unchanged during reweight_eevdf(), but not reweight_entity().
> 
> In detail:
> If curr is true (i.e., cfs_rq->curr == se), we will directly run reweight_eevdf(), and the V is not changed.
> If curr is false, we will have __dequeue_entity() -> reweight_eevdf() -> __enqueue_entity(). The V is finally changed due to dequeue and enqueue. So the result of reweight_entity() will be impacted by "cfs_rq->curr == se", is this expected?

Good catch again! It smells like a bug. Since this @se is still on_rq,
it should be taken into consideration when calculating avg_runtime(),
but in fact it isn't because __dequeue_entity() will remove its share.

And I seem to spot another bug, although not relate to this problem,
that we actually need to call update_curr() unconditionally if curr is
available, because we need to commit curr's outstanding runtime to
ensure the result of avg_runtime() is up to date.

Thanks & BR,
	Abel

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Tianchen Ding 1 year, 11 months ago

On 2024/2/29 22:25, Abel Wu wrote:
> Good catch. And to the best of my knowledge, the answer is YES. The
> above Equation in the paper, which is Eq. (20), is based on the
> assumption that:
> 
>      "once client 3 leaves, the remaining two clients will
>       proportionally support the eventual loss or gain in the
>       service time"  -- Page 10
> 
>      "by updating the virtual time according to Eq. (18,19) we
>       ensure that the sum over the lags of all active clients
>       is always zero"  -- Page 11
> 
> But in Peter's implementation, it is the competitors in the new group
> that client 3 later joins in who actually support the effect. So when
> client 3 leaves competition with !0-lag in Linux, the rq's sum(lag_i)
> is no longer zero.
> 

I've different opinions. According to the comments above avg_vruntime_add(), V
is calculated exactly to satisfy sum(lag_i)=0. This is guaranteed by math.

Actually I print some logs in enqueue_entity() and dequeue_entity() to verify this:

[  293.261236] before dequeue: V=2525278131 W=3072 v=2526243139 w=1024 lag_sum=0
[  293.261237] after dequeue: V=2524795627 W=2048 v=2526243139 w=1024 lag_sum=0
[  293.262286] before enqueue: V=2525319064 W=2048 v=2526766576 w=1024 lag_sum=0
[  293.262287] after enqueue: V=2525801568 W=3072 v=2526766576 w=1024 lag_sum=0

For the first 2 lines, we have 2524795627 = 2525278131 + (2525278131 - 2526243139) * 1024 / 2048.
Which is Eq. (18)

For the last 2 lines, we have 2525801568 = 2525319064 - (2525319064 - 2526766576) * 1024 / 3072.
Which is Eq. (19)

So whatever client 3 leave or join competition with !0-lag in Linux, V is handled properly.

> Good catch again! It smells like a bug. Since this @se is still on_rq,
> it should be taken into consideration when calculating avg_runtime(),
> but in fact it isn't because __dequeue_entity() will remove its share.
> 
> And I seem to spot another bug, although not relate to this problem,
> that we actually need to call update_curr() unconditionally if curr is
> available, because we need to commit curr's outstanding runtime to
> ensure the result of avg_runtime() is up to date.
> 

I've tried to record avg_vruntime before __dequeue_entity() and pass it to
reweight_eevdf(). Then the issue is fixed. The V keeps the same during the whole
reweight_entity().

I could send these two bugfix patches (one for this bug and one you sugguested
about update_curr). But before doing so, I still want to dig out the answer of
my first question.

Hi Peter, would you please provide any information?

Thanks.



My rough logging code:
(Note: lag_sum may output a minus value, with its absolute value less than W.
This is ok because my lag_sum calculate is not so accurate due to the sign flips
in avg_vruntime())

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..9306c1bbd472 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5260,6 +5260,62 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
  
  static inline bool cfs_bandwidth_used(void);
  
+static int rbtree_all(const void *key, const struct rb_node *node)
+{
+	return 0;
+}
+
+static s64 get_lag_sum(struct cfs_rq *cfs_rq)
+{
+	u64 avruntime = avg_vruntime(cfs_rq);
+	struct sched_entity *curr = cfs_rq->curr;
+	struct rb_node *node;
+	s64 lag_sum = 0;
+
+	rb_for_each(node, 0, &cfs_rq->tasks_timeline.rb_root, rbtree_all) {
+		struct sched_entity *se = __node_2_se(node);
+
+		if (se->on_rq)
+			lag_sum += (avruntime - se->vruntime) * scale_load_down(se->load.weight);
+	}
+
+	if (curr && curr->on_rq) {
+		lag_sum += (avruntime - curr->vruntime) * scale_load_down(curr->load.weight);
+	}
+
+	return lag_sum;
+}
+
+static void print_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, bool before, bool enqueue)
+{
+	if (cpu_of(rq_of(cfs_rq)))
+		return; // avoid too many printings.
+
+	long load = cfs_rq->avg_load;
+	struct sched_entity *curr = cfs_rq->curr;
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 lag_sum = get_lag_sum(cfs_rq);
+	u64 avruntime = avg_vruntime(cfs_rq);
+
+	if (curr && curr->on_rq) {
+		unsigned long curr_weight = scale_load_down(curr->load.weight);
+		load += curr_weight;
+	}
+
+	if (before) {
+		if (enqueue)
+			printk("before enqueue: V=%llu W=%ld v=%llu w=%lu lag_sum=%lld\n", avruntime, load, se->vruntime, weight, lag_sum);
+		else
+			printk("before dequeue: V=%llu W=%ld v=%llu w=%lu lag_sum=%lld\n", avruntime, load, se->vruntime, weight, lag_sum);
+	}
+	else {
+		if (enqueue)
+			printk("after enqueue: V=%llu W=%ld v=%llu w=%lu lag_sum=%lld\n", avruntime, load, se->vruntime, weight, lag_sum);
+		else
+			printk("after dequeue: V=%llu W=%ld v=%llu w=%lu lag_sum=%lld\n", avruntime, load, se->vruntime, weight, lag_sum);
+	}
+}
+
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
@@ -5307,9 +5363,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
  	check_schedstat_required();
  	update_stats_enqueue_fair(cfs_rq, se, flags);
+	print_eevdf(cfs_rq, se, true, true);
  	if (!curr)
  		__enqueue_entity(cfs_rq, se);
  	se->on_rq = 1;
+	print_eevdf(cfs_rq, se, false, true);
  
  	if (cfs_rq->nr_running == 1) {
  		check_enqueue_throttle(cfs_rq);
@@ -5347,6 +5405,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
  
+
  static void
  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
@@ -5377,9 +5436,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  	clear_buddies(cfs_rq, se);
  
  	update_entity_lag(cfs_rq, se);
+	print_eevdf(cfs_rq, se, true, false);
  	if (se != cfs_rq->curr)
  		__dequeue_entity(cfs_rq, se);
  	se->on_rq = 0;
+	print_eevdf(cfs_rq, se, false, false);
  	account_entity_dequeue(cfs_rq, se);
  
  	/* return excess runtime on last dequeue */

Re: Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 1 year, 11 months ago

On 3/1/24 2:41 PM, Tianchen Ding Wrote:
> On 2024/2/29 22:25, Abel Wu wrote:
>> Good catch. And to the best of my knowledge, the answer is YES. The
>> above Equation in the paper, which is Eq. (20), is based on the
>> assumption that:
>>
>>      "once client 3 leaves, the remaining two clients will
>>       proportionally support the eventual loss or gain in the
>>       service time"  -- Page 10
>>
>>      "by updating the virtual time according to Eq. (18,19) we
>>       ensure that the sum over the lags of all active clients
>>       is always zero"  -- Page 11
>>
>> But in Peter's implementation, it is the competitors in the new group
>> that client 3 later joins in who actually support the effect. So when
>> client 3 leaves competition with !0-lag in Linux, the rq's sum(lag_i)
>> is no longer zero.
>>
> 
> I've different opinions. According to the comments above avg_vruntime_add(), V
> is calculated exactly to satisfy sum(lag_i)=0. This is guaranteed by math.

Yes, you are right. I mixed another fairness issue with this. What I
was thinking is that considering multiple competition groups (e.g.
runqueues), the latency bound could be violated, that is someone could
starve a bit. Say one entity even with positive lag could become less
competitive if migrated to a higher competitive group.

Staring at Eq. (20) again, what if we do a fake reweight? I mean let
the client leave and rejoin at the same time without changing weight?
IMHO it should have no effects, but according to Eq. (20) the V will
change to:

	V' = V + lag(j)/(W - w_j) - lag(j)/W != V

Have I missed anything?

> 
> Actually I print some logs in enqueue_entity() and dequeue_entity() to verify this:
> 
> [  293.261236] before dequeue: V=2525278131 W=3072 v=2526243139 w=1024 lag_sum=0
> [  293.261237] after dequeue: V=2524795627 W=2048 v=2526243139 w=1024 lag_sum=0
> [  293.262286] before enqueue: V=2525319064 W=2048 v=2526766576 w=1024 lag_sum=0
> [  293.262287] after enqueue: V=2525801568 W=3072 v=2526766576 w=1024 lag_sum=0
> 
> For the first 2 lines, we have 2524795627 = 2525278131 + (2525278131 - 2526243139) * 1024 / 2048.
> Which is Eq. (18)
> 
> For the last 2 lines, we have 2525801568 = 2525319064 - (2525319064 - 2526766576) * 1024 / 3072.
> Which is Eq. (19)
> 
> So whatever client 3 leave or join competition with !0-lag in Linux, V is handled properly.
> 
>> Good catch again! It smells like a bug. Since this @se is still on_rq,
>> it should be taken into consideration when calculating avg_runtime(),
>> but in fact it isn't because __dequeue_entity() will remove its share.
>>
>> And I seem to spot another bug, although not relate to this problem,
>> that we actually need to call update_curr() unconditionally if curr is
>> available, because we need to commit curr's outstanding runtime to
>> ensure the result of avg_runtime() is up to date.
>>
> 
> I've tried to record avg_vruntime before __dequeue_entity() and pass it to
> reweight_eevdf(). Then the issue is fixed. The V keeps the same during the whole
> reweight_entity().
> 
> I could send these two bugfix patches (one for this bug and one you sugguested

That would be appreciated!

Thanks,
	Abel

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Tianchen Ding 1 year, 11 months ago

On 2024/3/1 16:30, Abel Wu wrote:
> On 3/1/24 2:41 PM, Tianchen Ding Wrote:
>> On 2024/2/29 22:25, Abel Wu wrote:
>>> Good catch. And to the best of my knowledge, the answer is YES. The
>>> above Equation in the paper, which is Eq. (20), is based on the
>>> assumption that:
>>>
>>>      "once client 3 leaves, the remaining two clients will
>>>       proportionally support the eventual loss or gain in the
>>>       service time"  -- Page 10
>>>
>>>      "by updating the virtual time according to Eq. (18,19) we
>>>       ensure that the sum over the lags of all active clients
>>>       is always zero"  -- Page 11
>>>
>>> But in Peter's implementation, it is the competitors in the new group
>>> that client 3 later joins in who actually support the effect. So when
>>> client 3 leaves competition with !0-lag in Linux, the rq's sum(lag_i)
>>> is no longer zero.
>>>
>>
>> I've different opinions. According to the comments above avg_vruntime_add(), V
>> is calculated exactly to satisfy sum(lag_i)=0. This is guaranteed by math.
> 
> Yes, you are right. I mixed another fairness issue with this. What I
> was thinking is that considering multiple competition groups (e.g.
> runqueues), the latency bound could be violated, that is someone could
> starve a bit. Say one entity even with positive lag could become less
> competitive if migrated to a higher competitive group.
> 
> Staring at Eq. (20) again, what if we do a fake reweight? I mean let
> the client leave and rejoin at the same time without changing weight?
> IMHO it should have no effects, but according to Eq. (20) the V will
> change to:
> 
>      V' = V + lag(j)/(W - w_j) - lag(j)/W != V
> 
> Have I missed anything?
> 

Good point! I've not ever noticed this conflict.

I tried to modify reweight_entity() to run dequeue_entity() -> adjust se->vlag ->
enqueue_entity(). And I found V do not changed.

The difference is, when doing enqueue_entity(), Peter enlarges the lag in place_entity().
Because after enqueue, the lag will evaporate.
In order to keep the same lag after enqueue, during place_entity(),
the new lag(t) will be enlarged with (W+w_i)/W.

So the Eq. (20) should be:


     V' = V + lag(j)/(W - w_j) - lag'(j)/(W - w_j + w'_j)

     lag'(j) = lag(j) * (W - w_j + w'_j)/(W - w_j)

So we can get

     V' = V + lag(j)/(W - w_j) - lag(j) * (W - w_j + w'_j)/(W - w_j)/(W - w_j + w'_j) = V

So COROLLARY #2 is correct.

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Yiwei Lin 2 years, 2 months ago

On 11/7/23 17:05, Abel Wu wrote:
> vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
> it gets re-weighted, and the calculations can be simplified based
> on the fact that re-weight won't change the w-average of all the
> entities. Please check the proofs in comments.
>
> But adjusting vruntime can also cause position change in RB-tree
> hence require re-queue to fix up which might be costly. This might
> be avoided by deferring adjustment to the time the entity actually
> leaves tree (dequeue/pick), but that will negatively affect task
> selection and probably not good enough either.
>
> Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
> Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
> ---
>   kernel/sched/fair.c | 151 +++++++++++++++++++++++++++++++++++++-------
>   1 file changed, 128 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8767988242ee..b00d09a9b601 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3666,41 +3666,140 @@ static inline void
>   dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
>   #endif
>   
> +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
> +			   unsigned long weight)
> +{
> +	unsigned long old_weight = se->load.weight;
> +	u64 avruntime = avg_vruntime(cfs_rq);
> +	s64 vlag, vslice;
> +
> +	/*
> +	 * VRUNTIME
> +	 * ========
> +	 *
> +	 * COROLLARY #1: The virtual runtime of the entity needs to be
> +	 * adjusted if re-weight at !0-lag point.
> +	 *
> +	 * Proof: For contradiction assume this is not true, so we can
> +	 * re-weight without changing vruntime at !0-lag point.
> +	 *
> +	 *             Weight	VRuntime   Avg-VRuntime
> +	 *     before    w          v            V
> +	 *      after    w'         v'           V'
> +	 *
> +	 * Since lag needs to be preserved through re-weight:
> +	 *
> +	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
> +	 *	==>	V' = (V - v)*w/w' + v		(1)
> +	 *
> +	 * Let W be the total weight of the entities before reweight,
> +	 * since V' is the new weighted average of entities:
> +	 *
> +	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
> +	 *
> +	 * by using (1) & (2) we obtain:
> +	 *
> +	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
> +	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
> +	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
> +	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
> +	 *
> +	 * Since we are doing at !0-lag point which means V != v, we
> +	 * can simplify (3):
> +	 *
> +	 *	==>	W / (W + w' - w) = w / w'
> +	 *	==>	Ww' = Ww + ww' - ww
> +	 *	==>	W * (w' - w) = w * (w' - w)
> +	 *	==>	W = w	(re-weight indicates w' != w)
> +	 *
> +	 * So the cfs_rq contains only one entity, hence vruntime of
> +	 * the entity @v should always equal to the cfs_rq's weighted
> +	 * average vruntime @V, which means we will always re-weight
> +	 * at 0-lag point, thus breach assumption. Proof completed.
> +	 *
> +	 *
> +	 * COROLLARY #2: Re-weight does NOT affect weighted average
> +	 * vruntime of all the entities.
> +	 *
> +	 * Proof: According to corollary #1, Eq. (1) should be:
> +	 *
> +	 *	(V - v)*w = (V' - v')*w'
> +	 *	==>    v' = V' - (V - v)*w/w'		(4)
> +	 *
> +	 * According to the weighted average formula, we have:
> +	 *
> +	 *	V' = (WV - wv + w'v') / (W - w + w')
> +	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
> +	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
> +	 *	   = (WV + w'V' - Vw) / (W - w + w')
> +	 *
> +	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
> +	 *	==>	V' * (W - w) = (W - w) * V	(5)
> +	 *
> +	 * If the entity is the only one in the cfs_rq, then reweight
> +	 * always occurs at 0-lag point, so V won't change. Or else
> +	 * there are other entities, hence W != w, then Eq. (5) turns
> +	 * into V' = V. So V won't change in either case, proof done.
> +	 *
> +	 *
> +	 * So according to corollary #1 & #2, the effect of re-weight
> +	 * on vruntime should be:
> +	 *
> +	 *	v' = V' - (V - v) * w / w'		(4)
> +	 *	   = V  - (V - v) * w / w'
> +	 *	   = V  - vl * w / w'
> +	 *	   = V  - vl'
> +	 */
> +	if (avruntime != se->vruntime) {
> +		vlag = (s64)(avruntime - se->vruntime);
> +		vlag = div_s64(vlag * old_weight, weight);
> +		se->vruntime = avruntime - vlag;
> +	}
> +
> +	/*
> +	 * DEADLINE
> +	 * ========
> +	 *
> +	 * When the weight changes, the virtual time slope changes and
> +	 * we should adjust the relative virtual deadline accordingly.
> +	 *
> +	 *	d' = v' + (d - v)*w/w'
> +	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
> +	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
> +	 *	   = V  + (d - V)*w/w'
> +	 */
> +	vslice = (s64)(se->deadline - avruntime);
> +	vslice = div_s64(vslice * old_weight, weight);
> +	se->deadline = avruntime + vslice;
> +}
> +
>   static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>   			    unsigned long weight)
>   {
> -	unsigned long old_weight = se->load.weight;
> +	bool curr = cfs_rq->curr == se;
>   
>   	if (se->on_rq) {
>   		/* commit outstanding execution time */
> -		if (cfs_rq->curr == se)
> +		if (curr)
>   			update_curr(cfs_rq);
>   		else
> -			avg_vruntime_sub(cfs_rq, se);
> +			__dequeue_entity(cfs_rq, se);
>   		update_load_sub(&cfs_rq->load, se->load.weight);
>   	}
>   	dequeue_load_avg(cfs_rq, se);
>   
> -	update_load_set(&se->load, weight);
> -
>   	if (!se->on_rq) {
>   		/*
>   		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
>   		 * we need to scale se->vlag when w_i changes.
>   		 */
> -		se->vlag = div_s64(se->vlag * old_weight, weight);
> +		se->vlag = div_s64(se->vlag * se->load.weight, weight);
>   	} else {
> -		s64 deadline = se->deadline - se->vruntime;
> -		/*
> -		 * When the weight changes, the virtual time slope changes and
> -		 * we should adjust the relative virtual deadline accordingly.
> -		 */
> -		deadline = div_s64(deadline * old_weight, weight);
> -		se->deadline = se->vruntime + deadline;
> -		if (se != cfs_rq->curr)
> -			min_deadline_cb_propagate(&se->run_node, NULL);
> +		reweight_eevdf(cfs_rq, se, weight);
>   	}
>   
> +	update_load_set(&se->load, weight);
> +
>   #ifdef CONFIG_SMP
>   	do {
>   		u32 divider = get_pelt_divider(&se->avg);
> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>   	enqueue_load_avg(cfs_rq, se);
>   	if (se->on_rq) {
>   		update_load_add(&cfs_rq->load, se->load.weight);
> -		if (cfs_rq->curr != se)
> -			avg_vruntime_add(cfs_rq, se);
> +		if (!curr) {
> +			/*
> +			 * The entity's vruntime has been adjusted, so let's check
> +			 * whether the rq-wide min_vruntime needs updated too. Since
> +			 * the calculations above require stable min_vruntime rather
> +			 * than up-to-date one, we do the update at the end of the
> +			 * reweight process.
> +			 */
> +			__enqueue_entity(cfs_rq, se);
> +			update_min_vruntime(cfs_rq);
> +		}
>   	}
>   }
Sorry if I am asking stupid question...... It looks like 
reweight_entity() may have chance to change the weight of cfs_rq->curr 
entity, but we'll never update_min_vruntime() when reweighting it. Is 
there any reason that we can skip the update_min_vruntime() for this case?
>   
> @@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
>   
>   #ifndef CONFIG_SMP
>   	shares = READ_ONCE(gcfs_rq->tg->shares);
> -
> -	if (likely(se->load.weight == shares))
> -		return;
>   #else
> -	shares   = calc_group_shares(gcfs_rq);
> +	shares = calc_group_shares(gcfs_rq);
>   #endif
> -
> -	reweight_entity(cfs_rq_of(se), se, shares);
> +	if (unlikely(se->load.weight != shares))
> +		reweight_entity(cfs_rq_of(se), se, shares);
>   }
>   
>   #else /* CONFIG_FAIR_GROUP_SCHED */

Thanks,
Yiwei Lin

Re: Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 2 years, 2 months ago

On 11/15/23 11:36 PM, Yiwei Lin Wrote:
> 
>> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>>       enqueue_load_avg(cfs_rq, se);
>>       if (se->on_rq) {
>>           update_load_add(&cfs_rq->load, se->load.weight);
>> -        if (cfs_rq->curr != se)
>> -            avg_vruntime_add(cfs_rq, se);
>> +        if (!curr) {
>> +            /*
>> +             * The entity's vruntime has been adjusted, so let's check
>> +             * whether the rq-wide min_vruntime needs updated too. Since
>> +             * the calculations above require stable min_vruntime rather
>> +             * than up-to-date one, we do the update at the end of the
>> +             * reweight process.
>> +             */
>> +            __enqueue_entity(cfs_rq, se);
>> +            update_min_vruntime(cfs_rq);
>> +        }
>>       }
>>   }
> Sorry if I am asking stupid question...... It looks like reweight_entity() may have chance to change the weight of cfs_rq->curr entity, but we'll never update_min_vruntime() when reweighting it. Is there any reason that we can skip the update_min_vruntime() for this case?

No, you are right!

Thanks!
	Abel

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 2 years, 2 months ago

On 11/16/23 12:48 PM, Abel Wu Wrote:
> On 11/15/23 11:36 PM, Yiwei Lin Wrote:
>>
>>> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>>>       enqueue_load_avg(cfs_rq, se);
>>>       if (se->on_rq) {
>>>           update_load_add(&cfs_rq->load, se->load.weight);
>>> -        if (cfs_rq->curr != se)
>>> -            avg_vruntime_add(cfs_rq, se);
>>> +        if (!curr) {
>>> +            /*
>>> +             * The entity's vruntime has been adjusted, so let's check
>>> +             * whether the rq-wide min_vruntime needs updated too. Since
>>> +             * the calculations above require stable min_vruntime rather
>>> +             * than up-to-date one, we do the update at the end of the
>>> +             * reweight process.
>>> +             */
>>> +            __enqueue_entity(cfs_rq, se);
>>> +            update_min_vruntime(cfs_rq);
>>> +        }
>>>       }
>>>   }
>> Sorry if I am asking stupid question...... It looks like reweight_entity() may have chance to change the weight of cfs_rq->curr entity, but we'll never update_min_vruntime() when reweighting it. Is there any reason that we can skip the update_min_vruntime() for this case?
> 
> No, you are right!

I was intended to update_min_vruntime() if se->on_rq and no matter
it is curr or not, just as you suggested. But after a second thought
I wonder if it is necessary to update *NOW*, since we will always
update_curr() before making any change to cfs_rq. Thoughts?

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Yiwei Lin 2 years, 2 months ago

On 11/16/23 13:07, Abel Wu wrote:
> On 11/16/23 12:48 PM, Abel Wu Wrote:
>> On 11/15/23 11:36 PM, Yiwei Lin Wrote:
>>>
>>>> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq 
>>>> *cfs_rq, struct sched_entity *se,
>>>>       enqueue_load_avg(cfs_rq, se);
>>>>       if (se->on_rq) {
>>>>           update_load_add(&cfs_rq->load, se->load.weight);
>>>> -        if (cfs_rq->curr != se)
>>>> -            avg_vruntime_add(cfs_rq, se);
>>>> +        if (!curr) {
>>>> +            /*
>>>> +             * The entity's vruntime has been adjusted, so let's 
>>>> check
>>>> +             * whether the rq-wide min_vruntime needs updated too. 
>>>> Since
>>>> +             * the calculations above require stable min_vruntime 
>>>> rather
>>>> +             * than up-to-date one, we do the update at the end of 
>>>> the
>>>> +             * reweight process.
>>>> +             */
>>>> +            __enqueue_entity(cfs_rq, se);
>>>> +            update_min_vruntime(cfs_rq);
>>>> +        }
>>>>       }
>>>>   }
>>> Sorry if I am asking stupid question...... It looks like 
>>> reweight_entity() may have chance to change the weight of 
>>> cfs_rq->curr entity, but we'll never update_min_vruntime() when 
>>> reweighting it. Is there any reason that we can skip the 
>>> update_min_vruntime() for this case?
>>
>> No, you are right!
>
> I was intended to update_min_vruntime() if se->on_rq and no matter
> it is curr or not, just as you suggested. But after a second thought
> I wonder if it is necessary to update *NOW*, since we will always
> update_curr() before making any change to cfs_rq. Thoughts?
I lost the fact that we'll update_min_vruntime() every time we 
update_curr(). Because of this fact, we can indeed wait until we need 
the correct min_vruntime and update_min_vruntime() then. The only 
consideration that I came up with is that the sched_debug may not be 
able to reflect the accurate min_vruntime in time. But this may not be a 
big problem.

Further, I have another advanced thought we can remove the 
update_min_vruntime() here in the reweight_entity() directly to save 
more time. The reason that I think this is because min_vruntime is not 
for normalization of vruntime as before which is required on CFS, so we 
will always update_curr() for the latest min_vruntime before using it. 
Also, the update_min_vruntime() in dequeue_entity() may also be removed 
as the reason, i.e. just do update_min_vruntime() in update_curr() to 
simplify. What do you think?

Thanks,
Yiwei Lin

Re: Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 2 years, 2 months ago

On 11/16/23 2:51 PM, Yiwei Lin Wrote:
> 
> On 11/16/23 13:07, Abel Wu wrote:
>> On 11/16/23 12:48 PM, Abel Wu Wrote:
>>> On 11/15/23 11:36 PM, Yiwei Lin Wrote:
>>>>
>>>>> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>>>>>       enqueue_load_avg(cfs_rq, se);
>>>>>       if (se->on_rq) {
>>>>>           update_load_add(&cfs_rq->load, se->load.weight);
>>>>> -        if (cfs_rq->curr != se)
>>>>> -            avg_vruntime_add(cfs_rq, se);
>>>>> +        if (!curr) {
>>>>> +            /*
>>>>> +             * The entity's vruntime has been adjusted, so let's check
>>>>> +             * whether the rq-wide min_vruntime needs updated too. Since
>>>>> +             * the calculations above require stable min_vruntime rather
>>>>> +             * than up-to-date one, we do the update at the end of the
>>>>> +             * reweight process.
>>>>> +             */
>>>>> +            __enqueue_entity(cfs_rq, se);
>>>>> +            update_min_vruntime(cfs_rq);
>>>>> +        }
>>>>>       }
>>>>>   }
>>>> Sorry if I am asking stupid question...... It looks like reweight_entity() may have chance to change the weight of cfs_rq->curr entity, but we'll never update_min_vruntime() when reweighting it. Is there any reason that we can skip the update_min_vruntime() for this case?
>>>
>>> No, you are right!
>>
>> I was intended to update_min_vruntime() if se->on_rq and no matter
>> it is curr or not, just as you suggested. But after a second thought
>> I wonder if it is necessary to update *NOW*, since we will always
>> update_curr() before making any change to cfs_rq. Thoughts?
> I lost the fact that we'll update_min_vruntime() every time we update_curr(). Because of this fact, we can indeed wait until we need the correct min_vruntime and update_min_vruntime() then. The only consideration that I came up with is that the sched_debug may not be able to reflect the accurate min_vruntime in time. But this may not be a big problem.
> 
> Further, I have another advanced thought we can remove the update_min_vruntime() here in the reweight_entity() directly to save more time. The reason that I think this is because min_vruntime is not for normalization of vruntime as before which is required on CFS, so we will always update_curr() for the latest min_vruntime before using it. Also, the update_min_vruntime() in dequeue_entity() may also be removed as the reason, i.e. just do update_min_vruntime() in update_curr() to simplify. What do you think?

Yes, this is also exactly what I am thinking about. As task placement
now adopts lag-based solution which is irrespective of min_vruntime,
and also based on the fact that it is only used as a base offset for
calculating avg_vruntime (in order to avoid overflow), we probably
can update it in a more relaxed way e.g. in ticks. If relaxed update
works, there seems still work to be done first:

   1) the priority of core pick when core scheduling needs to change
      to deadline-based solution;
   2) need to make sure not overflow in NOHZ_FULL mode

Just some first thoughts come into my mind :)

Thanks,
	Abel

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Yiwei Lin 2 years, 2 months ago

On 11/16/23 12:48, Abel Wu wrote:
> On 11/15/23 11:36 PM, Yiwei Lin Wrote:
>>
>>> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq 
>>> *cfs_rq, struct sched_entity *se,
>>>       enqueue_load_avg(cfs_rq, se);
>>>       if (se->on_rq) {
>>>           update_load_add(&cfs_rq->load, se->load.weight);
>>> -        if (cfs_rq->curr != se)
>>> -            avg_vruntime_add(cfs_rq, se);
>>> +        if (!curr) {
>>> +            /*
>>> +             * The entity's vruntime has been adjusted, so let's check
>>> +             * whether the rq-wide min_vruntime needs updated too. 
>>> Since
>>> +             * the calculations above require stable min_vruntime 
>>> rather
>>> +             * than up-to-date one, we do the update at the end of the
>>> +             * reweight process.
>>> +             */
>>> +            __enqueue_entity(cfs_rq, se);
>>> +            update_min_vruntime(cfs_rq);
>>> +        }
>>>       }
>>>   }
>> Sorry if I am asking stupid question...... It looks like 
>> reweight_entity() may have chance to change the weight of 
>> cfs_rq->curr entity, but we'll never update_min_vruntime() when 
>> reweighting it. Is there any reason that we can skip the 
>> update_min_vruntime() for this case?
>
> No, you are right!
>
> Thanks!
>     Abel
Thank you! I'll take the responsibility to fix this.

Yiwei Lin

Re: Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Abel Wu 2 years, 2 months ago

On 11/16/23 12:59 PM, Yiwei Lin Wrote:
> 
> On 11/16/23 12:48, Abel Wu wrote:
>> On 11/15/23 11:36 PM, Yiwei Lin Wrote:
>>>
>>>> @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>>>>       enqueue_load_avg(cfs_rq, se);
>>>>       if (se->on_rq) {
>>>>           update_load_add(&cfs_rq->load, se->load.weight);
>>>> -        if (cfs_rq->curr != se)
>>>> -            avg_vruntime_add(cfs_rq, se);
>>>> +        if (!curr) {
>>>> +            /*
>>>> +             * The entity's vruntime has been adjusted, so let's check
>>>> +             * whether the rq-wide min_vruntime needs updated too. Since
>>>> +             * the calculations above require stable min_vruntime rather
>>>> +             * than up-to-date one, we do the update at the end of the
>>>> +             * reweight process.
>>>> +             */
>>>> +            __enqueue_entity(cfs_rq, se);
>>>> +            update_min_vruntime(cfs_rq);
>>>> +        }
>>>>       }
>>>>   }
>>> Sorry if I am asking stupid question...... It looks like reweight_entity() may have chance to change the weight of cfs_rq->curr entity, but we'll never update_min_vruntime() when reweighting it. Is there any reason that we can skip the update_min_vruntime() for this case?
>>
>> No, you are right!
>>
>> Thanks!
>>     Abel
> Thank you! I'll take the responsibility to fix this.

That would be appreciated. I suggest fixing this before we
do further optimizations.

Thanks,
	Abel

Re: [PATCH 1/4] sched/eevdf: Fix vruntime adjustment on reweight

Posted by Peter Zijlstra 2 years, 3 months ago

On Tue, Nov 07, 2023 at 05:05:07PM +0800, Abel Wu wrote:
> vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
> it gets re-weighted, and the calculations can be simplified based
> on the fact that re-weight won't change the w-average of all the
> entities. Please check the proofs in comments.
> 
> But adjusting vruntime can also cause position change in RB-tree
> hence require re-queue to fix up which might be costly. This might
> be avoided by deferring adjustment to the time the entity actually
> leaves tree (dequeue/pick), but that will negatively affect task
> selection and probably not good enough either.
> 
> Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
> Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>

Very good, thanks!

It's a bit sad we have to muck about with the tree now, but alas.

If only Google and FB could agree on what to do with this cgroup
nonsense, then maybe we could get rid of all this.

[tip: sched/urgent] sched/eevdf: Fix vruntime adjustment on reweight

Posted by tip-bot2 for Abel Wu 2 years, 2 months ago

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     eab03c23c2a162085b13200d7942fc5a00b5ccc8
Gitweb:        https://git.kernel.org/tip/eab03c23c2a162085b13200d7942fc5a00b5ccc8
Author:        Abel Wu <wuyun.abel@bytedance.com>
AuthorDate:    Tue, 07 Nov 2023 17:05:07 +08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 14 Nov 2023 22:27:00 +01:00

sched/eevdf: Fix vruntime adjustment on reweight

vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
it gets re-weighted, and the calculations can be simplified based
on the fact that re-weight won't change the w-average of all the
entities. Please check the proofs in comments.

But adjusting vruntime can also cause position change in RB-tree
hence require re-queue to fix up which might be costly. This might
be avoided by deferring adjustment to the time the entity actually
leaves tree (dequeue/pick), but that will negatively affect task
selection and probably not good enough either.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com
---
 kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 128 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2048138..025d909 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3666,41 +3666,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			   unsigned long weight)
+{
+	unsigned long old_weight = se->load.weight;
+	u64 avruntime = avg_vruntime(cfs_rq);
+	s64 vlag, vslice;
+
+	/*
+	 * VRUNTIME
+	 * ========
+	 *
+	 * COROLLARY #1: The virtual runtime of the entity needs to be
+	 * adjusted if re-weight at !0-lag point.
+	 *
+	 * Proof: For contradiction assume this is not true, so we can
+	 * re-weight without changing vruntime at !0-lag point.
+	 *
+	 *             Weight	VRuntime   Avg-VRuntime
+	 *     before    w          v            V
+	 *      after    w'         v'           V'
+	 *
+	 * Since lag needs to be preserved through re-weight:
+	 *
+	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
+	 *	==>	V' = (V - v)*w/w' + v		(1)
+	 *
+	 * Let W be the total weight of the entities before reweight,
+	 * since V' is the new weighted average of entities:
+	 *
+	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
+	 *
+	 * by using (1) & (2) we obtain:
+	 *
+	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+	 *
+	 * Since we are doing at !0-lag point which means V != v, we
+	 * can simplify (3):
+	 *
+	 *	==>	W / (W + w' - w) = w / w'
+	 *	==>	Ww' = Ww + ww' - ww
+	 *	==>	W * (w' - w) = w * (w' - w)
+	 *	==>	W = w	(re-weight indicates w' != w)
+	 *
+	 * So the cfs_rq contains only one entity, hence vruntime of
+	 * the entity @v should always equal to the cfs_rq's weighted
+	 * average vruntime @V, which means we will always re-weight
+	 * at 0-lag point, thus breach assumption. Proof completed.
+	 *
+	 *
+	 * COROLLARY #2: Re-weight does NOT affect weighted average
+	 * vruntime of all the entities.
+	 *
+	 * Proof: According to corollary #1, Eq. (1) should be:
+	 *
+	 *	(V - v)*w = (V' - v')*w'
+	 *	==>    v' = V' - (V - v)*w/w'		(4)
+	 *
+	 * According to the weighted average formula, we have:
+	 *
+	 *	V' = (WV - wv + w'v') / (W - w + w')
+	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+	 *	   = (WV + w'V' - Vw) / (W - w + w')
+	 *
+	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
+	 *	==>	V' * (W - w) = (W - w) * V	(5)
+	 *
+	 * If the entity is the only one in the cfs_rq, then reweight
+	 * always occurs at 0-lag point, so V won't change. Or else
+	 * there are other entities, hence W != w, then Eq. (5) turns
+	 * into V' = V. So V won't change in either case, proof done.
+	 *
+	 *
+	 * So according to corollary #1 & #2, the effect of re-weight
+	 * on vruntime should be:
+	 *
+	 *	v' = V' - (V - v) * w / w'		(4)
+	 *	   = V  - (V - v) * w / w'
+	 *	   = V  - vl * w / w'
+	 *	   = V  - vl'
+	 */
+	if (avruntime != se->vruntime) {
+		vlag = (s64)(avruntime - se->vruntime);
+		vlag = div_s64(vlag * old_weight, weight);
+		se->vruntime = avruntime - vlag;
+	}
+
+	/*
+	 * DEADLINE
+	 * ========
+	 *
+	 * When the weight changes, the virtual time slope changes and
+	 * we should adjust the relative virtual deadline accordingly.
+	 *
+	 *	d' = v' + (d - v)*w/w'
+	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  + (d - V)*w/w'
+	 */
+	vslice = (s64)(se->deadline - avruntime);
+	vslice = div_s64(vslice * old_weight, weight);
+	se->deadline = avruntime + vslice;
+}
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
-	unsigned long old_weight = se->load.weight;
+	bool curr = cfs_rq->curr == se;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
-		if (cfs_rq->curr == se)
+		if (curr)
 			update_curr(cfs_rq);
 		else
-			avg_vruntime_sub(cfs_rq, se);
+			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	update_load_set(&se->load, weight);
-
 	if (!se->on_rq) {
 		/*
 		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
 		 * we need to scale se->vlag when w_i changes.
 		 */
-		se->vlag = div_s64(se->vlag * old_weight, weight);
+		se->vlag = div_s64(se->vlag * se->load.weight, weight);
 	} else {
-		s64 deadline = se->deadline - se->vruntime;
-		/*
-		 * When the weight changes, the virtual time slope changes and
-		 * we should adjust the relative virtual deadline accordingly.
-		 */
-		deadline = div_s64(deadline * old_weight, weight);
-		se->deadline = se->vruntime + deadline;
-		if (se != cfs_rq->curr)
-			min_deadline_cb_propagate(&se->run_node, NULL);
+		reweight_eevdf(cfs_rq, se, weight);
 	}
 
+	update_load_set(&se->load, weight);
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-		if (cfs_rq->curr != se)
-			avg_vruntime_add(cfs_rq, se);
+		if (!curr) {
+			/*
+			 * The entity's vruntime has been adjusted, so let's check
+			 * whether the rq-wide min_vruntime needs updated too. Since
+			 * the calculations above require stable min_vruntime rather
+			 * than up-to-date one, we do the update at the end of the
+			 * reweight process.
+			 */
+			__enqueue_entity(cfs_rq, se);
+			update_min_vruntime(cfs_rq);
+		}
 	}
 }
 
@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
 
 #ifndef CONFIG_SMP
 	shares = READ_ONCE(gcfs_rq->tg->shares);
-
-	if (likely(se->load.weight == shares))
-		return;
 #else
-	shares   = calc_group_shares(gcfs_rq);
+	shares = calc_group_shares(gcfs_rq);
 #endif
-
-	reweight_entity(cfs_rq_of(se), se, shares);
+	if (unlikely(se->load.weight != shares))
+		reweight_entity(cfs_rq_of(se), se, shares);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */