kernel/sched/fair.c | 188 +++++++++++++++++++++---------------------- 1 file changed, 92 insertions(+), 96 deletions(-)
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 6d2051403d6c93832d3058a1b275c6aef2c97f44
Gitweb: https://git.kernel.org/tip/6d2051403d6c93832d3058a1b275c6aef2c97f44
Author: Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate: Mon, 18 May 2026 12:23:45 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 19 May 2026 12:17:36 +02:00
sched/fair: Update util_est after updating util_avg during dequeue
util_est_update() must be called after util_avg has been updated during the
dequeue of a task, and only when that dequeue is not the completion of a
delayed dequeue (DEQUEUE_DELAYED).
Move the call to util_est_update() into update_load_avg(), gated by a new
UPDATE_UTIL_EST action flag.
Fixes: b55945c500c5 ("sched: Fix pick_next_task_fair() vs try_to_wake_up() race")
Closes: https://lore.kernel.org/all/20260512124653.305275-1-qyousef@layalina.io/
Reported-by: Qais Yousef <qyousef@layalina.io>
Reviewed-and-tested-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260518102345.268452-1-vincent.guittot@linaro.org
---
kernel/sched/fair.c | 188 +++++++++++++++++++++----------------------
1 file changed, 92 insertions(+), 96 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7289658..09d3acd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4930,13 +4930,86 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
trace_pelt_cfs_tp(cfs_rq);
}
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
+static inline void util_est_update(struct sched_entity *se)
+{
+ unsigned int ewma, dequeued, last_ewma_diff;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(se->avg.util_est);
+
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ if (ewma & UTIL_AVG_UNCHANGED)
+ return;
+
+ /* Get utilization at dequeue */
+ dequeued = READ_ONCE(se->avg.util_avg);
+
+ /*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
+ }
+
+ /*
+ * Skip update of task's estimated utilization when its members are
+ * already ~1% close to its last activation value.
+ */
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
+
+ /*
+ * To avoid underestimate of task utilization, skip updates of EWMA if
+ * we cannot grant that thread got all CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < READ_ONCE(se->avg.runnable_avg))
+ goto done;
+
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+ */
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(se->avg.util_est, ewma);
+
+ trace_sched_util_est_se_tp(se);
+}
+
/*
* Optional action to be done while updating the load average
*/
-#define UPDATE_TG 0x1
-#define SKIP_AGE_LOAD 0x2
-#define DO_ATTACH 0x4
-#define DO_DETACH 0x8
+#define UPDATE_TG 0x01
+#define SKIP_AGE_LOAD 0x02
+#define DO_ATTACH 0x04
+#define DO_DETACH 0x08
+#define UPDATE_UTIL_EST 0x10
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4979,6 +5052,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq);
}
+
+ if (flags & UPDATE_UTIL_EST)
+ util_est_update(se);
}
/*
@@ -5037,11 +5113,6 @@ static inline unsigned long task_util(struct task_struct *p)
return READ_ONCE(p->se.avg.util_avg);
}
-static inline unsigned long task_runnable(struct task_struct *p)
-{
- return READ_ONCE(p->se.avg.runnable_avg);
-}
-
static inline unsigned long _task_util_est(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
@@ -5084,88 +5155,6 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}
-#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-
-static inline void util_est_update(struct cfs_rq *cfs_rq,
- struct task_struct *p,
- bool task_sleep)
-{
- unsigned int ewma, dequeued, last_ewma_diff;
-
- if (!sched_feat(UTIL_EST))
- return;
-
- /*
- * Skip update of task's estimated utilization when the task has not
- * yet completed an activation, e.g. being migrated.
- */
- if (!task_sleep)
- return;
-
- /* Get current estimate of utilization */
- ewma = READ_ONCE(p->se.avg.util_est);
-
- /*
- * If the PELT values haven't changed since enqueue time,
- * skip the util_est update.
- */
- if (ewma & UTIL_AVG_UNCHANGED)
- return;
-
- /* Get utilization at dequeue */
- dequeued = task_util(p);
-
- /*
- * Reset EWMA on utilization increases, the moving average is used only
- * to smooth utilization decreases.
- */
- if (ewma <= dequeued) {
- ewma = dequeued;
- goto done;
- }
-
- /*
- * Skip update of task's estimated utilization when its members are
- * already ~1% close to its last activation value.
- */
- last_ewma_diff = ewma - dequeued;
- if (last_ewma_diff < UTIL_EST_MARGIN)
- goto done;
-
- /*
- * To avoid underestimate of task utilization, skip updates of EWMA if
- * we cannot grant that thread got all CPU time it wanted.
- */
- if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
- goto done;
-
-
- /*
- * Update Task's estimated utilization
- *
- * When *p completes an activation we can consolidate another sample
- * of the task size. This is done by using this value to update the
- * Exponential Weighted Moving Average (EWMA):
- *
- * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
- * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
- * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( -last_ewma_diff ) + ewma(t-1)
- * = w * (-last_ewma_diff + ewma(t-1) / w)
- *
- * Where 'w' is the weight of new samples, which is configured to be
- * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
- */
- ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ewma -= last_ewma_diff;
- ewma >>= UTIL_EST_WEIGHT_SHIFT;
-done:
- ewma |= UTIL_AVG_UNCHANGED;
- WRITE_ONCE(p->se.avg.util_est, ewma);
-
- trace_sched_util_est_se_tp(&p->se);
-}
-
static inline unsigned long get_actual_cpu_capacity(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
@@ -5618,7 +5607,7 @@ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool sleep = flags & DEQUEUE_SLEEP;
- int action = UPDATE_TG;
+ int action = 0;
update_curr(cfs_rq);
clear_buddies(cfs_rq, se);
@@ -5638,15 +5627,23 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
- update_load_avg(cfs_rq, se, 0);
+ if (entity_is_task(se))
+ action |= UPDATE_UTIL_EST;
+ update_load_avg(cfs_rq, se, action);
update_entity_lag(cfs_rq, se);
set_delayed(se);
return false;
}
}
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
- action |= DO_DETACH;
+ action = UPDATE_TG;
+ if (entity_is_task(se)) {
+ if (task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
+ if (sleep && !(flags & DEQUEUE_DELAYED))
+ action |= UPDATE_UTIL_EST;
+ }
/*
* When dequeuing a sched_entity, we must:
@@ -7409,7 +7406,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
- util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
if (dequeue_entities(rq, &p->se, flags) < 0)
return false;
© 2016 - 2026 Red Hat, Inc.