kernel/sched/fair.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-)
When checking whether a shorter-slice entity can preempt curr, we compare it
with the next entity to be picked. However, delayed-dequeue entities can skew
this decision, even though they will be dequeued when picking the next entity.
Dequeue them while checking for preemption, as they will be dequeued anyway
when picking the next entity.
                         | tip/sched/core | tip/sched/core | +this patch
cyclictest slice (ms)    | (default) 2.8  | 8              | 8
hackbench slice (ms)     | (default) 2.8  | 20             | 20
Total Samples            | 22559          | 22595          | 22683
Average (us)             | 157            | 64 (59%)       | 59 (8%)
Median (P50) (us)        | 57             | 57 (0%)        | 58 (-2%)
90th Percentile (us)     | 64             | 60 (6%)        | 60 (0%)
99th Percentile (us)     | 2407           | 67 (97%)       | 67 (0%)
99.9th Percentile (us)   | 3400           | 2288 (33%)     | 727 (68%)
Maximum (us)             | 5037           | 9252 (-84%)    | 7461 (19%)
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
v2 changes:
- use pick_next_entity() instead of duplicating the code in
wakeup_preempt_fair()
kernel/sched/fair.c | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f179faf7a6a1..d99e56b6dcc9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1099,7 +1099,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1170,11 +1170,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
return best;
}
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
- return __pick_eevdf(cfs_rq, true);
-}
-
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5749,11 +5744,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
{
struct sched_entity *se;
- se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq, protect);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -9015,7 +9010,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
- struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
@@ -9126,11 +9121,17 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
}
pick:
+ nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+ /* If @p has become the most eligible task, force preemption */
+ if (nse == pse)
+ goto preempt;
+
/*
- * If @p has become the most eligible task, force preemption.
+ * Because p is enqueued, nse being null can only mean that we
+ * dequeued a delayed task.
*/
- if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
- goto preempt;
+ if (!nse)
+ goto pick;
if (sched_feat(RUN_TO_PARITY))
update_protect_slice(cfs_rq, se);
@@ -9167,7 +9168,7 @@ static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
throttled |= check_cfs_rq_runtime(cfs_rq);
- se = pick_next_entity(rq, cfs_rq);
+ se = pick_next_entity(rq, cfs_rq, true);
if (!se)
goto again;
cfs_rq = group_cfs_rq(se);
--
2.43.0
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: ac8e69e693631689d74d8f1ebee6f84f737f797f
Gitweb: https://git.kernel.org/tip/ac8e69e693631689d74d8f1ebee6f84f737f797f
Author: Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate: Wed, 22 Apr 2026 11:34:00 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 28 Apr 2026 09:19:54 +02:00
sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue
Similar to how pick_next_entity() must dequeue delayed entities, so too must
wakeup_preempt_fair(). Any delayed task being found means it is eligible and
hence past the 0-lag point, ready for removal.
Worse, by not removing delayed entities from consideration, it can skew the
preemption decision, with the end result that a short slice wakeup will not
result in a preemption.
                         | tip/sched/core | tip/sched/core | +this patch
cyclictest slice (ms)    | (default) 2.8  | 8              | 8
hackbench slice (ms)     | (default) 2.8  | 20             | 20
Total Samples            | 22559          | 22595          | 22683
Average (us)             | 157            | 64 (59%)       | 59 (8%)
Median (P50) (us)        | 57             | 57 (0%)        | 58 (-2%)
90th Percentile (us)     | 64             | 60 (6%)        | 60 (0%)
99th Percentile (us)     | 2407           | 67 (97%)       | 67 (0%)
99.9th Percentile (us)   | 3400           | 2288 (33%)     | 727 (68%)
Maximum (us)             | 5037           | 9252 (-84%)    | 7461 (19%)
Fixes: f12e148892ed ("sched/fair: Prepare pick_next_task() for delayed dequeue")
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260422093400.319251-1-vincent.guittot@linaro.org
---
kernel/sched/fair.c | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 615861d..7289658 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1104,7 +1104,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1175,11 +1175,6 @@ found:
return best;
}
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
- return __pick_eevdf(cfs_rq, true);
-}
-
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5754,11 +5749,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
{
struct sched_entity *se;
- se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq, protect);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -9032,7 +9027,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
- struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
@@ -9143,11 +9138,17 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
}
pick:
+ nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+ /* If @p has become the most eligible task, force preemption */
+ if (nse == pse)
+ goto preempt;
+
/*
- * If @p has become the most eligible task, force preemption.
+ * Because p is enqueued, nse being null can only mean that we
+ * dequeued a delayed task.
*/
- if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
- goto preempt;
+ if (!nse)
+ goto pick;
if (sched_feat(RUN_TO_PARITY))
update_protect_slice(cfs_rq, se);
@@ -9184,7 +9185,7 @@ again:
throttled |= check_cfs_rq_runtime(cfs_rq);
- se = pick_next_entity(rq, cfs_rq);
+ se = pick_next_entity(rq, cfs_rq, true);
if (!se)
goto again;
cfs_rq = group_cfs_rq(se);
© 2016 - 2026 Red Hat, Inc.