[PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()

Peter Zijlstra posted 5 patches 2 months, 1 week ago
Posted by Peter Zijlstra 2 months, 1 week ago
Change sched_class::wakeup_preempt() to also get called for
cross-class wakeups, specifically those where the woken task is of a
higher class than the previous highest class.

In order to do this, track the current highest class of the runqueue
in rq::next_class and have wakeup_preempt() track this upwards for
each new wakeup. Additionally have set_next_task() re-set the value to
the current class.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c      |   32 +++++++++++++++++++++++---------
 kernel/sched/deadline.c  |   14 +++++++++-----
 kernel/sched/ext.c       |    9 ++++-----
 kernel/sched/fair.c      |   17 ++++++++++-------
 kernel/sched/idle.c      |    3 ---
 kernel/sched/rt.c        |    9 ++++++---
 kernel/sched/sched.h     |   26 ++------------------------
 kernel/sched/stop_task.c |    3 ---
 8 files changed, 54 insertions(+), 59 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct
 	 */
 	uclamp_rq_inc(rq, p, flags);
 
-	rq->queue_mask |= p->sched_class->queue_mask;
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq,
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
-	rq->queue_mask |= p->sched_class->queue_mask;
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
 {
 	struct task_struct *donor = rq->donor;
 
-	if (p->sched_class == donor->sched_class)
-		donor->sched_class->wakeup_preempt(rq, p, flags);
-	else if (sched_class_above(p->sched_class, donor->sched_class))
+	if (p->sched_class == rq->next_class) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
+
+	} else if (sched_class_above(p->sched_class, rq->next_class)) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
 		resched_curr(rq);
+		rq->next_class = p->sched_class;
+	}
 
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
@@ -6797,6 +6799,7 @@ static void __sched notrace __schedule(i
 pick_again:
 	next = pick_next_task(rq, rq->donor, &rf);
 	rq_set_donor(rq, next);
+	rq->next_class = next->sched_class;
 	if (unlikely(task_is_blocked(next))) {
 		next = find_proxy_task(rq, next, &rf);
 		if (!next)
@@ -8646,6 +8649,8 @@ void __init sched_init(void)
 		rq->rt.rt_runtime = global_rt_runtime();
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
+		rq->next_class = &idle_sched_class;
+
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10771,10 +10776,8 @@ struct sched_change_ctx *sched_change_be
 		flags |= DEQUEUE_NOCLOCK;
 	}
 
-	if (flags & DEQUEUE_CLASS) {
-		if (p->sched_class->switching_from)
-			p->sched_class->switching_from(rq, p);
-	}
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+		p->sched_class->switching_from(rq, p);
 
 	*ctx = (struct sched_change_ctx){
 		.p = p,
@@ -10827,6 +10830,17 @@ void sched_change_end(struct sched_chang
 			p->sched_class->switched_to(rq, p);
 
 		/*
+		 * If this was a class promotion; let the old class know it
+		 * got preempted. Note that none of the switch*_from() methods
+		 * know the new class and none of the switch*_to() methods
+		 * know the old class.
+		 */
+		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+			rq->next_class->wakeup_preempt(rq, p, 0);
+			rq->next_class = p->sched_class;
+		}
+
+		/*
 		 * If this was a degradation in class someone should have set
 		 * need_resched by now.
 		 */
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, str
  * Only called when both the current and waking task are -deadline
  * tasks.
  */
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
-				  int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
 {
+	/*
+	 * Can only get preempted by stop-class, and those should be
+	 * few and short lived, doesn't really make sense to push
+	 * anything away for that.
+	 */
+	if (p->sched_class != &dl_sched_class)
+		return;
+
 	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
 		resched_curr(rq);
 		return;
@@ -3304,9 +3311,6 @@ static int task_is_throttled_dl(struct t
 #endif
 
 DEFINE_SCHED_CLASS(dl) = {
-
-	.queue_mask		= 8,
-
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2338,12 +2338,12 @@ static struct task_struct *pick_task_scx
 	bool keep_prev, kick_idle = false;
 	struct task_struct *p;
 
-	rq_modified_clear(rq);
+	rq->next_class = &ext_sched_class;
 	rq_unpin_lock(rq, rf);
 	balance_one(rq, prev);
 	rq_repin_lock(rq, rf);
 	maybe_queue_balance_callback(rq);
-	if (rq_modified_above(rq, &ext_sched_class))
+	if (sched_class_above(rq->next_class, &ext_sched_class))
 		return RETRY_TASK;
 
 	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2967,7 +2967,8 @@ static void switched_from_scx(struct rq
 	scx_disable_task(p);
 }
 
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
 
 int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3216,8 +3217,6 @@ static void scx_cgroup_unlock(void) {}
  *   their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
-	.queue_mask		= 1,
-
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8697,7 +8697,7 @@ preempt_sync(struct rq *rq, int wake_fla
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
 	struct task_struct *donor = rq->donor;
@@ -8705,6 +8705,12 @@ static void check_preempt_wakeup_fair(st
 	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
 	int cse_is_idle, pse_is_idle;
 
+	/*
+	 * XXX Getting preempted by higher class, try and find idle CPU?
+	 */
+	if (p->sched_class != &fair_sched_class)
+		return;
+
 	if (unlikely(se == pse))
 		return;
 
@@ -12872,7 +12878,7 @@ static int sched_balance_newidle(struct
 	t0 = sched_clock_cpu(this_cpu);
 	__sched_balance_update_blocked_averages(this_rq);
 
-	rq_modified_clear(this_rq);
+	this_rq->next_class = &fair_sched_class;
 	raw_spin_rq_unlock(this_rq);
 
 	for_each_domain(this_cpu, sd) {
@@ -12939,7 +12945,7 @@ static int sched_balance_newidle(struct
 		pulled_task = 1;
 
 	/* If a higher prio class was modified, restart the pick */
-	if (rq_modified_above(this_rq, &fair_sched_class))
+	if (sched_class_above(this_rq->next_class, &fair_sched_class))
 		pulled_task = -1;
 
 out:
@@ -13837,15 +13843,12 @@ static unsigned int get_rr_interval_fair
  * All the scheduling class methods:
  */
 DEFINE_SCHED_CLASS(fair) = {
-
-	.queue_mask		= 2,
-
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.wakeup_preempt		= check_preempt_wakeup_fair,
+	.wakeup_preempt		= wakeup_preempt_fair,
 
 	.pick_task		= pick_task_fair,
 	.pick_next_task		= pick_next_task_fair,
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -534,9 +534,6 @@ static void update_curr_idle(struct rq *
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
 DEFINE_SCHED_CLASS(idle) = {
-
-	.queue_mask		= 0,
-
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq
 {
 	struct task_struct *donor = rq->donor;
 
+	/*
+	 * XXX If we're preempted by DL, queue a push?
+	 */
+	if (p->sched_class != &rt_sched_class)
+		return;
+
 	if (p->prio < donor->prio) {
 		resched_curr(rq);
 		return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct t
 #endif /* CONFIG_SCHED_CORE */
 
 DEFINE_SCHED_CLASS(rt) = {
-
-	.queue_mask		= 4,
-
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1119,7 +1119,6 @@ struct rq {
 	raw_spinlock_t		__lock;
 
 	/* Per class runqueue modification mask; bits in class order. */
-	unsigned int		queue_mask;
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -1179,6 +1178,7 @@ struct rq {
 	struct sched_dl_entity	*dl_server;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
+	const struct sched_class *next_class;
 	unsigned long		next_balance;
 	struct mm_struct	*prev_mm;
 
@@ -2426,15 +2426,6 @@ struct sched_class {
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
 #endif
-	/*
-	 * idle:  0
-	 * ext:   1
-	 * fair:  2
-	 * rt:    4
-	 * dl:    8
-	 * stop: 16
-	 */
-	unsigned int queue_mask;
 
 	/*
 	 * move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2593,20 +2584,6 @@ struct sched_class {
 #endif
 };
 
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
-	rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
-	unsigned int mask = class->queue_mask;
-	return rq->queue_mask & ~((mask << 1) - 1);
-}
-
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	WARN_ON_ONCE(rq->donor != prev);
@@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *
 	deactivate_task(src_rq, task, 0);
 	set_task_cpu(task, dst_rq->cpu);
 	activate_task(dst_rq, task, 0);
+	wakeup_preempt(dst_rq, task, 0);
 }
 
 static inline
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 DEFINE_SCHED_CLASS(stop) = {
-
-	.queue_mask		= 16,
-
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Kuba Piecuch 2 months, 1 week ago
Hi Peter,

On Thu Nov 27, 2025 at 3:39 PM UTC, Peter Zijlstra wrote:
> Additionally have set_next_task() re-set the value to the current class.

I don't see this part reflected in the patch. Is something missing?

Best,
Kuba
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Peter Zijlstra 2 months, 1 week ago
On Fri, Nov 28, 2025 at 01:26:30PM +0000, Kuba Piecuch wrote:
> Hi Peter,
> 
> On Thu Nov 27, 2025 at 3:39 PM UTC, Peter Zijlstra wrote:
> > Additionally have set_next_task() re-set the value to the current class.
> 
> I don't see this part reflected in the patch. Is something missing?

Hmm, that does appear to have gone walk-about :/
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Peter Zijlstra 2 months, 1 week ago
On Fri, Nov 28, 2025 at 02:36:38PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 28, 2025 at 01:26:30PM +0000, Kuba Piecuch wrote:
> > Hi Peter,
> > 
> > On Thu Nov 27, 2025 at 3:39 PM UTC, Peter Zijlstra wrote:
> > > Additionally have set_next_task() re-set the value to the current class.
> > 
> > I don't see this part reflected in the patch. Is something missing?
> 
> Hmm, that does appear to have gone walk-about :/

Aah, here:

@@ -6797,6 +6799,7 @@ static void __sched notrace __schedule(i
 pick_again:
        next = pick_next_task(rq, rq->donor, &rf);
        rq_set_donor(rq, next);
+       rq->next_class = next->sched_class;
        if (unlikely(task_is_blocked(next))) {
                next = find_proxy_task(rq, next, &rf);
                if (!next)

Will fix changelog. Had to do the above instead of set_next_task()
because of the proxy stuff.
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Tejun Heo 2 months, 1 week ago
Hello,

On Thu, Nov 27, 2025 at 04:39:48PM +0100, Peter Zijlstra wrote:
> @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
>  {
>  	struct task_struct *donor = rq->donor;
>  
> -	if (p->sched_class == donor->sched_class)
> -		donor->sched_class->wakeup_preempt(rq, p, flags);
> -	else if (sched_class_above(p->sched_class, donor->sched_class))
> +	if (p->sched_class == rq->next_class) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);
> +
> +	} else if (sched_class_above(p->sched_class, rq->next_class)) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);
>  		resched_curr(rq);
> +		rq->next_class = p->sched_class;
> +	}

I wonder whether this is a bit subtle. Wouldn't it be clearer to add a
separate method which takes an explicit next_class argument for the second
case?
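
Something like this, say (just a sketch, method name made up here):

	/* the old top class is told which class is preempting it */
	void (*class_preempt)(struct rq *rq, struct task_struct *p,
			      const struct sched_class *next_class);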

Thanks.

-- 
tejun
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Shrikanth Hegde 2 months, 1 week ago

On 11/27/25 9:09 PM, Peter Zijlstra wrote:
> Change sched_class::wakeup_preempt() to also get called for
> cross-class wakeups, specifically those where the woken task is of a
> higher class than the previous highest class.
> 
> In order to do this, track the current highest class of the runqueue
> in rq::next_class and have wakeup_preempt() track this upwards for
> each new wakeup. Additionally have set_next_task() re-set the value to
> the current class.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   kernel/sched/core.c      |   32 +++++++++++++++++++++++---------
>   kernel/sched/deadline.c  |   14 +++++++++-----
>   kernel/sched/ext.c       |    9 ++++-----
>   kernel/sched/fair.c      |   17 ++++++++++-------
>   kernel/sched/idle.c      |    3 ---
>   kernel/sched/rt.c        |    9 ++++++---
>   kernel/sched/sched.h     |   26 ++------------------------
>   kernel/sched/stop_task.c |    3 ---
>   8 files changed, 54 insertions(+), 59 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct
>   	 */
>   	uclamp_rq_inc(rq, p, flags);
>   
> -	rq->queue_mask |= p->sched_class->queue_mask;
>   	p->sched_class->enqueue_task(rq, p, flags);
>   
>   	psi_enqueue(p, flags);
> @@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq,
>   	 * and mark the task ->sched_delayed.
>   	 */
>   	uclamp_rq_dec(rq, p);
> -	rq->queue_mask |= p->sched_class->queue_mask;
>   	return p->sched_class->dequeue_task(rq, p, flags);
>   }
>   
> @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
>   {
>   	struct task_struct *donor = rq->donor;
>   
> -	if (p->sched_class == donor->sched_class)
> -		donor->sched_class->wakeup_preempt(rq, p, flags);
> -	else if (sched_class_above(p->sched_class, donor->sched_class))
> +	if (p->sched_class == rq->next_class) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);
> +
> +	} else if (sched_class_above(p->sched_class, rq->next_class)) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);

What's the logic of calling wakeup_preempt() here?

Say rq was running CFS and now RT is waking up, but the first thing we do is return if p is
not fair_sched_class. It is effectively resched_curr(), right?

>   		resched_curr(rq);
> +		rq->next_class = p->sched_class;

Since a resched will happen anyway and __schedule() can set next_class, is it necessary to
set it even earlier?

> +	}
>   
>   	/*
>   	 * A queue event has occurred, and we're going to schedule.  In
> @@ -6797,6 +6799,7 @@ static void __sched notrace __schedule(i
>   pick_again:
>   	next = pick_next_task(rq, rq->donor, &rf);
>   	rq_set_donor(rq, next);
> +	rq->next_class = next->sched_class;
>   	if (unlikely(task_is_blocked(next))) {
>   		next = find_proxy_task(rq, next, &rf);
>   		if (!next)
> @@ -8646,6 +8649,8 @@ void __init sched_init(void)
>   		rq->rt.rt_runtime = global_rt_runtime();
>   		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
>   #endif
> +		rq->next_class = &idle_sched_class;
> +
>   		rq->sd = NULL;
>   		rq->rd = NULL;
>   		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
> @@ -10771,10 +10776,8 @@ struct sched_change_ctx *sched_change_be
>   		flags |= DEQUEUE_NOCLOCK;
>   	}
>   
> -	if (flags & DEQUEUE_CLASS) {
> -		if (p->sched_class->switching_from)
> -			p->sched_class->switching_from(rq, p);
> -	}
> +	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
> +		p->sched_class->switching_from(rq, p);
>   
>   	*ctx = (struct sched_change_ctx){
>   		.p = p,
> @@ -10827,6 +10830,17 @@ void sched_change_end(struct sched_chang
>   			p->sched_class->switched_to(rq, p);
>   
>   		/*
> +		 * If this was a class promotion; let the old class know it
> +		 * got preempted. Note that none of the switch*_from() methods
> +		 * know the new class and none of the switch*_to() methods
> +		 * know the old class.
> +		 */
> +		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
> +			rq->next_class->wakeup_preempt(rq, p, 0);
> +			rq->next_class = p->sched_class;
> +		}
> +
> +		/*
>   		 * If this was a degradation in class someone should have set
>   		 * need_resched by now.
>   		 */
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, str
>    * Only called when both the current and waking task are -deadline
>    * tasks.
>    */
> -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
> -				  int flags)
> +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
>   {
> +	/*
> +	 * Can only get preempted by stop-class, and those should be
> +	 * few and short lived, doesn't really make sense to push
> +	 * anything away for that.
> +	 */
> +	if (p->sched_class != &dl_sched_class)
> +		return;
> +
>   	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
>   		resched_curr(rq);
>   		return;
> @@ -3304,9 +3311,6 @@ static int task_is_throttled_dl(struct t
>   #endif
>   
>   DEFINE_SCHED_CLASS(dl) = {
> -
> -	.queue_mask		= 8,
> -
>   	.enqueue_task		= enqueue_task_dl,
>   	.dequeue_task		= dequeue_task_dl,
>   	.yield_task		= yield_task_dl,
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -2338,12 +2338,12 @@ static struct task_struct *pick_task_scx
>   	bool keep_prev, kick_idle = false;
>   	struct task_struct *p;
>   
> -	rq_modified_clear(rq);
> +	rq->next_class = &ext_sched_class;
>   	rq_unpin_lock(rq, rf);
>   	balance_one(rq, prev);
>   	rq_repin_lock(rq, rf);
>   	maybe_queue_balance_callback(rq);
> -	if (rq_modified_above(rq, &ext_sched_class))
> +	if (sched_class_above(rq->next_class, &ext_sched_class))
>   		return RETRY_TASK;
>   
>   	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
> @@ -2967,7 +2967,8 @@ static void switched_from_scx(struct rq
>   	scx_disable_task(p);
>   }
>   
> -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
> +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
> +
>   static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
>   
>   int scx_check_setscheduler(struct task_struct *p, int policy)
> @@ -3216,8 +3217,6 @@ static void scx_cgroup_unlock(void) {}
>    *   their current sched_class. Call them directly from sched core instead.
>    */
>   DEFINE_SCHED_CLASS(ext) = {
> -	.queue_mask		= 1,
> -
>   	.enqueue_task		= enqueue_task_scx,
>   	.dequeue_task		= dequeue_task_scx,
>   	.yield_task		= yield_task_scx,
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8697,7 +8697,7 @@ preempt_sync(struct rq *rq, int wake_fla
>   /*
>    * Preempt the current task with a newly woken task if needed:
>    */
> -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
> +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
>   {
>   	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
>   	struct task_struct *donor = rq->donor;
> @@ -8705,6 +8705,12 @@ static void check_preempt_wakeup_fair(st
>   	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
>   	int cse_is_idle, pse_is_idle;
>   
> +	/*
> +	 * XXX Getting preempted by higher class, try and find idle CPU?
> +	 */
> +	if (p->sched_class != &fair_sched_class)
> +		return;
> +
>   	if (unlikely(se == pse))
>   		return;
>   
> @@ -12872,7 +12878,7 @@ static int sched_balance_newidle(struct
>   	t0 = sched_clock_cpu(this_cpu);
>   	__sched_balance_update_blocked_averages(this_rq);
>   
> -	rq_modified_clear(this_rq);
> +	this_rq->next_class = &fair_sched_class;
>   	raw_spin_rq_unlock(this_rq);
>   
>   	for_each_domain(this_cpu, sd) {
> @@ -12939,7 +12945,7 @@ static int sched_balance_newidle(struct
>   		pulled_task = 1;
>   
>   	/* If a higher prio class was modified, restart the pick */
> -	if (rq_modified_above(this_rq, &fair_sched_class))
> +	if (sched_class_above(this_rq->next_class, &fair_sched_class))
>   		pulled_task = -1;
>   
>   out:
> @@ -13837,15 +13843,12 @@ static unsigned int get_rr_interval_fair
>    * All the scheduling class methods:
>    */
>   DEFINE_SCHED_CLASS(fair) = {
> -
> -	.queue_mask		= 2,
> -
>   	.enqueue_task		= enqueue_task_fair,
>   	.dequeue_task		= dequeue_task_fair,
>   	.yield_task		= yield_task_fair,
>   	.yield_to_task		= yield_to_task_fair,
>   
> -	.wakeup_preempt		= check_preempt_wakeup_fair,
> +	.wakeup_preempt		= wakeup_preempt_fair,
>   
>   	.pick_task		= pick_task_fair,
>   	.pick_next_task		= pick_next_task_fair,
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -534,9 +534,6 @@ static void update_curr_idle(struct rq *
>    * Simple, special scheduling class for the per-CPU idle tasks:
>    */
>   DEFINE_SCHED_CLASS(idle) = {
> -
> -	.queue_mask		= 0,
> -
>   	/* no enqueue/yield_task for idle tasks */
>   
>   	/* dequeue is not valid, we print a debug message there: */
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq
>   {
>   	struct task_struct *donor = rq->donor;
>   
> +	/*
> +	 * XXX If we're preempted by DL, queue a push?
> +	 */
> +	if (p->sched_class != &rt_sched_class)
> +		return;
> +
>   	if (p->prio < donor->prio) {
>   		resched_curr(rq);
>   		return;
> @@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct t
>   #endif /* CONFIG_SCHED_CORE */
>   
>   DEFINE_SCHED_CLASS(rt) = {
> -
> -	.queue_mask		= 4,
> -
>   	.enqueue_task		= enqueue_task_rt,
>   	.dequeue_task		= dequeue_task_rt,
>   	.yield_task		= yield_task_rt,
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1119,7 +1119,6 @@ struct rq {
>   	raw_spinlock_t		__lock;
>   
>   	/* Per class runqueue modification mask; bits in class order. */
> -	unsigned int		queue_mask;
>   	unsigned int		nr_running;
>   #ifdef CONFIG_NUMA_BALANCING
>   	unsigned int		nr_numa_running;
> @@ -1179,6 +1178,7 @@ struct rq {
>   	struct sched_dl_entity	*dl_server;
>   	struct task_struct	*idle;
>   	struct task_struct	*stop;
> +	const struct sched_class *next_class;
>   	unsigned long		next_balance;
>   	struct mm_struct	*prev_mm;
>   
> @@ -2426,15 +2426,6 @@ struct sched_class {
>   #ifdef CONFIG_UCLAMP_TASK
>   	int uclamp_enabled;
>   #endif
> -	/*
> -	 * idle:  0
> -	 * ext:   1
> -	 * fair:  2
> -	 * rt:    4
> -	 * dl:    8
> -	 * stop: 16
> -	 */
> -	unsigned int queue_mask;
>   
>   	/*
>   	 * move_queued_task/activate_task/enqueue_task: rq->lock
> @@ -2593,20 +2584,6 @@ struct sched_class {
>   #endif
>   };
>   
> -/*
> - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
> - */
> -static inline void rq_modified_clear(struct rq *rq)
> -{
> -	rq->queue_mask = 0;
> -}
> -
> -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
> -{
> -	unsigned int mask = class->queue_mask;
> -	return rq->queue_mask & ~((mask << 1) - 1);
> -}
> -
>   static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
>   {
>   	WARN_ON_ONCE(rq->donor != prev);
> @@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *
>   	deactivate_task(src_rq, task, 0);
>   	set_task_cpu(task, dst_rq->cpu);
>   	activate_task(dst_rq, task, 0);
> +	wakeup_preempt(dst_rq, task, 0);

What's the need for wakeup_preempt() here?

In all places, move_queued_task_locked() is followed by resched_curr(),
except in __migrate_swap_task(), which does the same wakeup_preempt().


>   }
>   
>   static inline
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *
>    * Simple, special scheduling class for the per-CPU stop tasks:
>    */
>   DEFINE_SCHED_CLASS(stop) = {
> -
> -	.queue_mask		= 16,
> -
>   	.enqueue_task		= enqueue_task_stop,
>   	.dequeue_task		= dequeue_task_stop,
>   	.yield_task		= yield_task_stop,
> 
>
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Peter Zijlstra 2 months, 1 week ago
On Sat, Nov 29, 2025 at 11:38:49PM +0530, Shrikanth Hegde wrote:

> > @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
> >   {
> >   	struct task_struct *donor = rq->donor;
> > -	if (p->sched_class == donor->sched_class)
> > -		donor->sched_class->wakeup_preempt(rq, p, flags);
> > -	else if (sched_class_above(p->sched_class, donor->sched_class))
> > +	if (p->sched_class == rq->next_class) {
> > +		rq->next_class->wakeup_preempt(rq, p, flags);
> > +
> > +	} else if (sched_class_above(p->sched_class, rq->next_class)) {
> > +		rq->next_class->wakeup_preempt(rq, p, flags);
> 
> What's the logic of calling wakeup_preempt() here?
> 
> Say rq was running CFS and now RT is waking up, but the first thing we do is
> return if p is not fair_sched_class. It is effectively resched_curr(), right?

Yes, as-is this patch seems silly, but that is mostly to preserve
current semantics :-)

The idea is that classes *could* do something else. Notably this was a
request from sched_ext. There are cases where they pull a task from
the global runqueue and stick it on the local runqueue, but then get
preempted by a higher priority class (say RT); they would then want to put
the task back on the global runqueue such that another CPU can select it
again, instead of having that task linger on a CPU that is not
available.

This issue has come up in the past as well but was never addressed.

Anyway, this is just foundational work. It would let a class respond to
losing the runqueue to a higher priority class.

I suppose I should go write a better changelog.

> 
> >   		resched_curr(rq);
> > +		rq->next_class = p->sched_class;
> 
> Since a resched will happen anyway and __schedule() can set next_class, is it necessary to
> set it even earlier?

Yes, because we can have another wakeup before that schedule.

Imagine running a fair class, getting a fifo wakeup and then a dl
wakeup. You want the fair class, then the rt class to get a preemption
notification.
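
Roughly, with the wakeup_preempt() hunk above, rq->next_class moves like
this for that sequence (just a sketch):

  running a fair task   rq->next_class == &fair_sched_class
  FIFO wakeup           wakeup_preempt_fair() sees the RT task and returns,
                        resched_curr(), rq->next_class = &rt_sched_class
  DL wakeup             wakeup_preempt_rt() sees the DL task and returns,
                        resched_curr(), rq->next_class = &dl_sched_class
  __schedule()          picks the DL task, rq->next_class = next->sched_class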

> > @@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *
> >   	deactivate_task(src_rq, task, 0);
> >   	set_task_cpu(task, dst_rq->cpu);
> >   	activate_task(dst_rq, task, 0);
> > +	wakeup_preempt(dst_rq, task, 0);
> 
> What's the need for wakeup_preempt() here?

Everything that places a task on the runqueue should do a 'wakeup'
preemption to make sure the above mentioned class preemption stuff
works.

It doesn't really matter if the task is new due to an actual wakeup or
due to a migration, the task is 'new' to this CPU and stuff might need
to 'move'.

IIRC this was the only such place that missed the check.
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Shrikanth Hegde 2 months, 1 week ago

On 11/30/25 5:02 PM, Peter Zijlstra wrote:
> On Sat, Nov 29, 2025 at 11:38:49PM +0530, Shrikanth Hegde wrote:
> 
>>> @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
>>>    {
>>>    	struct task_struct *donor = rq->donor;
>>> -	if (p->sched_class == donor->sched_class)
>>> -		donor->sched_class->wakeup_preempt(rq, p, flags);
>>> -	else if (sched_class_above(p->sched_class, donor->sched_class))
>>> +	if (p->sched_class == rq->next_class) {
>>> +		rq->next_class->wakeup_preempt(rq, p, flags);
>>> +
>>> +	} else if (sched_class_above(p->sched_class, rq->next_class)) {
>>> +		rq->next_class->wakeup_preempt(rq, p, flags);
>>
>> What's the logic of calling wakeup_preempt() here?
>>
>> Say rq was running CFS and now RT is waking up, but the first thing we do is
>> return if p is not fair_sched_class. It is effectively resched_curr(), right?
> 
> Yes, as-is this patch seems silly, but that is mostly to preserve
> current semantics :-)
> 
> The idea is that classes *could* do something else. Notably this was a
> request from sched_ext. There are cases where they pull a task from
> the global runqueue and stick it on the local runqueue, but then get
> preempted by a higher priority class (say RT); they would then want to put
> the task back on the global runqueue such that another CPU can select it
> again, instead of having that task linger on a CPU that is not
> available.
> 

OK, this helps me understand.

> This issue has come up in the past as well but was never addressed.
> 
> Anyway, this is just foundational work. It would let a class respond to
> losing the runqueue to a higher priority class.
> 
> I suppose I should go write a better changelog.
> 
>>
>>>    		resched_curr(rq);
>>> +		rq->next_class = p->sched_class;
>>
>> Since a resched will happen anyway and __schedule() can set next_class, is it necessary to
>> set it even earlier?
> 
> Yes, because we can have another wakeup before that schedule.
> 
> Imagine running a fair class, getting a fifo wakeup and then a dl
> wakeup. You want the fair class, then the rt class to get a preemption
> notification.
> 
>>> @@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *
>>>    	deactivate_task(src_rq, task, 0);
>>>    	set_task_cpu(task, dst_rq->cpu);
>>>    	activate_task(dst_rq, task, 0);
>>> +	wakeup_preempt(dst_rq, task, 0);
>>
>> What's the need for wakeup_preempt() here?
> 
> Everything that places a task on the runqueue should do a 'wakeup'
> preemption to make sure the above mentioned class preemption stuff
> works.
> 
> It doesn't really matter if the task is new due to an actual wakeup or
> due to a migration, the task is 'new' to this CPU and stuff might need
> to 'move'.
> 
> IIRC this was the only such place that missed the check.

My point was, we might do resched_curr() twice in this case:
once in wakeup_preempt() and once by the explicit call following
move_queued_task_locked(). Maybe remove the latter one?
Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by Andrea Righi 2 months, 1 week ago
Hi Peter,

On Thu, Nov 27, 2025 at 04:39:48PM +0100, Peter Zijlstra wrote:
...
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1119,7 +1119,6 @@ struct rq {
>  	raw_spinlock_t		__lock;
>  
>  	/* Per class runqueue modification mask; bits in class order. */

We should probably remove this comment as well along with queue_mask.

Thanks,
-Andrea

> -	unsigned int		queue_mask;
>  	unsigned int		nr_running;
>  #ifdef CONFIG_NUMA_BALANCING
>  	unsigned int		nr_numa_running;
> @@ -1179,6 +1178,7 @@ struct rq {
>  	struct sched_dl_entity	*dl_server;
>  	struct task_struct	*idle;
>  	struct task_struct	*stop;
> +	const struct sched_class *next_class;
>  	unsigned long		next_balance;
>  	struct mm_struct	*prev_mm;
>
error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Thorsten Leemhuis 1 month, 3 weeks ago
On 11/27/25 16:39, Peter Zijlstra wrote:
> Change sched_class::wakeup_preempt() to also get called for
> cross-class wakeups, specifically those where the woken task is of a
> higher class than the previous highest class.

I suspect you might be aware of this already, but this patch afaics
broke compilation of today's -next for me, as reverting fixed things.

"""
In file included from kernel/sched/build_policy.c:62:
kernel/sched/ext.c: In function ‘do_pick_task_scx’:
kernel/sched/ext.c:2455:9: error: implicit declaration of function ‘rq_modified_clear’ [-Wimplicit-function-declaration]
 2455 |         rq_modified_clear(rq);
      |         ^~~~~~~~~~~~~~~~~
kernel/sched/ext.c:2470:27: error: implicit declaration of function ‘rq_modified_above’ [-Wimplicit-function-declaration]
 2470 |         if (!force_scx && rq_modified_above(rq, &ext_sched_class))
      |                           ^~~~~~~~~~~~~~~~~
make[4]: *** [scripts/Makefile.build:287: kernel/sched/build_policy.o] Error 1
make[3]: *** [scripts/Makefile.build:556: kernel/sched] Error 2
make[2]: *** [scripts/Makefile.build:556: kernel] Error 2
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [/builddir/build/BUILD/kernel-6.19.0-build/kernel-next-20251215/linux-6.19.0-0.0.next.20251215.414.vanilla.fc44.s390x/Makefile:2062: .] Error 2
make: *** [Makefile:256: __sub-make] Error 2
"""

Ciao, Thorsten
 > In order to do this, track the current highest class of the runqueue
> in rq::next_class and have wakeup_preempt() track this upwards for
> each new wakeup. Additionally have set_next_task() re-set the value to
> the current class.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c      |   32 +++++++++++++++++++++++---------
>  kernel/sched/deadline.c  |   14 +++++++++-----
>  kernel/sched/ext.c       |    9 ++++-----
>  kernel/sched/fair.c      |   17 ++++++++++-------
>  kernel/sched/idle.c      |    3 ---
>  kernel/sched/rt.c        |    9 ++++++---
>  kernel/sched/sched.h     |   26 ++------------------------
>  kernel/sched/stop_task.c |    3 ---
>  8 files changed, 54 insertions(+), 59 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct
>  	 */
>  	uclamp_rq_inc(rq, p, flags);
>  
> -	rq->queue_mask |= p->sched_class->queue_mask;
>  	p->sched_class->enqueue_task(rq, p, flags);
>  
>  	psi_enqueue(p, flags);
> @@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq,
>  	 * and mark the task ->sched_delayed.
>  	 */
>  	uclamp_rq_dec(rq, p);
> -	rq->queue_mask |= p->sched_class->queue_mask;
>  	return p->sched_class->dequeue_task(rq, p, flags);
>  }
>  
> @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
>  {
>  	struct task_struct *donor = rq->donor;
>  
> -	if (p->sched_class == donor->sched_class)
> -		donor->sched_class->wakeup_preempt(rq, p, flags);
> -	else if (sched_class_above(p->sched_class, donor->sched_class))
> +	if (p->sched_class == rq->next_class) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);
> +
> +	} else if (sched_class_above(p->sched_class, rq->next_class)) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);
>  		resched_curr(rq);
> +		rq->next_class = p->sched_class;
> +	}
>  
>  	/*
>  	 * A queue event has occurred, and we're going to schedule.  In
> @@ -6797,6 +6799,7 @@ static void __sched notrace __schedule(i
>  pick_again:
>  	next = pick_next_task(rq, rq->donor, &rf);
>  	rq_set_donor(rq, next);
> +	rq->next_class = next->sched_class;
>  	if (unlikely(task_is_blocked(next))) {
>  		next = find_proxy_task(rq, next, &rf);
>  		if (!next)
> @@ -8646,6 +8649,8 @@ void __init sched_init(void)
>  		rq->rt.rt_runtime = global_rt_runtime();
>  		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
>  #endif
> +		rq->next_class = &idle_sched_class;
> +
>  		rq->sd = NULL;
>  		rq->rd = NULL;
>  		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
> @@ -10771,10 +10776,8 @@ struct sched_change_ctx *sched_change_be
>  		flags |= DEQUEUE_NOCLOCK;
>  	}
>  
> -	if (flags & DEQUEUE_CLASS) {
> -		if (p->sched_class->switching_from)
> -			p->sched_class->switching_from(rq, p);
> -	}
> +	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
> +		p->sched_class->switching_from(rq, p);
>  
>  	*ctx = (struct sched_change_ctx){
>  		.p = p,
> @@ -10827,6 +10830,17 @@ void sched_change_end(struct sched_chang
>  			p->sched_class->switched_to(rq, p);
>  
>  		/*
> +		 * If this was a class promotion; let the old class know it
> +		 * got preempted. Note that none of the switch*_from() methods
> +		 * know the new class and none of the switch*_to() methods
> +		 * know the old class.
> +		 */
> +		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
> +			rq->next_class->wakeup_preempt(rq, p, 0);
> +			rq->next_class = p->sched_class;
> +		}
> +
> +		/*
>  		 * If this was a degradation in class someone should have set
>  		 * need_resched by now.
>  		 */
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, str
>   * Only called when both the current and waking task are -deadline
>   * tasks.
>   */
> -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
> -				  int flags)
> +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> +	/*
> +	 * Can only get preempted by stop-class, and those should be
> +	 * few and short lived, doesn't really make sense to push
> +	 * anything away for that.
> +	 */
> +	if (p->sched_class != &dl_sched_class)
> +		return;
> +
>  	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
>  		resched_curr(rq);
>  		return;
> @@ -3304,9 +3311,6 @@ static int task_is_throttled_dl(struct t
>  #endif
>  
>  DEFINE_SCHED_CLASS(dl) = {
> -
> -	.queue_mask		= 8,
> -
>  	.enqueue_task		= enqueue_task_dl,
>  	.dequeue_task		= dequeue_task_dl,
>  	.yield_task		= yield_task_dl,
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -2338,12 +2338,12 @@ static struct task_struct *pick_task_scx
>  	bool keep_prev, kick_idle = false;
>  	struct task_struct *p;
>  
> -	rq_modified_clear(rq);
> +	rq->next_class = &ext_sched_class;
>  	rq_unpin_lock(rq, rf);
>  	balance_one(rq, prev);
>  	rq_repin_lock(rq, rf);
>  	maybe_queue_balance_callback(rq);
> -	if (rq_modified_above(rq, &ext_sched_class))
> +	if (sched_class_above(rq->next_class, &ext_sched_class))
>  		return RETRY_TASK;
>  
>  	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
> @@ -2967,7 +2967,8 @@ static void switched_from_scx(struct rq
>  	scx_disable_task(p);
>  }
>  
> -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
> +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
> +
>  static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
>  
>  int scx_check_setscheduler(struct task_struct *p, int policy)
> @@ -3216,8 +3217,6 @@ static void scx_cgroup_unlock(void) {}
>   *   their current sched_class. Call them directly from sched core instead.
>   */
>  DEFINE_SCHED_CLASS(ext) = {
> -	.queue_mask		= 1,
> -
>  	.enqueue_task		= enqueue_task_scx,
>  	.dequeue_task		= dequeue_task_scx,
>  	.yield_task		= yield_task_scx,
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8697,7 +8697,7 @@ preempt_sync(struct rq *rq, int wake_fla
>  /*
>   * Preempt the current task with a newly woken task if needed:
>   */
> -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
> +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
>  {
>  	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
>  	struct task_struct *donor = rq->donor;
> @@ -8705,6 +8705,12 @@ static void check_preempt_wakeup_fair(st
>  	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
>  	int cse_is_idle, pse_is_idle;
>  
> +	/*
> +	 * XXX Getting preempted by higher class, try and find idle CPU?
> +	 */
> +	if (p->sched_class != &fair_sched_class)
> +		return;
> +
>  	if (unlikely(se == pse))
>  		return;
>  
> @@ -12872,7 +12878,7 @@ static int sched_balance_newidle(struct
>  	t0 = sched_clock_cpu(this_cpu);
>  	__sched_balance_update_blocked_averages(this_rq);
>  
> -	rq_modified_clear(this_rq);
> +	this_rq->next_class = &fair_sched_class;
>  	raw_spin_rq_unlock(this_rq);
>  
>  	for_each_domain(this_cpu, sd) {
> @@ -12939,7 +12945,7 @@ static int sched_balance_newidle(struct
>  		pulled_task = 1;
>  
>  	/* If a higher prio class was modified, restart the pick */
> -	if (rq_modified_above(this_rq, &fair_sched_class))
> +	if (sched_class_above(this_rq->next_class, &fair_sched_class))
>  		pulled_task = -1;
>  
>  out:
> @@ -13837,15 +13843,12 @@ static unsigned int get_rr_interval_fair
>   * All the scheduling class methods:
>   */
>  DEFINE_SCHED_CLASS(fair) = {
> -
> -	.queue_mask		= 2,
> -
>  	.enqueue_task		= enqueue_task_fair,
>  	.dequeue_task		= dequeue_task_fair,
>  	.yield_task		= yield_task_fair,
>  	.yield_to_task		= yield_to_task_fair,
>  
> -	.wakeup_preempt		= check_preempt_wakeup_fair,
> +	.wakeup_preempt		= wakeup_preempt_fair,
>  
>  	.pick_task		= pick_task_fair,
>  	.pick_next_task		= pick_next_task_fair,
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -534,9 +534,6 @@ static void update_curr_idle(struct rq *
>   * Simple, special scheduling class for the per-CPU idle tasks:
>   */
>  DEFINE_SCHED_CLASS(idle) = {
> -
> -	.queue_mask		= 0,
> -
>  	/* no enqueue/yield_task for idle tasks */
>  
>  	/* dequeue is not valid, we print a debug message there: */
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq
>  {
>  	struct task_struct *donor = rq->donor;
>  
> +	/*
> +	 * XXX If we're preempted by DL, queue a push?
> +	 */
> +	if (p->sched_class != &rt_sched_class)
> +		return;
> +
>  	if (p->prio < donor->prio) {
>  		resched_curr(rq);
>  		return;
> @@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct t
>  #endif /* CONFIG_SCHED_CORE */
>  
>  DEFINE_SCHED_CLASS(rt) = {
> -
> -	.queue_mask		= 4,
> -
>  	.enqueue_task		= enqueue_task_rt,
>  	.dequeue_task		= dequeue_task_rt,
>  	.yield_task		= yield_task_rt,
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1119,7 +1119,6 @@ struct rq {
>  	raw_spinlock_t		__lock;
>  
>  	/* Per class runqueue modification mask; bits in class order. */
> -	unsigned int		queue_mask;
>  	unsigned int		nr_running;
>  #ifdef CONFIG_NUMA_BALANCING
>  	unsigned int		nr_numa_running;
> @@ -1179,6 +1178,7 @@ struct rq {
>  	struct sched_dl_entity	*dl_server;
>  	struct task_struct	*idle;
>  	struct task_struct	*stop;
> +	const struct sched_class *next_class;
>  	unsigned long		next_balance;
>  	struct mm_struct	*prev_mm;
>  
> @@ -2426,15 +2426,6 @@ struct sched_class {
>  #ifdef CONFIG_UCLAMP_TASK
>  	int uclamp_enabled;
>  #endif
> -	/*
> -	 * idle:  0
> -	 * ext:   1
> -	 * fair:  2
> -	 * rt:    4
> -	 * dl:    8
> -	 * stop: 16
> -	 */
> -	unsigned int queue_mask;
>  
>  	/*
>  	 * move_queued_task/activate_task/enqueue_task: rq->lock
> @@ -2593,20 +2584,6 @@ struct sched_class {
>  #endif
>  };
>  
> -/*
> - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
> - */
> -static inline void rq_modified_clear(struct rq *rq)
> -{
> -	rq->queue_mask = 0;
> -}
> -
> -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
> -{
> -	unsigned int mask = class->queue_mask;
> -	return rq->queue_mask & ~((mask << 1) - 1);
> -}
> -
>  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
>  {
>  	WARN_ON_ONCE(rq->donor != prev);
> @@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *
>  	deactivate_task(src_rq, task, 0);
>  	set_task_cpu(task, dst_rq->cpu);
>  	activate_task(dst_rq, task, 0);
> +	wakeup_preempt(dst_rq, task, 0);
>  }
>  
>  static inline
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *
>   * Simple, special scheduling class for the per-CPU stop tasks:
>   */
>  DEFINE_SCHED_CLASS(stop) = {
> -
> -	.queue_mask		= 16,
> -
>  	.enqueue_task		= enqueue_task_stop,
>  	.dequeue_task		= dequeue_task_stop,
>  	.yield_task		= yield_task_stop,
> 
> 
> 

Re: error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Ingo Molnar 1 month, 3 weeks ago
* Thorsten Leemhuis <linux@leemhuis.info> wrote:

> On 11/27/25 16:39, Peter Zijlstra wrote:
> > Change sched_class::wakeup_preempt() to also get called for
> > cross-class wakeups, specifically those where the woken task is of a
> > higher class than the previous highest class.
>
> I suspect you might be aware of this already, but this patch afaics
> broke compilation of today's -next for me, as reverting fixed things.

Yeah, sorry about that, I fumbled a conflict resolution - should be
fixed for tomorrow's -next.

Thanks,

	Ingo
Re: error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Nathan Chancellor 1 month, 3 weeks ago
On Mon, Dec 15, 2025 at 08:12:13AM +0100, Ingo Molnar wrote:
> 
> * Thorsten Leemhuis <linux@leemhuis.info> wrote:
> 
> > On 11/27/25 16:39, Peter Zijlstra wrote:
> > > Change sched_class::wakeup_preempt() to also get called for
> > > cross-class wakeups, specifically those where the woken task is of a
> > > higher class than the previous highest class.
> >
> > I suspect you might be aware of this already, but this patch afaics
> > broke compilation of today's -next for me, as reverting fixed things.
> 
> Yeah, sorry about that, I fumbled a conflict resolution - should be
> fixed for tomorrow's -next.

It looks like you cleared up the rq_modified_clear() error but
rq_modified_above() is still present in kernel/sched/ext.c.

Cheers,
Nathan
Re: error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Thorsten Leemhuis 1 month, 3 weeks ago

On 12/15/25 12:51, Nathan Chancellor wrote:
> On Mon, Dec 15, 2025 at 08:12:13AM +0100, Ingo Molnar wrote:
>>
>> * Thorsten Leemhuis <linux@leemhuis.info> wrote:
>>
>>> On 11/27/25 16:39, Peter Zijlstra wrote:
>>>> Change sched_class::wakeup_preempt() to also get called for
>>>> cross-class wakeups, specifically those where the woken task is of a
>>>> higher class than the previous highest class.
>>>
>>> I suspect you might be aware of this already, but this patch afaics
>>> broke compilation of today's -next for me, as reverting fixed things.
>>
>> Yeah, sorry about that, I fumbled a conflict resolution - should be
>> fixed for tomorrow's -next.
> 
> It looks like you cleared up the rq_modified_clear() error but
> rq_modified_above() is still present in kernel/sched/ext.c.

...which afaics causes this build error in today's -next:

In file included from kernel/sched/build_policy.c:62:
kernel/sched/ext.c: In function ‘do_pick_task_scx’:
kernel/sched/ext.c:2470:27: error: implicit declaration of function ‘rq_modified_above’ [-Wimplicit-function-declaration]
 2470 |         if (!force_scx && rq_modified_above(rq, &ext_sched_class))
      |                           ^~~~~~~~~~~~~~~~~
make[4]: *** [scripts/Makefile.build:287: kernel/sched/build_policy.o] Error 1
make[3]: *** [scripts/Makefile.build:556: kernel/sched] Error 2
make[3]: *** Waiting for unfinished jobs....
make[2]: *** [scripts/Makefile.build:556: kernel] Error 2
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [/builddir/build/BUILD/kernel-6.19.0-build/kernel-next-20251216/linux-6.19.0-0.0.next.20251216.415.vanilla.fc44.x86_64/Makefile:2062: .] Error 2
make: *** [Makefile:256: __sub-make] Error 2

Ciao, Thorsten
Re: error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Tejun Heo 1 month, 3 weeks ago
On Tue, Dec 16, 2025 at 08:02:50AM +0100, Thorsten Leemhuis wrote:
> 
> 
> On 12/15/25 12:51, Nathan Chancellor wrote:
> > On Mon, Dec 15, 2025 at 08:12:13AM +0100, Ingo Molnar wrote:
> >>
> >> * Thorsten Leemhuis <linux@leemhuis.info> wrote:
> >>
> >>> On 11/27/25 16:39, Peter Zijlstra wrote:
> >>>> Change sched_class::wakeup_preempt() to also get called for
> >>>> cross-class wakeups, specifically those where the woken task is of a
> >>>> higher class than the previous highest class.
> >>>
> >>> I suspect you might be aware of this already, but this patch afaics
> >>> broke compilation of today's -next for me, as reverting fixed things.
> >>
> >> Yeah, sorry about that, I fumbled a conflict resolution - should be
> >> fixed for tomorrow's -next.
> > 
> > It looks like you cleared up the rq_modified_clear() error but
> > rq_modified_above() is still present in kernel/sched/ext.c.
> 
> ...which afaics causes this build error in today's next:
> 
> In file included from kernel/sched/build_policy.c:62:
> kernel/sched/ext.c: In function ‘do_pick_task_scx’:
> kernel/sched/ext.c:2470:27: error: implicit declaration of function ‘rq_modified_above’ [-Wimplicit-function-declaration]
>  2470 |         if (!force_scx && rq_modified_above(rq, &ext_sched_class))
>       |                           ^~~~~~~~~~~~~~~~~
> make[4]: *** [scripts/Makefile.build:287: kernel/sched/build_policy.o] Error 1
> make[3]: *** [scripts/Makefile.build:556: kernel/sched] Error 2
> make[3]: *** Waiting for unfinished jobs....
> make[2]: *** [scripts/Makefile.build:556: kernel] Error 2
> make[2]: *** Waiting for unfinished jobs....
> make[1]: *** [/builddir/build/BUILD/kernel-6.19.0-build/kernel-next-20251216/linux-6.19.0-0.0.next.20251216.415.vanilla.fc44.x86_64/Makefile:2062: .] Error 2
> make: *** [Makefile:256: __sub-make] Error 2

Ingo, Peter, I can pull tip and resolve this from sched_ext side too but it
would probably be cleaner to resolve from tip side?

Thanks.

-- 
tejun
Re: error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Peter Zijlstra 1 month, 3 weeks ago
On Tue, Dec 16, 2025 at 08:40:36AM -1000, Tejun Heo wrote:

> Ingo, Peter, I can pull tip and resolve this from sched_ext side too but it
> would probably be cleaner to resolve from tip side?

Yeah, I'll fix it up tomorrow morning if Ingo hasn't yet. Sorry for the
mess.
Re: error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*())
Posted by Peter Zijlstra 1 month, 3 weeks ago
On Tue, Dec 16, 2025 at 10:42:29PM +0100, Peter Zijlstra wrote:
> On Tue, Dec 16, 2025 at 08:40:36AM -1000, Tejun Heo wrote:
> 
> > Ingo, Peter, I can pull tip and resolve this from sched_ext side too but it
> > would probably be cleaner to resolve from tip side?
> 
> Yeah, I'll fix it up tomorrow morning if Ingo hasn't yet. Sorry for the
> mess.

Force pushed tip/sched/core -- this issue should now hopefully be laid
to rest. Again, sorry for the mess.
[tip: sched/core] sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by tip-bot2 for Peter Zijlstra 1 month, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     704069649b5bfb7bf1fe32c0281fe9036806a59a
Gitweb:        https://git.kernel.org/tip/704069649b5bfb7bf1fe32c0281fe9036806a59a
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 10 Dec 2025 09:06:50 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 17 Dec 2025 10:53:25 +01:00

sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()

Change sched_class::wakeup_preempt() to also get called for
cross-class wakeups, specifically those where the woken task
is of a higher class than the previous highest class.

In order to do this, track the current highest class of the runqueue
in rq::next_class and have wakeup_preempt() track this upwards for
each new wakeup. Additionally have schedule() re-set the value on
pick.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251127154725.901391274@infradead.org
---
 kernel/sched/core.c      | 32 +++++++++++++++++++++++---------
 kernel/sched/deadline.c  | 14 +++++++++-----
 kernel/sched/ext.c       |  9 ++++-----
 kernel/sched/fair.c      | 17 ++++++++++-------
 kernel/sched/idle.c      |  3 ---
 kernel/sched/rt.c        |  9 ++++++---
 kernel/sched/sched.h     | 27 ++-------------------------
 kernel/sched/stop_task.c |  3 ---
 8 files changed, 54 insertions(+), 60 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4479f7d..7d0a862 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p, flags);
 
-	rq->queue_mask |= p->sched_class->queue_mask;
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
-	rq->queue_mask |= p->sched_class->queue_mask;
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *donor = rq->donor;
 
-	if (p->sched_class == donor->sched_class)
-		donor->sched_class->wakeup_preempt(rq, p, flags);
-	else if (sched_class_above(p->sched_class, donor->sched_class))
+	if (p->sched_class == rq->next_class) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
+
+	} else if (sched_class_above(p->sched_class, rq->next_class)) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
 		resched_curr(rq);
+		rq->next_class = p->sched_class;
+	}
 
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
@@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode)
 pick_again:
 	next = pick_next_task(rq, rq->donor, &rf);
 	rq_set_donor(rq, next);
+	rq->next_class = next->sched_class;
 	if (unlikely(task_is_blocked(next))) {
 		next = find_proxy_task(rq, next, &rf);
 		if (!next)
@@ -8650,6 +8653,8 @@ void __init sched_init(void)
 		rq->rt.rt_runtime = global_rt_runtime();
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
+		rq->next_class = &idle_sched_class;
+
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10775,10 +10780,8 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int 
 		flags |= DEQUEUE_NOCLOCK;
 	}
 
-	if (flags & DEQUEUE_CLASS) {
-		if (p->sched_class->switching_from)
-			p->sched_class->switching_from(rq, p);
-	}
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+		p->sched_class->switching_from(rq, p);
 
 	*ctx = (struct sched_change_ctx){
 		.p = p,
@@ -10831,6 +10834,17 @@ void sched_change_end(struct sched_change_ctx *ctx)
 			p->sched_class->switched_to(rq, p);
 
 		/*
+		 * If this was a class promotion; let the old class know it
+		 * got preempted. Note that none of the switch*_from() methods
+		 * know the new class and none of the switch*_to() methods
+		 * know the old class.
+		 */
+		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+			rq->next_class->wakeup_preempt(rq, p, 0);
+			rq->next_class = p->sched_class;
+		}
+
+		/*
 		 * If this was a degradation in class someone should have set
 		 * need_resched by now.
 		 */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 319439f..80c9559 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  * Only called when both the current and waking task are -deadline
  * tasks.
  */
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
-				  int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
 {
+	/*
+	 * Can only get preempted by stop-class, and those should be
+	 * few and short lived, doesn't really make sense to push
+	 * anything away for that.
+	 */
+	if (p->sched_class != &dl_sched_class)
+		return;
+
 	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
 		resched_curr(rq);
 		return;
@@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
 #endif
 
 DEFINE_SCHED_CLASS(dl) = {
-
-	.queue_mask		= 8,
-
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49..3b32e64 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2431,7 +2431,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 	/* see kick_cpus_irq_workfn() */
 	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
-	rq_modified_clear(rq);
+	rq->next_class = &ext_sched_class;
 
 	rq_unpin_lock(rq, rf);
 	balance_one(rq, prev);
@@ -2446,7 +2446,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 	 * If @force_scx is true, always try to pick a SCHED_EXT task,
 	 * regardless of any higher-priority sched classes activity.
 	 */
-	if (!force_scx && rq_modified_above(rq, &ext_sched_class))
+	if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
 		return RETRY_TASK;
 
 	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
 	scx_disable_task(p);
 }
 
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
 
 int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {}
  *   their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
-	.queue_mask		= 1,
-
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d588eb8..76f5e4b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8736,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags,
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
 	struct task_struct *donor = rq->donor;
@@ -8744,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
 	int cse_is_idle, pse_is_idle;
 
+	/*
+	 * XXX Getting preempted by higher class, try and find idle CPU?
+	 */
+	if (p->sched_class != &fair_sched_class)
+		return;
+
 	if (unlikely(se == pse))
 		return;
 
@@ -12911,7 +12917,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	t0 = sched_clock_cpu(this_cpu);
 	__sched_balance_update_blocked_averages(this_rq);
 
-	rq_modified_clear(this_rq);
+	this_rq->next_class = &fair_sched_class;
 	raw_spin_rq_unlock(this_rq);
 
 	for_each_domain(this_cpu, sd) {
@@ -12978,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		pulled_task = 1;
 
 	/* If a higher prio class was modified, restart the pick */
-	if (rq_modified_above(this_rq, &fair_sched_class))
+	if (sched_class_above(this_rq->next_class, &fair_sched_class))
 		pulled_task = -1;
 
 out:
@@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
  * All the scheduling class methods:
  */
 DEFINE_SCHED_CLASS(fair) = {
-
-	.queue_mask		= 2,
-
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.wakeup_preempt		= check_preempt_wakeup_fair,
+	.wakeup_preempt		= wakeup_preempt_fair,
 
 	.pick_task		= pick_task_fair,
 	.pick_next_task		= pick_next_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe..65eb8f8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq)
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
 DEFINE_SCHED_CLASS(idle) = {
-
-	.queue_mask		= 0,
-
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe..0a9b2cd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *donor = rq->donor;
 
+	/*
+	 * XXX If we're preempted by DL, queue a push?
+	 */
+	if (p->sched_class != &rt_sched_class)
+		return;
+
 	if (p->prio < donor->prio) {
 		resched_curr(rq);
 		return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
 #endif /* CONFIG_SCHED_CORE */
 
 DEFINE_SCHED_CLASS(rt) = {
-
-	.queue_mask		= 4,
-
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab1bfa0..3ceaa9d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,8 +1118,6 @@ struct rq {
 	/* runqueue lock: */
 	raw_spinlock_t		__lock;
 
-	/* Per class runqueue modification mask; bits in class order. */
-	unsigned int		queue_mask;
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -1179,6 +1177,7 @@ struct rq {
 	struct sched_dl_entity	*dl_server;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
+	const struct sched_class *next_class;
 	unsigned long		next_balance;
 	struct mm_struct	*prev_mm;
 
@@ -2426,15 +2425,6 @@ struct sched_class {
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
 #endif
-	/*
-	 * idle:  0
-	 * ext:   1
-	 * fair:  2
-	 * rt:    4
-	 * dl:    8
-	 * stop: 16
-	 */
-	unsigned int queue_mask;
 
 	/*
 	 * move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2593,20 +2583,6 @@ struct sched_class {
 #endif
 };
 
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
-	rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
-	unsigned int mask = class->queue_mask;
-	return rq->queue_mask & ~((mask << 1) - 1);
-}
-
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	WARN_ON_ONCE(rq->donor != prev);
@@ -3899,6 +3875,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
 	deactivate_task(src_rq, task, 0);
 	set_task_cpu(task, dst_rq->cpu);
 	activate_task(dst_rq, task, 0);
+	wakeup_preempt(dst_rq, task, 0);
 }
 
 static inline
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192b..f95798b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 DEFINE_SCHED_CLASS(stop) = {
-
-	.queue_mask		= 16,
-
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
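
A side note on the helpers removed above: rq_modified_above() relied on the
per-class queue_mask values (idle: 0, ext: 1, fair: 2, rt: 4, dl: 8, stop: 16)
and the expression ~((mask << 1) - 1) to test whether any class strictly above
the given one touched the runqueue while a pick had dropped the rq lock. The
reworked code answers the same question from the rq->next_class ratchet via
sched_class_above(). Below is a minimal user-space sketch of the two checks
side by side; it is a toy model for illustration only (the helper names and
the integer stand-in for sched_class_above() are made up here), not kernel
code.

/*
 * Toy user-space model of the old and new "was a higher class modified?"
 * checks; not kernel code. Mask values mirror the queue_mask comment
 * removed from sched.h, and the enum order stands in for the sched_class
 * link order that sched_class_above() compares.
 */
#include <stdbool.h>
#include <stdio.h>

enum { IDLE, EXT, FAIR, RT, DL, STOP };		/* low -> high priority */

static const unsigned int queue_mask[] = { 0, 1, 2, 4, 8, 16 };

/* Old scheme: enqueue/dequeue OR the class mask into rq->queue_mask. */
static bool rq_modified_above(unsigned int rq_mask, int cls)
{
	unsigned int mask = queue_mask[cls];

	/* clear 'cls' and everything below it; any remaining bit is higher */
	return rq_mask & ~((mask << 1) - 1);
}

/* New scheme: wakeup_preempt() ratchets the highest woken class upward. */
static bool next_class_above(int rq_next_class, int cls)
{
	return rq_next_class > cls;	/* stands in for sched_class_above() */
}

int main(void)
{
	/* An RT task was enqueued during fair's rq-lock break ... */
	unsigned int rq_mask = queue_mask[RT];	/* old: rt bit set        */
	int rq_next_class = RT;			/* new: ratcheted up from fair */

	/* ... so both checks report a higher class and force a re-pick. */
	printf("old check: %d, new check: %d\n",
	       rq_modified_above(rq_mask, FAIR),
	       next_class_above(rq_next_class, FAIR));
	return 0;
}

Either form of the check is what makes sched_balance_newidle() set
pulled_task = -1 and the scx pick path return RETRY_TASK in the hunks above.
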
[tip: sched/core] sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by tip-bot2 for Peter Zijlstra 1 month, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     5d1f0b2f278eb55aebe29210fbc8f352c53497d6
Gitweb:        https://git.kernel.org/tip/5d1f0b2f278eb55aebe29210fbc8f352c53497d6
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 10 Dec 2025 09:06:50 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Mon, 15 Dec 2025 07:53:35 +01:00

sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()

Change sched_class::wakeup_preempt() to also get called for
cross-class wakeups, specifically those where the woken task
is of a higher class than the previous highest class.

In order to do this, track the current highest class of the runqueue
in rq::next_class and have wakeup_preempt() track this upwards for
each new wakeup. Additionally have schedule() re-set the value on
pick.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251127154725.901391274@infradead.org
---
 kernel/sched/core.c      | 32 +++++++++++++++++++++++---------
 kernel/sched/deadline.c  | 14 +++++++++-----
 kernel/sched/ext.c       |  7 +++----
 kernel/sched/fair.c      | 17 ++++++++++-------
 kernel/sched/idle.c      |  3 ---
 kernel/sched/rt.c        |  9 ++++++---
 kernel/sched/sched.h     | 26 ++------------------------
 kernel/sched/stop_task.c |  3 ---
 8 files changed, 53 insertions(+), 58 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4479f7d..7d0a862 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p, flags);
 
-	rq->queue_mask |= p->sched_class->queue_mask;
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
-	rq->queue_mask |= p->sched_class->queue_mask;
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *donor = rq->donor;
 
-	if (p->sched_class == donor->sched_class)
-		donor->sched_class->wakeup_preempt(rq, p, flags);
-	else if (sched_class_above(p->sched_class, donor->sched_class))
+	if (p->sched_class == rq->next_class) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
+
+	} else if (sched_class_above(p->sched_class, rq->next_class)) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
 		resched_curr(rq);
+		rq->next_class = p->sched_class;
+	}
 
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
@@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode)
 pick_again:
 	next = pick_next_task(rq, rq->donor, &rf);
 	rq_set_donor(rq, next);
+	rq->next_class = next->sched_class;
 	if (unlikely(task_is_blocked(next))) {
 		next = find_proxy_task(rq, next, &rf);
 		if (!next)
@@ -8650,6 +8653,8 @@ void __init sched_init(void)
 		rq->rt.rt_runtime = global_rt_runtime();
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
+		rq->next_class = &idle_sched_class;
+
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10775,10 +10780,8 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int 
 		flags |= DEQUEUE_NOCLOCK;
 	}
 
-	if (flags & DEQUEUE_CLASS) {
-		if (p->sched_class->switching_from)
-			p->sched_class->switching_from(rq, p);
-	}
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+		p->sched_class->switching_from(rq, p);
 
 	*ctx = (struct sched_change_ctx){
 		.p = p,
@@ -10831,6 +10834,17 @@ void sched_change_end(struct sched_change_ctx *ctx)
 			p->sched_class->switched_to(rq, p);
 
 		/*
+		 * If this was a class promotion, let the old class know it
+		 * got preempted. Note that none of the switch*_from() methods
+		 * know the new class and none of the switch*_to() methods
+		 * know the old class.
+		 */
+		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+			rq->next_class->wakeup_preempt(rq, p, 0);
+			rq->next_class = p->sched_class;
+		}
+
+		/*
 		 * If this was a degradation in class someone should have set
 		 * need_resched by now.
 		 */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 319439f..80c9559 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  * Only called when both the current and waking task are -deadline
  * tasks.
  */
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
-				  int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
 {
+	/*
+	 * Can only get preempted by stop-class, and those should be
+	 * few and short-lived; it doesn't really make sense to push
+	 * anything away for that.
+	 */
+	if (p->sched_class != &dl_sched_class)
+		return;
+
 	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
 		resched_curr(rq);
 		return;
@@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
 #endif
 
 DEFINE_SCHED_CLASS(dl) = {
-
-	.queue_mask		= 8,
-
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49..3058777 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2431,7 +2431,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 	/* see kick_cpus_irq_workfn() */
 	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
-	rq_modified_clear(rq);
+	rq->next_class = &fair_sched_class;
 
 	rq_unpin_lock(rq, rf);
 	balance_one(rq, prev);
@@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
 	scx_disable_task(p);
 }
 
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
 
 int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {}
  *   their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
-	.queue_mask		= 1,
-
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d588eb8..76f5e4b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8736,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags,
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
 	struct task_struct *donor = rq->donor;
@@ -8744,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
 	int cse_is_idle, pse_is_idle;
 
+	/*
+	 * XXX Getting preempted by higher class, try and find idle CPU?
+	 */
+	if (p->sched_class != &fair_sched_class)
+		return;
+
 	if (unlikely(se == pse))
 		return;
 
@@ -12911,7 +12917,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	t0 = sched_clock_cpu(this_cpu);
 	__sched_balance_update_blocked_averages(this_rq);
 
-	rq_modified_clear(this_rq);
+	this_rq->next_class = &fair_sched_class;
 	raw_spin_rq_unlock(this_rq);
 
 	for_each_domain(this_cpu, sd) {
@@ -12978,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		pulled_task = 1;
 
 	/* If a higher prio class was modified, restart the pick */
-	if (rq_modified_above(this_rq, &fair_sched_class))
+	if (sched_class_above(this_rq->next_class, &fair_sched_class))
 		pulled_task = -1;
 
 out:
@@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
  * All the scheduling class methods:
  */
 DEFINE_SCHED_CLASS(fair) = {
-
-	.queue_mask		= 2,
-
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.wakeup_preempt		= check_preempt_wakeup_fair,
+	.wakeup_preempt		= wakeup_preempt_fair,
 
 	.pick_task		= pick_task_fair,
 	.pick_next_task		= pick_next_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe..65eb8f8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq)
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
 DEFINE_SCHED_CLASS(idle) = {
-
-	.queue_mask		= 0,
-
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe..0a9b2cd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *donor = rq->donor;
 
+	/*
+	 * XXX If we're preempted by DL, queue a push?
+	 */
+	if (p->sched_class != &rt_sched_class)
+		return;
+
 	if (p->prio < donor->prio) {
 		resched_curr(rq);
 		return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
 #endif /* CONFIG_SCHED_CORE */
 
 DEFINE_SCHED_CLASS(rt) = {
-
-	.queue_mask		= 4,
-
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab1bfa0..bdb1e74 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1119,7 +1119,6 @@ struct rq {
 	raw_spinlock_t		__lock;
 
 	/* Per class runqueue modification mask; bits in class order. */
-	unsigned int		queue_mask;
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -1179,6 +1178,7 @@ struct rq {
 	struct sched_dl_entity	*dl_server;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
+	const struct sched_class *next_class;
 	unsigned long		next_balance;
 	struct mm_struct	*prev_mm;
 
@@ -2426,15 +2426,6 @@ struct sched_class {
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
 #endif
-	/*
-	 * idle:  0
-	 * ext:   1
-	 * fair:  2
-	 * rt:    4
-	 * dl:    8
-	 * stop: 16
-	 */
-	unsigned int queue_mask;
 
 	/*
 	 * move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2593,20 +2584,6 @@ struct sched_class {
 #endif
 };
 
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
-	rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
-	unsigned int mask = class->queue_mask;
-	return rq->queue_mask & ~((mask << 1) - 1);
-}
-
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	WARN_ON_ONCE(rq->donor != prev);
@@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
 	deactivate_task(src_rq, task, 0);
 	set_task_cpu(task, dst_rq->cpu);
 	activate_task(dst_rq, task, 0);
+	wakeup_preempt(dst_rq, task, 0);
 }
 
 static inline
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192b..f95798b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 DEFINE_SCHED_CLASS(stop) = {
-
-	.queue_mask		= 16,
-
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
[tip: sched/core] sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()
Posted by tip-bot2 for Peter Zijlstra 1 month, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     31ab17f00c810076333c26cb485ec4d778829a76
Gitweb:        https://git.kernel.org/tip/31ab17f00c810076333c26cb485ec4d778829a76
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 10 Dec 2025 09:06:50 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sun, 14 Dec 2025 08:25:02 +01:00

sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()

Change sched_class::wakeup_preempt() to also get called for
cross-class wakeups, specifically those where the woken task
is of a higher class than the previous highest class.

In order to do this, track the current highest class of the runqueue
in rq::next_class and have wakeup_preempt() track this upwards for
each new wakeup. Additionally have schedule() re-set the value on
pick.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251127154725.901391274@infradead.org
---
 kernel/sched/core.c      | 32 +++++++++++++++++++++++---------
 kernel/sched/deadline.c  | 14 +++++++++-----
 kernel/sched/ext.c       |  5 ++---
 kernel/sched/fair.c      | 17 ++++++++++-------
 kernel/sched/idle.c      |  3 ---
 kernel/sched/rt.c        |  9 ++++++---
 kernel/sched/sched.h     | 26 ++------------------------
 kernel/sched/stop_task.c |  3 ---
 8 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4479f7d..7d0a862 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p, flags);
 
-	rq->queue_mask |= p->sched_class->queue_mask;
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
-	rq->queue_mask |= p->sched_class->queue_mask;
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *donor = rq->donor;
 
-	if (p->sched_class == donor->sched_class)
-		donor->sched_class->wakeup_preempt(rq, p, flags);
-	else if (sched_class_above(p->sched_class, donor->sched_class))
+	if (p->sched_class == rq->next_class) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
+
+	} else if (sched_class_above(p->sched_class, rq->next_class)) {
+		rq->next_class->wakeup_preempt(rq, p, flags);
 		resched_curr(rq);
+		rq->next_class = p->sched_class;
+	}
 
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
@@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode)
 pick_again:
 	next = pick_next_task(rq, rq->donor, &rf);
 	rq_set_donor(rq, next);
+	rq->next_class = next->sched_class;
 	if (unlikely(task_is_blocked(next))) {
 		next = find_proxy_task(rq, next, &rf);
 		if (!next)
@@ -8650,6 +8653,8 @@ void __init sched_init(void)
 		rq->rt.rt_runtime = global_rt_runtime();
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
+		rq->next_class = &idle_sched_class;
+
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10775,10 +10780,8 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int 
 		flags |= DEQUEUE_NOCLOCK;
 	}
 
-	if (flags & DEQUEUE_CLASS) {
-		if (p->sched_class->switching_from)
-			p->sched_class->switching_from(rq, p);
-	}
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+		p->sched_class->switching_from(rq, p);
 
 	*ctx = (struct sched_change_ctx){
 		.p = p,
@@ -10831,6 +10834,17 @@ void sched_change_end(struct sched_change_ctx *ctx)
 			p->sched_class->switched_to(rq, p);
 
 		/*
+		 * If this was a class promotion, let the old class know it
+		 * got preempted. Note that none of the switch*_from() methods
+		 * know the new class and none of the switch*_to() methods
+		 * know the old class.
+		 */
+		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+			rq->next_class->wakeup_preempt(rq, p, 0);
+			rq->next_class = p->sched_class;
+		}
+
+		/*
 		 * If this was a degradation in class someone should have set
 		 * need_resched by now.
 		 */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 319439f..80c9559 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  * Only called when both the current and waking task are -deadline
  * tasks.
  */
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
-				  int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
 {
+	/*
+	 * Can only get preempted by stop-class, and those should be
+	 * few and short-lived; it doesn't really make sense to push
+	 * anything away for that.
+	 */
+	if (p->sched_class != &dl_sched_class)
+		return;
+
 	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
 		resched_curr(rq);
 		return;
@@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
 #endif
 
 DEFINE_SCHED_CLASS(dl) = {
-
-	.queue_mask		= 8,
-
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49..8015ab6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
 	scx_disable_task(p);
 }
 
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
 
 int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {}
  *   their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
-	.queue_mask		= 1,
-
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f79951f..ea276d8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8700,7 +8700,7 @@ preempt_sync(struct rq *rq, int wake_flags,
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
 	struct task_struct *donor = rq->donor;
@@ -8708,6 +8708,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
 	int cse_is_idle, pse_is_idle;
 
+	/*
+	 * XXX Getting preempted by higher class, try and find idle CPU?
+	 */
+	if (p->sched_class != &fair_sched_class)
+		return;
+
 	if (unlikely(se == pse))
 		return;
 
@@ -12875,7 +12881,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	t0 = sched_clock_cpu(this_cpu);
 	__sched_balance_update_blocked_averages(this_rq);
 
-	rq_modified_clear(this_rq);
+	this_rq->next_class = &fair_sched_class;
 	raw_spin_rq_unlock(this_rq);
 
 	for_each_domain(this_cpu, sd) {
@@ -12942,7 +12948,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		pulled_task = 1;
 
 	/* If a higher prio class was modified, restart the pick */
-	if (rq_modified_above(this_rq, &fair_sched_class))
+	if (sched_class_above(this_rq->next_class, &fair_sched_class))
 		pulled_task = -1;
 
 out:
@@ -13846,15 +13852,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
  * All the scheduling class methods:
  */
 DEFINE_SCHED_CLASS(fair) = {
-
-	.queue_mask		= 2,
-
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.wakeup_preempt		= check_preempt_wakeup_fair,
+	.wakeup_preempt		= wakeup_preempt_fair,
 
 	.pick_task		= pick_task_fair,
 	.pick_next_task		= pick_next_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe..65eb8f8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq)
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
 DEFINE_SCHED_CLASS(idle) = {
-
-	.queue_mask		= 0,
-
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe..0a9b2cd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *donor = rq->donor;
 
+	/*
+	 * XXX If we're preempted by DL, queue a push?
+	 */
+	if (p->sched_class != &rt_sched_class)
+		return;
+
 	if (p->prio < donor->prio) {
 		resched_curr(rq);
 		return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
 #endif /* CONFIG_SCHED_CORE */
 
 DEFINE_SCHED_CLASS(rt) = {
-
-	.queue_mask		= 4,
-
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a40582d..467ea31 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1121,7 +1121,6 @@ struct rq {
 	raw_spinlock_t		__lock;
 
 	/* Per class runqueue modification mask; bits in class order. */
-	unsigned int		queue_mask;
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -1181,6 +1180,7 @@ struct rq {
 	struct sched_dl_entity	*dl_server;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
+	const struct sched_class *next_class;
 	unsigned long		next_balance;
 	struct mm_struct	*prev_mm;
 
@@ -2428,15 +2428,6 @@ struct sched_class {
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
 #endif
-	/*
-	 * idle:  0
-	 * ext:   1
-	 * fair:  2
-	 * rt:    4
-	 * dl:    8
-	 * stop: 16
-	 */
-	unsigned int queue_mask;
 
 	/*
 	 * move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2595,20 +2586,6 @@ struct sched_class {
 #endif
 };
 
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
-	rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
-	unsigned int mask = class->queue_mask;
-	return rq->queue_mask & ~((mask << 1) - 1);
-}
-
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	WARN_ON_ONCE(rq->donor != prev);
@@ -3901,6 +3878,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
 	deactivate_task(src_rq, task, 0);
 	set_task_cpu(task, dst_rq->cpu);
 	activate_task(dst_rq, task, 0);
+	wakeup_preempt(dst_rq, task, 0);
 }
 
 static inline
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192b..f95798b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 DEFINE_SCHED_CLASS(stop) = {
-
-	.queue_mask		= 16,
-
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
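
For reference, a minimal user-space sketch of the rq->next_class ratchet the
merged patch introduces: wakeups only ever move next_class towards higher
classes, notifying the class that is about to be preempted, and the pick in
__schedule() resets it to the class of the task actually chosen. Everything
below is a toy model for illustration (struct toy_rq, pick(), the stub
per-class handler and the integer ordering standing in for
sched_class_above() are all made up); it is not kernel code, and the real
per-class handlers additionally check p->sched_class and return early for
cross-class wakeups.

/*
 * Toy user-space model of the rq->next_class ratchet; not kernel code.
 */
#include <stdio.h>

enum sched_class_id { IDLE, EXT, FAIR, RT, DL, STOP };	/* low -> high */

static const char *cls_name[] = { "idle", "ext", "fair", "rt", "dl", "stop" };

struct toy_rq {
	enum sched_class_id next_class;	/* highest class woken since last pick */
	int need_resched;
};

/* Stub per-class wakeup_preempt(); real classes inspect p->sched_class. */
static void class_wakeup_preempt(enum sched_class_id cls, enum sched_class_id p)
{
	printf("%s->wakeup_preempt() sees a %s wakeup\n", cls_name[cls], cls_name[p]);
}

/* Mirrors the reworked core wakeup_preempt(): ratchet next_class upward. */
static void wakeup_preempt(struct toy_rq *rq, enum sched_class_id p)
{
	if (p == rq->next_class) {
		class_wakeup_preempt(rq->next_class, p);
	} else if (p > rq->next_class) {
		class_wakeup_preempt(rq->next_class, p);	/* old class is told */
		rq->need_resched = 1;
		rq->next_class = p;
	}
	/* a wakeup below next_class triggers no preemption check */
}

/* Mirrors __schedule(): the pick re-sets next_class to the chosen class. */
static void pick(struct toy_rq *rq, enum sched_class_id next)
{
	rq->next_class = next;
	rq->need_resched = 0;
}

int main(void)
{
	struct toy_rq rq = { .next_class = FAIR, .need_resched = 0 };

	wakeup_preempt(&rq, FAIR);	/* same class: fair decides internally */
	wakeup_preempt(&rq, RT);	/* promotion: fair notified, resched set */
	wakeup_preempt(&rq, FAIR);	/* below next_class: nothing to do */
	pick(&rq, RT);			/* schedule() picks the RT task */
	printf("after pick: next_class=%s need_resched=%d\n",
	       cls_name[rq.next_class], rq.need_resched);
	return 0;
}

Running it shows fair's handler invoked once for its own wakeup and once when
the RT wakeup preempts it, after which the pick resets the ratchet.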