[PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern

Posted by Peter Zijlstra 4 months ago
Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes the pattern and makes it more
symmetric.

This changes the order of callbacks slightly:

  OLD                              NEW
				|
				|  switching_from()
  dequeue_task();		|  dequeue_task()
  put_prev_task();		|  put_prev_task()
				|  switched_from()
				|
  ... change task ...		|  ... change task ...
				|
  switching_to();		|  switching_to()
  enqueue_task();		|  enqueue_task()
  set_next_task();		|  set_next_task()
  prev_class->switched_from()	|
  switched_to()			|  switched_to()
				|

Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.

Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
when changing scheduling classes.
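
For illustration, a class-changing call site then reduces to the shape
below -- a condensed sketch stitched together from the hunks that
follow, using the __sched_setscheduler() variable names rather than any
one literal call site:

	unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;

	if (prev_class != next_class) {
		/* CLASS excludes SAVE|MOVE: nothing to save/restore */
		queue_flags |= DEQUEUE_CLASS;
		queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
	}

	scoped_guard (sched_change, p, queue_flags) {
		/*
		 * sched_change_begin() has run: switching_from(), then
		 * dequeue_task()/put_prev_task() as needed, then
		 * switched_from().
		 */
		p->sched_class = next_class;
		p->prio = newprio;
	}
	/*
	 * sched_change_end() runs at scope exit: switching_to(), then
	 * enqueue_task()/set_next_task() as needed, then switched_to().
	 */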

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/core.c      |   56 +++++++++++++++++++++--------------------------
 kernel/sched/ext.c       |   26 ++++++++++++++++-----
 kernel/sched/idle.c      |    4 +--
 kernel/sched/rt.c        |    2 -
 kernel/sched/sched.h     |   22 ++++++------------
 kernel/sched/stop_task.c |    4 +--
 kernel/sched/syscalls.c  |    9 +++++--
 7 files changed, 66 insertions(+), 57 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,34 +2169,9 @@ inline int task_curr(const struct task_s
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
-			  const struct sched_class *prev_class)
+void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (prev_class != p->sched_class && p->sched_class->switching_to)
-		p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
-			 const struct sched_class *prev_class,
-			 int oldprio)
-{
-	if (prev_class != p->sched_class) {
-		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p);
-
-		p->sched_class->switched_to(rq, p);
-	} else if (oldprio != p->prio || dl_task(p))
+	if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
@@ -7388,6 +7363,11 @@ void rt_mutex_setprio(struct task_struct
 	prev_class = p->sched_class;
 	next_class = __setscheduler_class(p->policy, prio);
 
+	if (prev_class != next_class) {
+		queue_flag |= DEQUEUE_CLASS;
+		queue_flag &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+	}
+
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
@@ -7424,11 +7404,10 @@ void rt_mutex_setprio(struct task_struct
 
 		p->sched_class = next_class;
 		p->prio = prio;
-
-		check_class_changing(rq, p, prev_class);
 	}
 
-	check_class_changed(rq, p, prev_class, oldprio);
+	if (!(queue_flag & DEQUEUE_CLASS))
+		check_prio_changed(rq, p, oldprio);
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
@@ -10862,6 +10841,14 @@ struct sched_change_ctx *sched_change_be
 
 	lockdep_assert_rq_held(rq);
 
+	if (flags & DEQUEUE_CLASS) {
+		if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
+			flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+
+		if (p->sched_class->switching_from)
+			p->sched_class->switching_from(rq, p);
+	}
+
 	*ctx = (struct sched_change_ctx){
 		.p = p,
 		.flags = flags,
@@ -10874,6 +10861,9 @@ struct sched_change_ctx *sched_change_be
 	if (ctx->running)
 		put_prev_task(rq, p);
 
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+		p->sched_class->switched_from(rq, p);
+
 	return ctx;
 }
 
@@ -10884,8 +10874,14 @@ void sched_change_end(struct sched_chang
 
 	lockdep_assert_rq_held(rq);
 
+	if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+		p->sched_class->switching_to(rq, p);
+
 	if (ctx->queued)
 		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
 	if (ctx->running)
 		set_next_task(rq, p);
+
+	if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
+		p->sched_class->switched_to(rq, p);
 }
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,21 +3912,28 @@ static void scx_disable_workfn(struct kt
 
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
 
 		update_rq_clock(task_rq(p));
 
+		if (old_class != new_class) {
+			queue_flags |= DEQUEUE_CLASS;
+			queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+		}
+
 		if (old_class != new_class && p->se.sched_delayed)
 			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+		scoped_guard (sched_change, p, queue_flags) {
 			p->sched_class = new_class;
-			check_class_changing(task_rq(p), p, old_class);
 		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
+		if (!(queue_flags & DEQUEUE_CLASS))
+			check_prio_changed(task_rq(p), p, p->prio);
+
 		scx_exit_task(p);
 	}
 	scx_task_iter_stop(&sti);
@@ -4655,6 +4662,7 @@ static int scx_enable(struct sched_ext_o
 	percpu_down_write(&scx_fork_rwsem);
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
@@ -4664,16 +4672,22 @@ static int scx_enable(struct sched_ext_o
 
 		update_rq_clock(task_rq(p));
 
+		if (old_class != new_class) {
+			queue_flags |= DEQUEUE_CLASS;
+			queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+		}
+
 		if (old_class != new_class && p->se.sched_delayed)
 			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+		scoped_guard (sched_change, p, queue_flags) {
 			p->scx.slice = SCX_SLICE_DFL;
 			p->sched_class = new_class;
-			check_class_changing(task_rq(p), p, old_class);
 		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
+		if (!(queue_flags & DEQUEUE_CLASS))
+			check_prio_changed(task_rq(p), p, p->prio);
+
 		put_task_struct(p);
 	}
 	scx_task_iter_stop(&sti);
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -498,7 +498,7 @@ static void task_tick_idle(struct rq *rq
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
 {
 	BUG();
 }
@@ -536,6 +536,6 @@ DEFINE_SCHED_CLASS(idle) = {
 	.task_tick		= task_tick_idle,
 
 	.prio_changed		= prio_changed_idle,
-	.switched_to		= switched_to_idle,
+	.switching_to		= switching_to_idle,
 	.update_curr		= update_curr_idle,
 };
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2589,8 +2589,8 @@ DEFINE_SCHED_CLASS(rt) = {
 
 	.get_rr_interval	= get_rr_interval_rt,
 
-	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
+	.prio_changed		= prio_changed_rt,
 
 	.update_curr		= update_curr_rt,
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
 #include <linux/sched/task_flags.h>
 #include <linux/sched/task.h>
 #include <linux/sched/topology.h>
-
 #include <linux/atomic.h>
 #include <linux/bitmap.h>
 #include <linux/bug.h>
@@ -2369,6 +2368,7 @@ extern const u32		sched_prio_to_wmult[40
 
 #define DEQUEUE_MIGRATING	0x0010 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS		0x0040 /* Matches ENQUEUE_CLASS */
 
 #define DEQUEUE_SPECIAL		0x00010000
 #define DEQUEUE_THROTTLE	0x00020000
@@ -2380,6 +2380,7 @@ extern const u32		sched_prio_to_wmult[40
 
 #define ENQUEUE_MIGRATING	0x0010
 #define ENQUEUE_DELAYED		0x0020
+#define ENQUEUE_CLASS		0x0040
 
 #define ENQUEUE_HEAD		0x00010000
 #define ENQUEUE_REPLENISH	0x00020000
@@ -2443,14 +2444,11 @@ struct sched_class {
 	void (*task_fork)(struct task_struct *p);
 	void (*task_dead)(struct task_struct *p);
 
-	/*
-	 * The switched_from() call is allowed to drop rq->lock, therefore we
-	 * cannot assume the switched_from/switched_to pair is serialized by
-	 * rq->lock. They are however serialized by p->pi_lock.
-	 */
-	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
-	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
-	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
+	void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switching_to)  (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to)   (struct rq *this_rq, struct task_struct *task);
+
 	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
 			      const struct load_weight *lw);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -3879,11 +3877,7 @@ extern void set_load_weight(struct task_
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
 
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
-				 const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
-				const struct sched_class *prev_class,
-				int oldprio);
+extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
 
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,7 +75,7 @@ static void task_tick_stop(struct rq *rq
 {
 }
 
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
 {
 	BUG(); /* its impossible to change to this class */
 }
@@ -112,6 +112,6 @@ DEFINE_SCHED_CLASS(stop) = {
 	.task_tick		= task_tick_stop,
 
 	.prio_changed		= prio_changed_stop,
-	.switched_to		= switched_to_stop,
+	.switching_to		= switching_to_stop,
 	.update_curr		= update_curr_stop,
 };
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,6 +684,11 @@ int __sched_setscheduler(struct task_str
 	prev_class = p->sched_class;
 	next_class = __setscheduler_class(policy, newprio);
 
+	if (prev_class != next_class) {
+		queue_flags |= DEQUEUE_CLASS;
+		queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+	}
+
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
@@ -695,7 +700,6 @@ int __sched_setscheduler(struct task_str
 			p->prio = newprio;
 		}
 		__setscheduler_uclamp(p, attr);
-		check_class_changing(rq, p, prev_class);
 
 		if (scope->queued) {
 			/*
@@ -707,7 +711,8 @@ int __sched_setscheduler(struct task_str
 		}
 	}
 
-	check_class_changed(rq, p, prev_class, oldprio);
+	if (!(queue_flags & DEQUEUE_CLASS))
+		check_prio_changed(rq, p, oldprio);
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Posted by Dietmar Eggemann 4 months ago
On 06.10.25 12:44, Peter Zijlstra wrote:
> Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
> the change pattern. This completes and makes the pattern more
> symmetric.
> 
> This changes the order of callbacks slightly:
> 
> 				|
> 				|  switching_from()
>   dequeue_task();		|  dequeue_task()
>   put_prev_task();		|  put_prev_task()
> 				|  switched_from()
> 				|
>   ... change task ...		|  ... change task ...
> 				|
>   switching_to();		|  switching_to()
>   enqueue_task();		|  enqueue_task()
>   set_next_task();		|  set_next_task()
>   prev_class->switched_from()	|
>   switched_to()			|  switched_to()
> 				|
> 
> Notably, it moves the switched_from() callback right after the
> dequeue/put. Existing implementations don't appear to be affected by
> this change in location -- specifically the task isn't enqueued on the
> class in question in either location.
> 
> Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
> when changing scheduling classes.

This one causes a DL bw related warning when I run a simple 1 DL task
rt-app workload:

# rt-app ./rt-app/dl10.json 

[rt-app] <notice> thread_data_set_unique_name 0 thread0-0
[rt-app] <notice> [0] starting thread ...

[rt-app] <notice> [0] Starting with SCHED_DEADLINE policy with priority 0
[   16.390272] sched: DL replenish lagged too much
[   16.390327] ------------[ cut here ]------------
[   16.390329] WARNING: CPU: 2 PID: 591 at kernel/sched/deadline.c:239 sub_running_bw.isra.0+0xf4/0x150
[   16.391849] Modules linked in:
[   16.392107] CPU: 2 UID: 0 PID: 591 Comm: thread0-0 Not tainted 6.17.0-rc4-00020-ga6b63e5ce187 #46 PREEMPT 
[   16.392885] Hardware name: linux,dummy-virt (DT)
[   16.393265] pstate: 014000c5 (nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
[   16.393783] pc : sub_running_bw.isra.0+0xf4/0x150
[   16.394153] lr : sub_running_bw.isra.0+0x118/0x150
[   16.394636] sp : ffff80008137bb10
[   16.394864] x29: ffff80008137bb10 x28: ffff0000ff7b39c0 x27: ffff0000ce73dd60
[   16.395333] x26: 0000000000000000 x25: ffffa1134d945000 x24: ffff0000ff7b42c8
[   16.395805] x23: ffffa1134d944000 x22: ffffa1134d944000 x21: 000000000000cccc
[   16.396267] x20: 0000000000060000 x19: ffff0000ff7b42c8 x18: fffffffffffe6f58
[   16.396742] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000001
[   16.397202] x14: fffffffffffc6f57 x13: 0a6863756d206f6f x12: ffffa1134e743f60
[   16.397674] x11: 00000000000000c0 x10: 0000000000000001 x9 : 0000000000000000
[   16.398130] x8 : ffff0000c001e490 x7 : 0000000000000008 x6 : ffff0000c0029968
[   16.398883] x5 : 00000000ffffffff x4 : 0000000000000064 x3 : ffff0000c0029fa8
[   16.399432] x2 : ffff5eedb1f6e000 x1 : 000000000000cccc x0 : fffffffffffacccc
[   16.399962] Call trace:
[   16.400147]  sub_running_bw.isra.0+0xf4/0x150 (P)
[   16.400510]  task_non_contending+0x248/0x2ac
[   16.400831]  dequeue_task_dl+0x178/0x2d4
[   16.401122]  __schedule+0x6ac/0x1038
[   16.401401]  schedule+0x4c/0x164
[   16.401627]  do_nanosleep+0x6c/0x190
[   16.401862]  hrtimer_nanosleep+0xbc/0x200
[   16.402156]  common_nsleep_timens+0x50/0x90
[   16.402522]  __arm64_sys_clock_nanosleep+0xd0/0x150
[   16.402813]  invoke_syscall+0x48/0x104
[   16.403043]  el0_svc_common.constprop.0+0x40/0xe0
[   16.403327]  do_el0_svc+0x1c/0x28
[   16.403520]  el0_svc+0x4c/0x160
[   16.403711]  el0t_64_sync_handler+0xa0/0xf0
[   16.403950]  el0t_64_sync+0x198/0x19c
[   16.404226] irq event stamp: 196
[   16.404451] hardirqs last  enabled at (195): [<ffffa1134c8021d8>] _raw_spin_unlock_irqrestore+0x6c/0x74
[   16.405086] hardirqs last disabled at (196): [<ffffa1134c7f7850>] __schedule+0x4e8/0x1038
[   16.405629] softirqs last  enabled at (154): [<ffffa1134b4e157c>] handle_softirqs+0x44c/0x498
[   16.406218] softirqs last disabled at (145): [<ffffa1134b410774>] __do_softirq+0x14/0x20

With extra logging added and the underflow WARN_ON_ONCE() removed:

# rt-app ./rt-app/dl10.json 

[rt-app] <notice> thread_data_set_unique_name 0 thread0-0
[rt-app] <notice> [0] starting thread ...

[rt-app] <notice> [0] Starting with SCHED_DEADLINE policy with priority 0
[   18.494469] sched: DL replenish lagged too much
[   18.494483] cpu=3 p->comm=thread0-0 p->pid=592
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[   18.494486] __sub_running_bw() cpu=3 dl_rq->running_bw=18446744073709210828 dl_bw=393216 old=52428
                                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                                        dl_rq->running_bw underflow in task_non_contending()

[   18.494492] CPU: 3 UID: 0 PID: 592 Comm: thread0-0 Not tainted 6.17.0-rc4-00020-ga6b63e5ce187-dirty #44 PREEMPT 
[   18.494495] Hardware name: linux,dummy-virt (DT)
[   18.494497] Call trace:
[   18.494498]  show_stack+0x18/0x24 (C)
[   18.494510]  dump_stack_lvl+0x70/0x98
[   18.494514]  dump_stack+0x18/0x24
[   18.494516]  sub_running_bw.isra.0+0x164/0x180
[   18.494539]  task_non_contending+0x298/0x2e8
[   18.494541]  dequeue_task_dl+0x188/0x31c
[   18.494544]  __schedule+0x6ac/0x1038
[   18.494574]  schedule+0x4c/0x164
[   18.494578]  do_nanosleep+0x6c/0x190
[   18.494580]  hrtimer_nanosleep+0xbc/0x200
[   18.494594]  common_nsleep_timens+0x50/0x90
[   18.494599]  __arm64_sys_clock_nanosleep+0xd0/0x150
[   18.494602]  invoke_syscall+0x48/0x104
[   18.494610]  el0_svc_common.constprop.0+0x40/0xe0
[   18.494612]  do_el0_svc+0x1c/0x28
[   18.494615]  el0_svc+0x4c/0x160
[   18.494617]  el0t_64_sync_handler+0xa0/0xf0
[   18.494620]  el0t_64_sync+0x198/0x19c

Not sure yet how this is related to switched_from_dl() being now called earlier?

[...]
Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Posted by Peter Zijlstra 4 months ago
On Thu, Oct 09, 2025 at 03:30:02PM +0200, Dietmar Eggemann wrote:
> On 06.10.25 12:44, Peter Zijlstra wrote:
> > Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
> > the change pattern. This completes and makes the pattern more
> > symmetric.
> > 
> > This changes the order of callbacks slightly:
> > 
> > 				|
> > 				|  switching_from()
> >   dequeue_task();		|  dequeue_task()
> >   put_prev_task();		|  put_prev_task()
> > 				|  switched_from()
> > 				|
> >   ... change task ...		|  ... change task ...
> > 				|
> >   switching_to();		|  switching_to()
> >   enqueue_task();		|  enqueue_task()
> >   set_next_task();		|  set_next_task()
> >   prev_class->switched_from()	|
> >   switched_to()			|  switched_to()
> > 				|
> > 
> > Notably, it moves the switched_from() callback right after the
> > dequeue/put. Existing implementations don't appear to be affected by
> > this change in location -- specifically the task isn't enqueued on the
> > class in question in either location.
> > 
> > Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
> > when changing scheduling classes.
> 
> This one causes a DL bw related warning when I run a simple 1 DL task
> rt-app workload:

> Not sure yet how this is related to switched_from_dl() being now called earlier?

Ooh, I might see a problem. task_non_contending() uses dl_task(), which
uses p->prio. The move above means it is now called using the 'old'
prio, whereas it used to run with the 'new' prio.

I suppose it does this to distinguish 'real' DL tasks from PI boosted DL
tasks.

Let me see if I can figure out something for this.
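
For reference, dl_task() is a pure p->prio test -- roughly the below,
going from memory, so treat as illustrative:

	static inline bool dl_prio(int prio)
	{
		return unlikely(prio < MAX_DL_PRIO);	/* MAX_DL_PRIO == 0 */
	}

	static inline bool dl_task(struct task_struct *p)
	{
		return dl_prio(p->prio);
	}

That is, in the old order switched_from_dl() ran after p->prio was
updated and dl_task(p) was already false; in the new order it runs
before the update, dl_task(p) is still true, and task_non_contending()
takes the sub_running_bw() path it used to skip.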
Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Posted by Peter Zijlstra 4 months ago
On Thu, Oct 09, 2025 at 03:54:08PM +0200, Peter Zijlstra wrote:
> On Thu, Oct 09, 2025 at 03:30:02PM +0200, Dietmar Eggemann wrote:
> > On 06.10.25 12:44, Peter Zijlstra wrote:
> > > Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
> > > the change pattern. This completes and makes the pattern more
> > > symmetric.
> > > 
> > > This changes the order of callbacks slightly:
> > > 
> > > 				|
> > > 				|  switching_from()
> > >   dequeue_task();		|  dequeue_task()
> > >   put_prev_task();		|  put_prev_task()
> > > 				|  switched_from()
> > > 				|
> > >   ... change task ...		|  ... change task ...
> > > 				|
> > >   switching_to();		|  switching_to()
> > >   enqueue_task();		|  enqueue_task()
> > >   set_next_task();		|  set_next_task()
> > >   prev_class->switched_from()	|
> > >   switched_to()			|  switched_to()
> > > 				|
> > > 
> > > Notably, it moves the switched_from() callback right after the
> > > dequeue/put. Existing implementations don't appear to be affected by
> > > this change in location -- specifically the task isn't enqueued on the
> > > class in question in either location.
> > > 
> > > Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
> > > when changing scheduling classes.
> > 
> > This one causes a DL bw related warning when I run a simple 1 DL task
> > rt-app workload:
> 
> > Not sure yet how this is related to switched_from_dl() being now called earlier?
> 
> Ooh, I might see a problem. task_non_contending() uses dl_task(), which
> uses p->prio. The move above means it is now called using the 'old'
> prio, whereas it used to run with the 'new' prio.
> 
> Let me see if I can figure out something for this.

Does this help? /me goes find rt-app.

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 615411a0a881..fe2272c812b2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -405,7 +405,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
  * up, and checks if the task is still in the "ACTIVE non contending"
  * state or not (in the second case, it updates running_bw).
  */
-static void task_non_contending(struct sched_dl_entity *dl_se)
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
 	struct rq *rq = rq_of_dl_se(dl_se);
@@ -444,10 +444,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
 		} else {
 			struct task_struct *p = dl_task_of(dl_se);
 
-			if (dl_task(p))
+			if (dl_task)
 				sub_running_bw(dl_se, dl_rq);
 
-			if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+			if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
 				struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 				if (READ_ONCE(p->__state) == TASK_DEAD)
@@ -2045,7 +2045,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	 * or "inactive")
 	 */
 	if (flags & DEQUEUE_SLEEP)
-		task_non_contending(dl_se);
+		task_non_contending(dl_se, true);
 }
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -2970,7 +2970,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	 * will reset the task parameters.
 	 */
 	if (task_on_rq_queued(p) && p->dl.dl_runtime)
-		task_non_contending(&p->dl);
+		task_non_contending(&p->dl, false);
 
 	/*
 	 * In case a task is setscheduled out from SCHED_DEADLINE we need to
Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Posted by Dietmar Eggemann 4 months ago

On 09.10.25 16:09, Peter Zijlstra wrote:
> On Thu, Oct 09, 2025 at 03:54:08PM +0200, Peter Zijlstra wrote:
>> On Thu, Oct 09, 2025 at 03:30:02PM +0200, Dietmar Eggemann wrote:
>>> On 06.10.25 12:44, Peter Zijlstra wrote:
>>>> Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
>>>> the change pattern. This completes and makes the pattern more
>>>> symmetric.
>>>>
>>>> This changes the order of callbacks slightly:
>>>>
>>>> 				|
>>>> 				|  switching_from()
>>>>   dequeue_task();		|  dequeue_task()
>>>>   put_prev_task();		|  put_prev_task()
>>>> 				|  switched_from()
>>>> 				|
>>>>   ... change task ...		|  ... change task ...
>>>> 				|
>>>>   switching_to();		|  switching_to()
>>>>   enqueue_task();		|  enqueue_task()
>>>>   set_next_task();		|  set_next_task()
>>>>   prev_class->switched_from()	|
>>>>   switched_to()			|  switched_to()
>>>> 				|
>>>>
>>>> Notably, it moves the switched_from() callback right after the
>>>> dequeue/put. Existing implementations don't appear to be affected by
>>>> this change in location -- specifically the task isn't enqueued on the
>>>> class in question in either location.
>>>>
>>>> Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
>>>> when changing scheduling classes.
>>>
>>> This one causes a DL bw related warning when I run a simple 1 DL task
>>> rt-app workload:
>>
>>> Not sure yet how this is related to switched_from_dl() being now called earlier?
>>
>> Ooh, I might see a problem. task_non_contending() uses dl_task(), which
>> uses p->prio. The move above means it is now called using the 'old'
>> prio, whereas it used to run with the 'new' prio.
>>
>> Let me see if I can figure out something for this.
> 
> Does this help? /me goes find rt-app.

Yes, but there seems to be more ... missing DEQUEUE_SAVE (a.k.a.
ENQUEUE_RESTORE) in

  enqueue_dl_entity()

    if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING))
                 ^^^^^^^^^^^^^^^
      ...
      add_running_bw(dl_se, dl_rq)

and

  __sched_setscheduler()

    ...
    if (prev_class != next_class)
      queue_flags |= DEQUEUE_CLASS;
      queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
                       ^^^^^^^^^^^^

as well as

  sched_change_begin()

    ...
    if (flags & DEQUEUE_CLASS) {
      if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
        flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
                   ^^^^^^^^^^^^
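
I.e. the failing flow seems to be (a sketch pieced together from the
excerpts above, not literal code):

	/* __sched_setscheduler(), fair -> deadline, SAVE/MOVE stripped: */
	dequeue_task(rq, p, queue_flags);	/* no DEQUEUE_SAVE */
	p->sched_class = &dl_sched_class;
	enqueue_task(rq, p, queue_flags);	/* no ENQUEUE_RESTORE either */

	/* so enqueue_dl_entity() skips this and running_bw stays 0: */
	if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING))
		add_running_bw(dl_se, dl_rq);

	/*
	 * and the first sleep, dequeue_task_dl() -> task_non_contending()
	 * -> sub_running_bw(), underflows dl_rq->running_bw.
	 */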

With your patch and this the issue went away:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 884926d3dd95..35074799e9ad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10844,9 +10844,6 @@ struct sched_change_ctx
*sched_change_begin(struct task_struct *p, unsigned int
        lockdep_assert_rq_held(rq);

        if (flags & DEQUEUE_CLASS) {
-               if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
-                       flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
-
                if (p->sched_class->switching_from)
                        p->sched_class->switching_from(rq, p);
        }
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 007d1440374b..bcef5c72d287 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,10 +684,8 @@ int __sched_setscheduler(struct task_struct *p,
        prev_class = p->sched_class;
        next_class = __setscheduler_class(policy, newprio);

-       if (prev_class != next_class) {
+       if (prev_class != next_class)
                queue_flags |= DEQUEUE_CLASS;
-               queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
-       }
Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Posted by Peter Zijlstra 3 months, 4 weeks ago
On Thu, Oct 09, 2025 at 06:50:55PM +0200, Dietmar Eggemann wrote:

> Yes, but there seems to be more ... missing DEQUEUE_SAVE (a.k.a.
> ENQUEUE_RESTORE) in
> 
>   enqueue_dl_entity()
> 
>     if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING))
>                  ^^^^^^^^^^^^^^^
>       ...
>       add_running_bw(dl_se, dl_rq)
> 
> and
> 
>   __sched_setscheduler()
> 
>     ...
>     if (prev_class != next_class)
>       queue_flags |= DEQUEUE_CLASS;
>       queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
>                        ^^^^^^^^^^^^
> 
> as well as
> 
>   sched_change_begin()
> 
>     ...
>     if (flags & DEQUEUE_CLASS) {
>       if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
>         flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
>                    ^^^^^^^^^^^^
> 

Urgh.. SAVE/RESTORE while changing CLASS is so weird.

But yeah, let me take that bit out for now -- I'll make a note in a
comment that we should look at perhaps cleaning that up instead.
[tip: sched/core] sched/deadline: Prepare for switched_from() change
Posted by tip-bot2 for Peter Zijlstra 3 months, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     5e42d4c123ba9b89ce19b3aa7e22b7684cbfa49c
Gitweb:        https://git.kernel.org/tip/5e42d4c123ba9b89ce19b3aa7e22b7684cbfa49c
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 09 Oct 2025 16:09:25 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:51 +02:00

sched/deadline: Prepare for switched_from() change

Prepare for the sched_class::switch*() methods getting folded into the
change pattern. As a result of that, the location of switched_from
will change slightly. SCHED_DEADLINE is affected by this change in
location:

  OLD                              NEW
				|
				|  switching_from()
  dequeue_task();		|  dequeue_task()
  put_prev_task();		|  put_prev_task()
				|  switched_from()
				|
  ... change task ...		|  ... change task ...
				|
  switching_to();		|  switching_to()
  enqueue_task();		|  enqueue_task()
  set_next_task();		|  set_next_task()
  prev_class->switched_from()	|
  switched_to()			|  switched_to()
				|

Notably, where switched_from() was called *after* the change to the
task, it will get called before it. Specifically, switched_from_dl()
uses dl_task(p) which uses p->prio; which is changed when switching
class (it might be the reason to switch class in case of PI).

When switched_from_dl() gets called, the task will have left the
deadline class and dl_task() must be false, while when doing
dequeue_dl_entity() the task must be a dl_task(), otherwise we'd have
called a different dequeue method.

Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 933bd1f..fd147a7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -405,7 +405,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
  * up, and checks if the task is still in the "ACTIVE non contending"
  * state or not (in the second case, it updates running_bw).
  */
-static void task_non_contending(struct sched_dl_entity *dl_se)
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
 	struct rq *rq = rq_of_dl_se(dl_se);
@@ -444,10 +444,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
 		} else {
 			struct task_struct *p = dl_task_of(dl_se);
 
-			if (dl_task(p))
+			if (dl_task)
 				sub_running_bw(dl_se, dl_rq);
 
-			if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+			if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
 				struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 				if (READ_ONCE(p->__state) == TASK_DEAD)
@@ -2045,7 +2045,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	 * or "inactive")
 	 */
 	if (flags & DEQUEUE_SLEEP)
-		task_non_contending(dl_se);
+		task_non_contending(dl_se, true);
 }
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -2970,7 +2970,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	 * will reset the task parameters.
 	 */
 	if (task_on_rq_queued(p) && p->dl.dl_runtime)
-		task_non_contending(&p->dl);
+		task_non_contending(&p->dl, false);
 
 	/*
 	 * In case a task is setscheduled out from SCHED_DEADLINE we need to
[tip: sched/core] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Posted by tip-bot2 for Peter Zijlstra 3 months, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     637b0682821b144d5993211cf0a768b322138a69
Gitweb:        https://git.kernel.org/tip/637b0682821b144d5993211cf0a768b322138a69
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 30 Oct 2024 15:08:15 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:51 +02:00

sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern

Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes the pattern and makes it more
symmetric.

This changes the order of callbacks slightly:

  OLD                              NEW
				|
				|  switching_from()
  dequeue_task();		|  dequeue_task()
  put_prev_task();		|  put_prev_task()
				|  switched_from()
				|
  ... change task ...		|  ... change task ...
				|
  switching_to();		|  switching_to()
  enqueue_task();		|  enqueue_task()
  set_next_task();		|  set_next_task()
  prev_class->switched_from()	|
  switched_to()			|  switched_to()
				|

Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.

Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
when changing scheduling classes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/core.c      | 51 ++++++++++++++++-----------------------
 kernel/sched/ext.c       | 22 ++++++++++++-----
 kernel/sched/idle.c      |  4 +--
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     | 22 ++++++-----------
 kernel/sched/stop_task.c |  4 +--
 kernel/sched/syscalls.c  |  7 +++--
 7 files changed, 55 insertions(+), 57 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eca40df..4dbd206 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,34 +2169,9 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
-			  const struct sched_class *prev_class)
-{
-	if (prev_class != p->sched_class && p->sched_class->switching_to)
-		p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
-			 const struct sched_class *prev_class,
-			 int oldprio)
+void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (prev_class != p->sched_class) {
-		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p);
-
-		p->sched_class->switched_to(rq, p);
-	} else if (oldprio != p->prio || dl_task(p))
+	if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
@@ -7388,6 +7363,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 	prev_class = p->sched_class;
 	next_class = __setscheduler_class(p->policy, prio);
 
+	if (prev_class != next_class)
+		queue_flag |= DEQUEUE_CLASS;
+
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
@@ -7424,11 +7402,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 
 		p->sched_class = next_class;
 		p->prio = prio;
-
-		check_class_changing(rq, p, prev_class);
 	}
 
-	check_class_changed(rq, p, prev_class, oldprio);
+	if (!(queue_flag & DEQUEUE_CLASS))
+		check_prio_changed(rq, p, oldprio);
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
@@ -10862,6 +10839,11 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int 
 
 	lockdep_assert_rq_held(rq);
 
+	if (flags & DEQUEUE_CLASS) {
+		if (p->sched_class->switching_from)
+			p->sched_class->switching_from(rq, p);
+	}
+
 	*ctx = (struct sched_change_ctx){
 		.p = p,
 		.flags = flags,
@@ -10874,6 +10856,9 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int 
 	if (ctx->running)
 		put_prev_task(rq, p);
 
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+		p->sched_class->switched_from(rq, p);
+
 	return ctx;
 }
 
@@ -10884,8 +10869,14 @@ void sched_change_end(struct sched_change_ctx *ctx)
 
 	lockdep_assert_rq_held(rq);
 
+	if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+		p->sched_class->switching_to(rq, p);
+
 	if (ctx->queued)
 		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
 	if (ctx->running)
 		set_next_task(rq, p);
+
+	if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
+		p->sched_class->switched_to(rq, p);
 }
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4566a7c..a408c39 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,21 +3912,26 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
 
 		update_rq_clock(task_rq(p));
 
+		if (old_class != new_class)
+			queue_flags |= DEQUEUE_CLASS;
+
 		if (old_class != new_class && p->se.sched_delayed)
 			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+		scoped_guard (sched_change, p, queue_flags) {
 			p->sched_class = new_class;
-			check_class_changing(task_rq(p), p, old_class);
 		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
+		if (!(queue_flags & DEQUEUE_CLASS))
+			check_prio_changed(task_rq(p), p, p->prio);
+
 		scx_exit_task(p);
 	}
 	scx_task_iter_stop(&sti);
@@ -4655,6 +4660,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	percpu_down_write(&scx_fork_rwsem);
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
@@ -4664,16 +4670,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 		update_rq_clock(task_rq(p));
 
+		if (old_class != new_class)
+			queue_flags |= DEQUEUE_CLASS;
+
 		if (old_class != new_class && p->se.sched_delayed)
 			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+		scoped_guard (sched_change, p, queue_flags) {
 			p->scx.slice = SCX_SLICE_DFL;
 			p->sched_class = new_class;
-			check_class_changing(task_rq(p), p, old_class);
 		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
+		if (!(queue_flags & DEQUEUE_CLASS))
+			check_prio_changed(task_rq(p), p, p->prio);
+
 		put_task_struct(p);
 	}
 	scx_task_iter_stop(&sti);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c39b089..f02dced 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -498,7 +498,7 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
 {
 	BUG();
 }
@@ -536,6 +536,6 @@ DEFINE_SCHED_CLASS(idle) = {
 	.task_tick		= task_tick_idle,
 
 	.prio_changed		= prio_changed_idle,
-	.switched_to		= switched_to_idle,
+	.switching_to		= switching_to_idle,
 	.update_curr		= update_curr_idle,
 };
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7936d43..6b2e811 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2589,8 +2589,8 @@ DEFINE_SCHED_CLASS(rt) = {
 
 	.get_rr_interval	= get_rr_interval_rt,
 
-	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
+	.prio_changed		= prio_changed_rt,
 
 	.update_curr		= update_curr_rt,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24b3c6c..e3f4215 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
 #include <linux/sched/task_flags.h>
 #include <linux/sched/task.h>
 #include <linux/sched/topology.h>
-
 #include <linux/atomic.h>
 #include <linux/bitmap.h>
 #include <linux/bug.h>
@@ -2369,6 +2368,7 @@ extern const u32		sched_prio_to_wmult[40];
 
 #define DEQUEUE_MIGRATING	0x0010 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS		0x0040 /* Matches ENQUEUE_CLASS */
 
 #define DEQUEUE_SPECIAL		0x00010000
 #define DEQUEUE_THROTTLE	0x00020000
@@ -2380,6 +2380,7 @@ extern const u32		sched_prio_to_wmult[40];
 
 #define ENQUEUE_MIGRATING	0x0010
 #define ENQUEUE_DELAYED		0x0020
+#define ENQUEUE_CLASS		0x0040
 
 #define ENQUEUE_HEAD		0x00010000
 #define ENQUEUE_REPLENISH	0x00020000
@@ -2443,14 +2444,11 @@ struct sched_class {
 	void (*task_fork)(struct task_struct *p);
 	void (*task_dead)(struct task_struct *p);
 
-	/*
-	 * The switched_from() call is allowed to drop rq->lock, therefore we
-	 * cannot assume the switched_from/switched_to pair is serialized by
-	 * rq->lock. They are however serialized by p->pi_lock.
-	 */
-	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
-	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
-	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
+	void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switching_to)  (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to)   (struct rq *this_rq, struct task_struct *task);
+
 	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
 			      const struct load_weight *lw);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -3879,11 +3877,7 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
 
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
-				 const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
-				const struct sched_class *prev_class,
-				int oldprio);
+extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
 
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 2d4e279..fcc4c54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,7 +75,7 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
 
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
 {
 	BUG(); /* its impossible to change to this class */
 }
@@ -112,6 +112,6 @@ DEFINE_SCHED_CLASS(stop) = {
 	.task_tick		= task_tick_stop,
 
 	.prio_changed		= prio_changed_stop,
-	.switched_to		= switched_to_stop,
+	.switching_to		= switching_to_stop,
 	.update_curr		= update_curr_stop,
 };
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 09ffe91..bcef5c7 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,6 +684,9 @@ change:
 	prev_class = p->sched_class;
 	next_class = __setscheduler_class(policy, newprio);
 
+	if (prev_class != next_class)
+		queue_flags |= DEQUEUE_CLASS;
+
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
@@ -695,7 +698,6 @@ change:
 			p->prio = newprio;
 		}
 		__setscheduler_uclamp(p, attr);
-		check_class_changing(rq, p, prev_class);
 
 		if (scope->queued) {
 			/*
@@ -707,7 +709,8 @@ change:
 		}
 	}
 
-	check_class_changed(rq, p, prev_class, oldprio);
+	if (!(queue_flags & DEQUEUE_CLASS))
+		check_prio_changed(rq, p, oldprio);
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();