Move sched_class::prio_changed() into the change pattern.
And while there, extend it with sched_class::get_prio() in order to
fix the deadline situation.
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 24 +++++++++++++-----------
kernel/sched/deadline.c | 20 +++++++++++---------
kernel/sched/ext.c | 8 +-------
kernel/sched/fair.c | 8 ++++++--
kernel/sched/idle.c | 5 ++++-
kernel/sched/rt.c | 5 ++++-
kernel/sched/sched.h | 7 ++++---
kernel/sched/stop_task.c | 5 ++++-
kernel/sched/syscalls.c | 9 ---------
9 files changed, 47 insertions(+), 44 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,12 +2169,6 @@ inline int task_curr(const struct task_s
return cpu_curr(task_cpu(p)) == p;
}
-void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
-{
- if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -7402,9 +7396,6 @@ void rt_mutex_setprio(struct task_struct
p->sched_class = next_class;
p->prio = prio;
}
-
- if (!(queue_flag & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10860,6 +10851,13 @@ struct sched_change_ctx *sched_change_be
.running = task_current(rq, p),
};
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
dequeue_task(rq, p, flags);
if (ctx->running)
@@ -10886,6 +10884,10 @@ void sched_change_end(struct sched_chang
if (ctx->running)
set_next_task(rq, p);
- if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
- p->sched_class->switched_to(rq, p);
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3042,23 +3042,24 @@ static void switched_to_dl(struct rq *rq
}
}
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
+}
+
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
if (!task_on_rq_queued(p))
return;
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
if (task_current_donor(rq, p)) {
@@ -3113,6 +3114,7 @@ DEFINE_SCHED_CLASS(dl) = {
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2961,7 +2961,7 @@ static void reweight_task_scx(struct rq
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3928,9 +3928,6 @@ static void scx_disable_workfn(struct kt
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4679,9 +4676,6 @@ static int scx_enable(struct sched_ext_o
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
put_task_struct(p);
}
scx_task_iter_stop(&sti);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13138,11 +13138,14 @@ static void task_fork_fair(struct task_s
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13154,8 +13157,9 @@ prio_changed_fair(struct rq *rq, struct
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -504,8 +504,11 @@ static void switching_to_idle(struct rq
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2451,8 +2451,10 @@ struct sched_class {
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
+
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ u64 oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
@@ -3877,12 +3879,11 @@ extern void set_load_weight(struct task_
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
-
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
+ u64 prio;
struct task_struct *p;
int flags;
bool queued;
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -81,8 +81,11 @@ static void switching_to_stop(struct rq
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG(); /* how!?, what priority? */
}
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -95,12 +95,6 @@ void set_user_nice(struct task_struct *p
old_prio = p->prio;
p->prio = effective_prio(p);
}
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
}
EXPORT_SYMBOL(set_user_nice);
@@ -708,9 +702,6 @@ int __sched_setscheduler(struct task_str
}
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
-
/* Avoid rq from going away on us: */
preempt_disable();
head = splice_balance_callbacks(rq);
Hello Peter,
It seems this patch:
6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change
pattern")
is triggering the following warning:
rq_pin_lock()
\-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback !=
&balance_push_callback);
On an arm64 Juno, it can be reproduced by creating and killing a
deadline task:
chrt -d -T 1000000 -P 1000000 0 yes > /dev/null
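For completeness, an illustrative standalone equivalent of the chrt invocation
above, using sched_setattr(2) directly (values mirror -T/-P, deadline defaults
to the period; untested sketch, kill it to hit the exit path):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#define SCHED_DEADLINE	6

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 1000000;		/* 1ms, as chrt -T */
	attr.sched_deadline = 1000000;
	attr.sched_period   = 1000000;		/* 1ms, as chrt -P */

	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	for (;;)
		;	/* spin like 'yes > /dev/null' until killed */
}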
[ 49.518832] Hardware name: ARM LTD ARM Juno Development Platform/ARM
Juno Development Platform, BIOS EDK II Jul 11 2025
[ 49.518838] Call trace:
[ 49.518842] show_stack (arch/arm64/kernel/stacktrace.c:501) (C)
[ 49.518864] dump_stack_lvl (lib/dump_stack.c:122)
[ 49.518878] dump_stack (lib/dump_stack.c:130)
[ 49.518889] prio_changed_dl (kernel/sched/deadline.c:0
kernel/sched/deadline.c:3343)
[ 49.518903] sched_change_end (kernel/sched/core.c:0)
[ 49.518916] sched_move_task (kernel/sched/core.c:9167)
[ 49.518927] sched_autogroup_exit_task (kernel/sched/autogroup.c:157)
[ 49.518940] do_exit (kernel/exit.c:975)
[ 49.518950] do_group_exit (kernel/exit.c:0)
[ 49.518960] get_signal (kernel/signal.c:0)
[ 49.518970] arch_do_signal_or_restart (arch/arm64/kernel/signal.c:1619)
[ 49.518983] exit_to_user_mode_loop (kernel/entry/common.c:43
kernel/entry/common.c:75)
[ 49.518994] el0_svc (./include/linux/irq-entry-common.h:0
./include/linux/irq-entry-common.h:242
arch/arm64/kernel/entry-common.c:81 arch/arm64/kernel/entry-common.c:725)
[ 49.519009] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:0)
[ 49.519023] el0t_64_sync (arch/arm64/kernel/entry.S:596)
[ 49.519119] ------------[ cut here ]------------
[ 49.519124] WARNING: kernel/sched/sched.h:1829 at
__schedule+0x404/0xf78, CPU#1: yes/326
[ 49.612674] Modules linked in:
[ 49.615737] CPU: 1 UID: 0 PID: 326 Comm: yes Not tainted
6.19.0-rc4-next-20260109-g8be7ad74b7e4 #261 PREEMPT
[ 49.625670] Hardware name: ARM LTD ARM Juno Development Platform/ARM
Juno Development Platform, BIOS EDK II Jul 11 2025
[ 49.636470] pstate: 800000c5 (Nzcv daIF -PAN -UAO -TCO -DIT -SSBS
BTYPE=--)
[ 49.643443] pc : __schedule (kernel/sched/core.c:0
kernel/sched/sched.h:1907 kernel/sched/core.c:6798)
[ 49.647287] lr : __schedule (kernel/sched/sched.h:1827
kernel/sched/sched.h:1907 kernel/sched/core.c:6798)
[ 49.651130] sp : ffff800081d739e0
[ 49.654445] x29: ffff800081d73a40 x28: ffff000809548908 x27:
ffffddc6d7c532e8
[ 49.661604] x26: ffff000809548000 x25: 00000000400004d8 x24:
0000000000000009
[ 49.668762] x23: 0000000000000001 x22: ffffddc6d7bf8500 x21:
ffffddc6d5b9bdb0
[ 49.675919] x20: ffff00097681c500 x19: ffff000809548000 x18:
ffff800081d735b8
[ 49.683076] x17: 0000000000000063 x16: 0000000000000000 x15:
0000000000000004
[ 49.690233] x14: ffff000809548aa0 x13: 000000000dc48bda x12:
000000002edb68e5
[ 49.697391] x11: 0000000000000000 x10: 0000000000000001 x9 :
ffffddc6d7c7b388
[ 49.704548] x8 : ffff000976636420 x7 : ffffddc6d5b9ae64 x6 :
0000000000000000
[ 49.711704] x5 : 0000000000000001 x4 : 0000000000000001 x3 :
0000000000000000
[ 49.718861] x2 : 0000000000000008 x1 : ffff00097681c518 x0 :
0000000000008629
[ 49.726017] Call trace:
[ 49.728462] __schedule (kernel/sched/core.c:0
kernel/sched/sched.h:1907 kernel/sched/core.c:6798) (P)
[ 49.732308] preempt_schedule_common
(./arch/arm64/include/asm/preempt.h:53 kernel/sched/core.c:7080)
[ 49.736762] preempt_schedule (kernel/sched/core.c:0)
[ 49.740606] _raw_spin_unlock_irqrestore
(./include/linux/spinlock_api_smp.h:0 kernel/locking/spinlock.c:194)
[ 49.745410] sched_move_task (kernel/sched/sched.h:0)
[ 49.749341] sched_autogroup_exit_task (kernel/sched/autogroup.c:157)
[ 49.753969] do_exit (kernel/exit.c:975)
[ 49.757202] do_group_exit (kernel/exit.c:0)
[ 49.760782] get_signal (kernel/signal.c:0)
[ 49.764277] arch_do_signal_or_restart (arch/arm64/kernel/signal.c:1619)
[ 49.769078] exit_to_user_mode_loop (kernel/entry/common.c:43
kernel/entry/common.c:75)
[ 49.773530] el0_svc (./include/linux/irq-entry-common.h:0
./include/linux/irq-entry-common.h:242
arch/arm64/kernel/entry-common.c:81 arch/arm64/kernel/entry-common.c:725)
[ 49.776767] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:0)
[ 49.781048] el0t_64_sync (arch/arm64/kernel/entry.S:596)
[ 49.784716] irq event stamp: 80194
[ 49.788118] hardirqs last enabled at (80193): irqentry_exit
(kernel/entry/common.c:0)
[ 49.796575] hardirqs last disabled at (80194): __schedule
(kernel/sched/core.c:6755)
[ 49.804858] softirqs last enabled at (77126): handle_softirqs
(./arch/arm64/include/asm/preempt.h:12 kernel/softirq.c:469
kernel/softirq.c:654)
[ 49.813575] softirqs last disabled at (77121): __do_softirq
(kernel/softirq.c:661)
[ 49.821856] ---[ end trace 0000000000000000 ]---
The first stack dump comes from this:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1f94994984038..4647fea76d748 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -632,11 +640,17 @@ static inline void deadline_queue_push_tasks(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return;
+ if (sysctl_sched_debug_local)
+ dump_stack();
+
queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu),
push_dl_tasks);
}
static inline void deadline_queue_pull_task(struct rq *rq)
{
+ if (sysctl_sched_debug_local)
+ dump_stack();
+
queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu),
pull_dl_task);
}
On 10/6/25 12:44, Peter Zijlstra wrote:
> Move sched_class::prio_changed() into the change pattern.
>
> And while there, extend it with sched_class::get_prio() in order to
> fix the deadline situation.
>
> Suggested-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Tejun Heo <tj@kernel.org>
> ---
> kernel/sched/core.c | 24 +++++++++++++-----------
> kernel/sched/deadline.c | 20 +++++++++++---------
> kernel/sched/ext.c | 8 +-------
> kernel/sched/fair.c | 8 ++++++--
> kernel/sched/idle.c | 5 ++++-
> kernel/sched/rt.c | 5 ++++-
> kernel/sched/sched.h | 7 ++++---
> kernel/sched/stop_task.c | 5 ++++-
> kernel/sched/syscalls.c | 9 ---------
> 9 files changed, 47 insertions(+), 44 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2169,12 +2169,6 @@ inline int task_curr(const struct task_s
> return cpu_curr(task_cpu(p)) == p;
> }
>
> -void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
> -{
> - if (oldprio != p->prio || dl_task(p))
> - p->sched_class->prio_changed(rq, p, oldprio);
> -}
> -
> void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
> {
> struct task_struct *donor = rq->donor;
> @@ -7402,9 +7396,6 @@ void rt_mutex_setprio(struct task_struct
> p->sched_class = next_class;
> p->prio = prio;
> }
> -
> - if (!(queue_flag & DEQUEUE_CLASS))
> - check_prio_changed(rq, p, oldprio);
> out_unlock:
> /* Avoid rq from going away on us: */
> preempt_disable();
The cause might be the above. This used to call __balance_callbacks()
while holding the rq lock.
> @@ -10860,6 +10851,13 @@ struct sched_change_ctx *sched_change_be
> .running = task_current(rq, p),
> };
>
> + if (!(flags & DEQUEUE_CLASS)) {
> + if (p->sched_class->get_prio)
> + ctx->prio = p->sched_class->get_prio(rq, p);
> + else
> + ctx->prio = p->prio;
> + }
> +
> if (ctx->queued)
> dequeue_task(rq, p, flags);
> if (ctx->running)
> @@ -10886,6 +10884,10 @@ void sched_change_end(struct sched_chang
> if (ctx->running)
> set_next_task(rq, p);
>
> - if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
> - p->sched_class->switched_to(rq, p);
> + if (ctx->flags & ENQUEUE_CLASS) {
> + if (p->sched_class->switched_to)
> + p->sched_class->switched_to(rq, p);
> + } else {
> + p->sched_class->prio_changed(rq, p, ctx->prio);
> + }
It seems this is no longer the case: prio_changed_dl() queues a
balance_callback, and rq_pin_lock() is then called while
rq->balance_callback is non-NULL.
Hello Pierre,
On 1/13/2026 2:14 AM, Pierre Gondois wrote:
> Hello Peter,
>
> It seems this patch:
> 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
> is triggering the following warning:
> rq_pin_lock()
> \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
Can you check if the following solution helps your case too:
https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
--
Thanks and Regards,
Prateek
Hello Prateek,
On 1/13/26 05:12, K Prateek Nayak wrote:
> Hello Pierre,
>
> On 1/13/2026 2:14 AM, Pierre Gondois wrote:
>> Hello Peter,
>>
>> It seems this patch:
>> 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
>> is triggering the following warning:
>> rq_pin_lock()
>> \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
> Can you check if the following solution helps your case too:
> https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
>
I can still see the issue.
It seems the task deadline is also updated in:
sched_change_end()
\-enqueue_task_dl()
\-enqueue_dl_entity()
\-setup_new_dl_entity()
\-replenish_dl_new_period()
if the task's period finished.
So in sched_change_end(), the task priority (i.e. p->dl.deadline) is
updated.
This results in an old_deadline earlier than the new p->dl.deadline,
and thus rq->balance_callback gets queued from:
prio_changed_dl() {
...
if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
...
}
On Tue, Jan 13, 2026 at 11:45:43AM +0100, Pierre Gondois wrote:
> Hello Prateek,
>
> On 1/13/26 05:12, K Prateek Nayak wrote:
> > Hello Pierre,
> >
> > On 1/13/2026 2:14 AM, Pierre Gondois wrote:
> > > Hello Peter,
> > >
> > > It seems this patch:
> > > 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
> > > is triggering the following warning:
> > > rq_pin_lock()
> > > \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
> > Can you check if the following solution helps your case too:
> > https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
> >
> I can still see the issue.
> It seems the task deadline is also updated in:
> sched_change_end()
> \-enqueue_task_dl()
> \-enqueue_dl_entity()
> \-setup_new_dl_entity()
> \-replenish_dl_new_period()
> if the task's period finished.
>
> So in sched_change_end(), the task priority (i.e. p->dl.deadline) is
> updated.
> This results in having an old_deadline earlier than the new p->dl.deadline.
> Thus the rq->balance_callback:
>
> prio_changed_dl() {
> ...
> if (dl_time_before(old_deadline, p->dl.deadline))
> deadline_queue_pull_task(rq);
> ...
> }
Hum... so this one is a little more tricky.
So the normal rules are that DEQUEUE_SAVE + ENQUEUE_RESTORE should be as
invariant as possible.
But what I think happens here is that at the point of dequeue we are
effectively ready to throttle/replenish, but we don't.
Then at enqueue, we do. The replenish changes the deadline and we're up
a creek.
Let me think about this for a bit...
Hello Peter,
On 1/13/2026 5:17 PM, Peter Zijlstra wrote:
> Hum... so this one is a little more tricky.
>
> So the normal rules are that DEQUEUE_SAVE + ENQUEUE_RESTORE should be as
> invariant as possible.
>
> But what I think happens here is that at the point of dequeue we are
> effectively ready to throttle/replenish, but we don't.
>
> Then at enqueue, we do. The replenish changes the deadline and we're up
> a creek.
I've the following data from the scenario in which I observe
the same splat as Pierre with the two fixes on top of tip:
yes-4108 [194] d..2. 53.396872: get_prio_dl: get_prio_dl: clock(53060728757)
yes-4108 [194] d..2. 53.396873: update_curr_dl_se: update_curr_dl_se: past throttle label
yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_throttled(0) dl_overrun(0) timer_queued(0) server?(0)
yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_se->runtime(190623) rq->dl.overloaded(0)
yes-4108 [194] d..2. 53.396874: get_prio_dl: get_prio_dl: deadline(53060017809)
yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: ENQUEUE_RESTORE update_dl_entity
yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: setup_new_dl_entity
yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: Replenish: Old: 53060017809 dl_deadline(1000000)
yes-4108 [194] d..2. 53.396879: enqueue_dl_entity: Replenish: New: 53061728757
yes-4108 [194] d..2. 53.396882: prio_changed_dl.part.0: Woops! prio_changed_dl: CPU(194) clock(53060728757) overloaded(0): Task: yes(4108), Curr: yes(4108) deadline: 53060017809 -> 53061728757
get_prio_dl() sees "deadline < rq->clock" but dl_se->runtime is still
positive so update_curr_dl_se() doesn't fiddle with the deadline.
ENQUEUE_RESTORE sees "deadline" before "rq->clock" and calls
setup_new_dl_entity() which calls replenish.
sched_change_end() will call prio_changed() with the old deadline from
get_prio_dl() but enqueue advanced the deadline so we land in a
pickle.
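To make that ordering concrete, a toy userspace model using the values from
the trace (only dl_time_before() mirrors the kernel's definition; everything
else is illustrative, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;
typedef int64_t s64;

/* same definition as the kernel's dl_time_before() */
static bool dl_time_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}

int main(void)
{
	u64 clock       = 53060728757ULL;	/* rq_clock() */
	u64 deadline    = 53060017809ULL;	/* already in the past */
	u64 dl_deadline = 1000000ULL;		/* relative deadline, 1ms */

	u64 old_deadline = deadline;		/* snapshot taken by get_prio_dl() */

	/* ENQUEUE_RESTORE: deadline before rq->clock -> replenish */
	if (dl_time_before(deadline, clock))
		deadline = clock + dl_deadline;	/* 53061728757, as in the trace */

	/* sched_change_end() then compares against the stale snapshot */
	if (dl_time_before(old_deadline, deadline))
		printf("priority dropped -> deadline_queue_pull_task()\n");

	return 0;
}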
>
> Let me think about this for a bit...
Should prio_changed_dl() care about "dl_se->dl_deadline" having changed
within the sched_change guard since that is the attribute that can be
changed using sched_setattr() right?
--
Thanks and Regards,
Prateek
On Wed, Jan 14, 2026 at 12:17:11PM +0530, K Prateek Nayak wrote:
> Hello Peter,
>
> On 1/13/2026 5:17 PM, Peter Zijlstra wrote:
> > Hum... so this one is a little more tricky.
> >
> > So the normal rules are that DEQUEUE_SAVE + ENQUEUE_RESTORE should be as
> > invariant as possible.
> >
> > But what I think happens here is that at the point of dequeue we are
> > effectively ready to throttle/replenish, but we don't.
> >
> > Then at enqueue, we do. The replenish changes the deadline and we're up
> > a creek.
>
> I've the following data from the scenario in which I observe
> the same splat as Pierre with the two fixes on top of tip:
>
> yes-4108 [194] d..2. 53.396872: get_prio_dl: get_prio_dl: clock(53060728757)
> yes-4108 [194] d..2. 53.396873: update_curr_dl_se: update_curr_dl_se: past throttle label
> yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_throttled(0) dl_overrun(0) timer_queued(0) server?(0)
> yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_se->runtime(190623) rq->dl.overloaded(0)
> yes-4108 [194] d..2. 53.396874: get_prio_dl: get_prio_dl: deadline(53060017809)
>
> yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: ENQUEUE_RESTORE update_dl_entity
> yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: setup_new_dl_entity
> yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: Replenish: Old: 53060017809 dl_deadline(1000000)
> yes-4108 [194] d..2. 53.396879: enqueue_dl_entity: Replenish: New: 53061728757
> yes-4108 [194] d..2. 53.396882: prio_changed_dl.part.0: Woops! prio_changed_dl: CPU(194) clock(53060728757) overloaded(0): Task: yes(4108), Curr: yes(4108) deadline: 53060017809 -> 53061728757
>
> get_prio_dl() sees "deadline < rq->clock" but dl_se->runtime is still
> positive so update_curr_dl_se() doesn't fiddle with the deadline.
>
> ENQUEUE_RESTORE sees "deadline" before "rq->clock" and calls
> setup_new_dl_entity() which calls replenish.
Right, this. That's more or less where I ended up as well. Just don't
know what to do about that. It doesn't feel right.
That is, it means that a task behaves differently depending on whether
an (unrelated) sched_change comes in between.
If undisturbed it will be allowed to exhaust its runtime, irrespective
of it missing its deadline (valid for G-EDF); whereas when it gets
disturbed it will be forced to replenish.
Juri, Luca, I'm tempted to suggest to simply remove the replenish on
RESTORE entirely -- that would allow the task to continue as it had
been, irrespective of it being 'late'.
Something like so -- what would this break?
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
- !is_dl_boosted(dl_se) &&
- dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
- setup_new_dl_entity(dl_se);
}
/*
> > Let me think about this for a bit...
>
> Should prio_changed_dl() care about "dl_se->dl_deadline" having changed
> within the sched_change guard since that is the attribute that can be
> changed using sched_setattr() right?
__setparam_dl() changes dl_se->dl_deadline, as you say, but that does
not immediately affect the current dl_se->deadline. It will take effect
the next replenish.
That is, changing dl task attributes changes the next activation, not
the current. And since DL is a dynamic priority scheme, it doesn't
affect the current priority.
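To illustrate with a simplified model (not the kernel code): dl_deadline is
the relative parameter written by __setparam_dl(), and the absolute deadline
only picks it up at the next replenish:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

struct dl_model {
	u64 dl_runtime;		/* relative: budget per period */
	u64 dl_deadline;	/* relative: what sched_setattr() changes */
	u64 runtime;		/* dynamic: remaining budget */
	u64 deadline;		/* dynamic: absolute deadline, drives EDF */
};

/* roughly what a replenish does */
static void replenish(struct dl_model *dl, u64 now)
{
	dl->deadline = now + dl->dl_deadline;
	dl->runtime  = dl->dl_runtime;
}

int main(void)
{
	struct dl_model dl = {
		.dl_runtime = 500000, .dl_deadline = 1000000,
		.runtime = 200000, .deadline = 5000000,
	};

	dl.dl_deadline = 2000000;	/* parameter change ... */
	printf("deadline still %llu\n",	/* ... current priority unchanged */
	       (unsigned long long)dl.deadline);

	replenish(&dl, 5100000);	/* takes effect at the next replenish */
	printf("deadline now  %llu\n", (unsigned long long)dl.deadline);
	return 0;
}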
On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
> Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> RESTORE entirely -- that would allow the task to continue as it had
> been, irrespective of it being 'late'.
>
> Something like so -- what would this break?
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> - !is_dl_boosted(dl_se) &&
> - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> - setup_new_dl_entity(dl_se);
> }
>
> /*
Ah, this is de-boost, right? Boosting allows one to break the CBS rules
and then we have to rein in the excesses.
But we have {DE,EN}QUEUE_MOVE for this, which explicitly allows priority
to change and is set for rt_mutex_setprio() (among others).
So doing s/RESTORE/MOVE/ above.
The corollary to all this is that everybody that sets MOVE must be able
to deal with balance callbacks, so audit that too.
This then gives something like so.. which builds and boots for me, but
clearly I haven't been able to trigger these funny cases.
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
+ if (rf)
+ rq_unpin_lock(rq, rf);
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+ if (rf)
+ rq_repin_lock(rq, rf);
}
void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- if (oldprio == prio)
+ if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
@@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
- rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
- rq_repin_lock(rq, &rf);
+ __balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
@@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
if (resched)
resched_curr(rq);
+
+ __balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *
@@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
resched_curr(rq);
}
} else {
+ /*
+ * XXX validate prio only really changed when ENQUEUE_MOVE is set.
+ */
p->sched_class->prio_changed(rq, p, ctx->prio);
}
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
+ } else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+ /*
+ * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
+ * allows us to change priority. Callers are expected to deal
+ * with balance_callbacks.
+ */
setup_new_dl_entity(dl_se);
}
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
+ __balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2430,7 +2430,8 @@ extern const u32 sched_prio_to_wmult[40
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- * in the runqueue.
+ * in the runqueue. IOW the priority is allowed to change. Callers
+ * must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
* itself.
*/
newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
+ if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}
On 14/01/26 14:05, Peter Zijlstra wrote:
> On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
>
> > Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> > RESTORE entirely -- that would allow the task to continue as it had
> > been, irrespective of it being 'late'.
> >
> > Something like so -- what would this break?
> >
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> > update_dl_entity(dl_se);
> > } else if (flags & ENQUEUE_REPLENISH) {
> > replenish_dl_entity(dl_se);
> > - } else if ((flags & ENQUEUE_RESTORE) &&
> > - !is_dl_boosted(dl_se) &&
> > - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> > - setup_new_dl_entity(dl_se);
> > }
> >
> > /*
>
> Ah, this is de-boost, right? Boosting allows one to break the CBS rules
> and then we have to rein in the excesses.
>
> But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows priority
> to change and is set for rt_mutex_setprio() (among others).
>
> So doing s/RESTORE/MOVE/ above.
>
> The corollary to all this is that everybody that sets MOVE must be able
> to deal with balance callbacks, so audit that too.
>
> This then gives something like so.. which builds and boots for me, but
> clearly I haven't been able to trigger these funny cases.
>
> ---
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
> return __splice_balance_callbacks(rq, true);
> }
>
> -static void __balance_callbacks(struct rq *rq)
> +void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
> {
> + if (rf)
> + rq_unpin_lock(rq, rf);
> do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
> + if (rf)
> + rq_repin_lock(rq, rf);
> }
>
> void balance_callbacks(struct rq *rq, struct balance_callback *head)
> @@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
> * prev into current:
> */
> spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
>
> @@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
> proxy_tag_curr(rq, next);
>
> rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
> trace_sched_exit_tp(is_switch);
> @@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
> trace_sched_pi_setprio(p, pi_task);
> oldprio = p->prio;
>
> - if (oldprio == prio)
> + if (oldprio == prio && !dl_prio(prio))
> queue_flag &= ~DEQUEUE_MOVE;
>
> prev_class = p->sched_class;
> @@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
> out_unlock:
> /* Caller holds task_struct::pi_lock, IRQs are still disabled */
>
> - rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> - rq_repin_lock(rq, &rf);
> + __balance_callbacks(rq, &rf);
> __task_rq_unlock(rq, p, &rf);
> }
> #endif /* CONFIG_RT_MUTEXES */
> @@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
>
> if (resched)
> resched_curr(rq);
> +
> + __balance_callbacks(rq, &rq_guard.rf);
> }
>
> static struct cgroup_subsys_state *
> @@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
> resched_curr(rq);
> }
> } else {
> + /*
> + * XXX validate prio only really changed when ENQUEUE_MOVE is set.
> + */
> p->sched_class->prio_changed(rq, p, ctx->prio);
> }
> }
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> + } else if ((flags & ENQUEUE_MOVE) &&
> !is_dl_boosted(dl_se) &&
> dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> + /*
> + * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
> + * allows us to change priority. Callers are expected to deal
> + * with balance_callbacks.
> + */
> setup_new_dl_entity(dl_se);
> }
>
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
> static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> {
> if (iter->locked_task) {
> + __balance_callbacks(iter->rq, &iter->rf);
> task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
> iter->locked_task = NULL;
> }
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2430,7 +2430,8 @@ extern const u32 sched_prio_to_wmult[40
> * should preserve as much state as possible.
> *
> * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
> - * in the runqueue.
> + * in the runqueue. IOW the priority is allowed to change. Callers
> + * must expect to deal with balance callbacks.
> *
> * NOCLOCK - skip the update_rq_clock() (avoids double updates)
> *
> @@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
> extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
>
> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> +
> +extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>
> /*
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> * itself.
> */
> newprio = rt_effective_prio(p, newprio);
> - if (newprio == oldprio)
> + if (newprio == oldprio && !dl_prio(newprio))
> queue_flags &= ~DEQUEUE_MOVE;
> }
We have been using (improperly?) ENQUEUE_SAVE also to know when a new
entity gets setscheduled to DEADLINE (or its parameters are changed) and
it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
quick first look, it does sound good to me.
Thanks!
Juri
On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > * itself.
> > */
> > newprio = rt_effective_prio(p, newprio);
> > - if (newprio == oldprio)
> > + if (newprio == oldprio && !dl_prio(newprio))
> > queue_flags &= ~DEQUEUE_MOVE;
> > }
>
> We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> entity gets setscheduled to DEADLINE (or its parameters are changed) and
> it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
> quick first look, it does sound good to me.
If this is strictly about tasks coming into SCHED_DEADLINE there are a
number of alternative options:
- there are the sched_class::switch{ing,ed}_to() callbacks;
- there is (the fairly recent) ENQUEUE_CLASS.
Anyway, let me break up this one patch into individual bits and write
changelogs. I'll stick them in queue/sched/urgent for now; hopefully
Pierre can give them a spin and report back if it all sorts his
problem.
On Thu, Jan 15, 2026 at 09:24:31AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
>
> > > --- a/kernel/sched/syscalls.c
> > > +++ b/kernel/sched/syscalls.c
> > > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > > * itself.
> > > */
> > > newprio = rt_effective_prio(p, newprio);
> > > - if (newprio == oldprio)
> > > + if (newprio == oldprio && !dl_prio(newprio))
> > > queue_flags &= ~DEQUEUE_MOVE;
> > > }
> >
> > We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> > entity gets setscheduled to DEADLINE (or its parameters are changed) and
> > it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
> > quick first look, it does sound good to me.
>
> If this is strictly about tasks coming into SCHED_DEADLINE there are a
> number of alternative options:
>
> - there are the sched_class::switch{ing,ed}_to() callbacks;
> - there is (the fairly recent) ENQUEUE_CLASS.
>
> Anyway, let me break up this one patch into individual bits and write
> changelogs. I'll stick them in queue/sched/urgent for now; hopefully
> Pierre can give them a spin and report back if it all sorts his
> problem.
Now live at:
https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/urgent
Please test.
Hello Peter,
On 1/15/26 10:05, Peter Zijlstra wrote:
> On Thu, Jan 15, 2026 at 09:24:31AM +0100, Peter Zijlstra wrote:
>> On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
>>
>>>> --- a/kernel/sched/syscalls.c
>>>> +++ b/kernel/sched/syscalls.c
>>>> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
>>>> * itself.
>>>> */
>>>> newprio = rt_effective_prio(p, newprio);
>>>> - if (newprio == oldprio)
>>>> + if (newprio == oldprio && !dl_prio(newprio))
>>>> queue_flags &= ~DEQUEUE_MOVE;
>>>> }
>>> We have been using (improperly?) ENQUEUE_SAVE also to know when a new
>>> entity gets setscheduled to DEADLINE (or its parameters are changed) and
>>> it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
>>> quick first look, it does sound good to me.
>> If this is strictly about tasks coming into SCHED_DEADLINE there are a
>> number of alternative options:
>>
>> - there are the sched_class::switch{ing,ed}_to() callbacks;
>> - there is (the fairly recent) ENQUEUE_CLASS.
>>
>> Anyway, let me break up this one patch into individual bits and write
>> changelogs. I'll stick them in queue/sched/urgent for now; hopefully
>> Pierre can give them a spin and report back if it all sorts his
>> problem.
> Now live at:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/urgent
>
> Please test.
I don't see the balance_callback or the double clock update warnings
anymore.
Thanks for the branch,
Regards,
Pierre
On 15/01/26 14:13, Pierre Gondois wrote:
> Hello Peter,
>
> On 1/15/26 10:05, Peter Zijlstra wrote:
> > On Thu, Jan 15, 2026 at 09:24:31AM +0100, Peter Zijlstra wrote:
> > > On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
> > >
> > > > > --- a/kernel/sched/syscalls.c
> > > > > +++ b/kernel/sched/syscalls.c
> > > > > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > > > > * itself.
> > > > > */
> > > > > newprio = rt_effective_prio(p, newprio);
> > > > > - if (newprio == oldprio)
> > > > > + if (newprio == oldprio && !dl_prio(newprio))
> > > > > queue_flags &= ~DEQUEUE_MOVE;
> > > > > }
> > > > We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> > > > entity gets setscheduled to DEADLINE (or its parameters are changed) and
> > > > it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
> > > > quick first look, it does sound good to me.
> > > If this is strictly about tasks coming into SCHED_DEADLINE there are a
> > > number of alternative options:
> > >
> > > - there are the sched_class::switch{ing,ed}_to() callbacks;
> > > - there is (the fairly recent) ENQUEUE_CLASS.
> > >
> > > Anyway, let me break up this one patch into individual bits and write
> > > changelogs. I'll stick them in queue/sched/urgent for now; hopefully
> > > Pierre can give them a spin and report back if it all sorts his
> > > problem.
> > Now live at:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/urgent
> >
> > Please test.
> I don't see the balance_callback or the double clock update warnings
> anymore.
FWIW (as I wasn't seeing the reported issue) I had a look as well and
tested locally. Patches look good and nothing to report on the test
side.
Thanks!
Juri
Hi Juri,
On Wed, 14 Jan 2026 15:20:48 +0100
Juri Lelli <juri.lelli@redhat.com> wrote:
[...]
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> > > update_dl_entity(dl_se);
> > > } else if (flags & ENQUEUE_REPLENISH) {
> > > replenish_dl_entity(dl_se);
> > > - } else if ((flags & ENQUEUE_RESTORE) &&
> > > - !is_dl_boosted(dl_se) &&
> > > - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> > > - setup_new_dl_entity(dl_se);
> > > }
> > >
> > > /*
[...]
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > * itself.
> > */
> > newprio = rt_effective_prio(p, newprio);
> > - if (newprio == oldprio)
> > + if (newprio == oldprio && !dl_prio(newprio))
> > queue_flags &= ~DEQUEUE_MOVE;
> > }
>
> We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> entity gets setscheduled to DEADLINE (or its parameters are changed)
> and it looks like this keeps that happening with DEQUEUE_MOVE.
You are right: thinking about it again, I seem to remember that the
"flags & ENQUEUE_RESTORE" check above was introduced to fix tasks
switching to SCHED_DEADLINE...
So, I agree that changing "ENQUEUE_RESTORE" to "ENQUEUE_MOVE" should be
the right thing to do.
Luca
Hi Peter,
On Wed, 14 Jan 2026 14:05:28 +0100
Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
>
> > Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> > RESTORE entirely -- that would allow the task to continue as it had
> > been, irrespective of it being 'late'.
> >
> > Something like so -- what would this break?
> >
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> > update_dl_entity(dl_se);
> > } else if (flags & ENQUEUE_REPLENISH) {
> > replenish_dl_entity(dl_se);
> > - } else if ((flags & ENQUEUE_RESTORE) &&
> > - !is_dl_boosted(dl_se) &&
> > - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> > - setup_new_dl_entity(dl_se);
> > }
> >
> > /*
>
> Ah, this is de-boost, right? Boosting allows one to break the CBS
> rules and then we have to rein in the excesses.
Sorry, I am missing a little bit of context (I am trying to catch up
reading the mailing list archives)... But I agree that the call to
setup_new_dl_entity() mentioned above does not make too much sense.
I suspect the hunk above could be directly removed, as you originally
suggested (on de-boosting, the task returns to its original deadline,
which is larger than the inherited one, so I am not sure whether we
should generate a new deadline or just leave it as it is, even if it
has been missed).
Luca
>
> But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows
> priority to change and is set for rt_mutex_setprio() (among others).
>
> So doing s/RESTORE/MOVE/ above.
>
> The corollary to all this is that everybody that sets MOVE must be
> able to deal with balance callbacks, so audit that too.
>
> This then gives something like so.. which builds and boots for me, but
> clearly I haven't been able to trigger these funny cases.
>
> ---
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
> return __splice_balance_callbacks(rq, true);
> }
>
> -static void __balance_callbacks(struct rq *rq)
> +void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
> {
> + if (rf)
> + rq_unpin_lock(rq, rf);
> do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
> + if (rf)
> + rq_repin_lock(rq, rf);
> }
>
> void balance_callbacks(struct rq *rq, struct balance_callback *head)
> @@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
> * prev into current:
> */
> spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
>
> @@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
> proxy_tag_curr(rq, next);
>
> rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
> trace_sched_exit_tp(is_switch);
> @@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
> trace_sched_pi_setprio(p, pi_task);
> oldprio = p->prio;
>
> - if (oldprio == prio)
> + if (oldprio == prio && !dl_prio(prio))
> queue_flag &= ~DEQUEUE_MOVE;
>
> prev_class = p->sched_class;
> @@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
> out_unlock:
> /* Caller holds task_struct::pi_lock, IRQs are still disabled */
> - rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> - rq_repin_lock(rq, &rf);
> + __balance_callbacks(rq, &rf);
> __task_rq_unlock(rq, p, &rf);
> }
> #endif /* CONFIG_RT_MUTEXES */
> @@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
>
> if (resched)
> resched_curr(rq);
> +
> + __balance_callbacks(rq, &rq_guard.rf);
> }
>
> static struct cgroup_subsys_state *
> @@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
> resched_curr(rq);
> }
> } else {
> + /*
> + * XXX validate prio only really changed when ENQUEUE_MOVE is set.
> + */
> p->sched_class->prio_changed(rq, p, ctx->prio);
> }
> }
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> + } else if ((flags & ENQUEUE_MOVE) &&
> !is_dl_boosted(dl_se) &&
> dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> + /*
> + * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
> + * allows us to change priority. Callers are expected to deal
> + * with balance_callbacks.
> + */
> setup_new_dl_entity(dl_se);
> }
>
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
> static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> {
> if (iter->locked_task) {
> + __balance_callbacks(iter->rq, &iter->rf);
> task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
> iter->locked_task = NULL;
> }
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2430,7 +2430,8 @@ extern const u32 sched_prio_to_wmult[40
> * should preserve as much state as possible.
> *
> * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
> - * in the runqueue.
> + * in the runqueue. IOW the priority is allowed to change. Callers
> + * must expect to deal with balance callbacks.
> *
> * NOCLOCK - skip the update_rq_clock() (avoids double updates)
> *
> @@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
> extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> +
> +extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
> /*
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> * itself.
> */
> newprio = rt_effective_prio(p, newprio);
> - if (newprio == oldprio)
> + if (newprio == oldprio && !dl_prio(newprio))
> queue_flags &= ~DEQUEUE_MOVE;
> }
>
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 627cc25f84466d557d86e5dc67b43a4eea604c80
Gitweb: https://git.kernel.org/tip/627cc25f84466d557d86e5dc67b43a4eea604c80
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:27:22 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:53 +01:00
sched/deadline: Use ENQUEUE_MOVE to allow priority change
Pierre reported hitting balance callback warnings for deadline tasks
after commit 6455ad5346c9 ("sched: Move sched_class::prio_changed()
into the change pattern").
It turns out that DEQUEUE_SAVE+ENQUEUE_RESTORE does not preserve DL
priority and subsequently trips a balance pass -- where one was not
expected.
From discussion with Juri and Luca, the purpose of this clause was to
deal with tasks new to DL and all those sites will have MOVE set (as
well as CLASS, but MOVE is more conservative at this point).
Per the previous patches MOVE is audited to always run the balance
callbacks, so switch enqueue_dl_entity() to use MOVE for this case.
Fixes: 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
Reported-by: Pierre Gondois <pierre.gondois@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/deadline.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5d6f3cc..c509f2e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,7 +2214,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
+ } else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: e008ec6c7904ed99d3b2cb634b6545b008a99288
Gitweb: https://git.kernel.org/tip/e008ec6c7904ed99d3b2cb634b6545b008a99288
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:25:37 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:53 +01:00
sched: Deadline has dynamic priority
While FIFO/RR have static priority, DEADLINE is a dynamic priority
scheme. Notably it has static priority -1. Do not assume the priority
doesn't change for deadline tasks just because the static priority
doesn't change.
This ensures DL always sees {DE,EN}QUEUE_MOVE where appropriate.
Fixes: ff77e4685359 ("sched/rt: Fix PI handling vs. sched_setscheduler()")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 2 +-
kernel/sched/syscalls.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d925d7..045f83a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7320,7 +7320,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- if (oldprio == prio)
+ if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index cb337de..6f10db3 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -639,7 +639,7 @@ change:
* itself.
*/
newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
+ if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 53439363c0a111f11625982b69c88ee2ce8608ec
Gitweb: https://git.kernel.org/tip/53439363c0a111f11625982b69c88ee2ce8608ec
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:17:49 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:53 +01:00
sched: Audit MOVE vs balance_callbacks
The {DE,EN}QUEUE_MOVE flag indicates a task is allowed to change
priority, which means there could be balance callbacks queued.
Therefore audit all MOVE users and make sure they do run balance
callbacks before dropping rq-lock.
Fixes: 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 4 +++-
kernel/sched/ext.c | 1 +
kernel/sched/sched.h | 5 ++++-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 842a3ad..4d925d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4950,7 +4950,7 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
if (rf)
rq_unpin_lock(rq, rf);
@@ -9126,6 +9126,8 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (resched)
resched_curr(rq);
+
+ __balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8f6d8d7..afe28c0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
+ __balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e885a93..93fce4b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2388,7 +2388,8 @@ extern const u32 sched_prio_to_wmult[40];
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- * in the runqueue.
+ * in the runqueue. IOW the priority is allowed to change. Callers
+ * must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -3969,6 +3970,8 @@ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 49041e87f9cd3e6be8926b80b3fee71e89323e1c
Gitweb: https://git.kernel.org/tip/49041e87f9cd3e6be8926b80b3fee71e89323e1c
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:16:44 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:52 +01:00
sched: Fold rq-pin swizzle into __balance_callbacks()
Prepare for more users needing the rq-pin swizzle.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 60afadb..842a3ad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4950,9 +4950,13 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq)
+static void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
+ if (rf)
+ rq_unpin_lock(rq, rf);
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+ if (rf)
+ rq_repin_lock(rq, rf);
}
void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -4991,7 +4995,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6867,7 +6871,7 @@ keep_resched:
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7362,9 +7366,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
- rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
- rq_repin_lock(rq, &rf);
+ __balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
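To visualize the swizzle that __balance_callbacks() now encapsulates, here is a
small standalone C sketch (toy names and types, not kernel code): a caller that
still holds the rq pin passes its flags and the helper drops and re-takes the
pin around running the queued callbacks, while callers that have already
unpinned, as in the __schedule() and finish_lock_switch() hunks above, pass NULL.

/* Toy model of the rq-pin swizzle folded into __balance_callbacks().
 * All names and types below are illustrative stand-ins, not the kernel's.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_rq_flags {
	bool pinned;
};

struct toy_rq {
	int pending_callbacks;
};

static void toy_unpin(struct toy_rq_flags *rf)
{
	assert(rf->pinned);
	rf->pinned = false;
}

static void toy_repin(struct toy_rq_flags *rf)
{
	assert(!rf->pinned);
	rf->pinned = true;
}

static void toy_do_balance_callbacks(struct toy_rq *rq)
{
	printf("running %d balance callback(s)\n", rq->pending_callbacks);
	rq->pending_callbacks = 0;
}

/* Counterpart of __balance_callbacks(rq, rf): swizzle the pin only when
 * the caller still holds it. */
static void toy_balance_callbacks(struct toy_rq *rq, struct toy_rq_flags *rf)
{
	if (rf)
		toy_unpin(rf);	/* callbacks may drop and re-take the rq lock */
	toy_do_balance_callbacks(rq);
	if (rf)
		toy_repin(rf);
}

int main(void)
{
	struct toy_rq rq = { .pending_callbacks = 2 };
	struct toy_rq_flags rf = { .pinned = true };

	/* rt_mutex_setprio()-style caller: still pinned, let the helper swizzle */
	toy_balance_callbacks(&rq, &rf);

	/* __schedule()-style caller: already unpinned, so pass NULL */
	rq.pending_callbacks = 1;
	toy_balance_callbacks(&rq, NULL);
	return 0;
}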
Hello Pierre,
On 1/13/2026 4:15 PM, Pierre Gondois wrote:
> Hello Prateek,
>
> On 1/13/26 05:12, K Prateek Nayak wrote:
>> Hello Pierre,
>>
>> On 1/13/2026 2:14 AM, Pierre Gondois wrote:
>>> Hello Peter,
>>>
>>> It seems this patch:
>>> 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
>>> is triggering the following warning:
>>> rq_pin_lock()
>>> \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
>> Can you check if the following solution helps your case too:
>> https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
>>
> I can still see the issue.
> It seems the task deadline is also updated in:
> sched_change_end()
> \-enqueue_task_dl()
> \-enqueue_dl_entity()
> \-setup_new_dl_entity()
> \-replenish_dl_new_period()
> if the task's period finished.
Ah! Got it. Thank you for testing the fix.
I'm curious, why is setup_new_dl_entity() doing an
update_rq_clock()? That can advance the rq->clock and make it look like
we need a replenish.
Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
hitting this warning?
>
> So in sched_change_end(), the task priority (i.e. p->dl.deadline) is updated.
> This results in having an old_deadline earlier than the new p->dl.deadline.
> Thus the rq->balance_callback:
>
> prio_changed_dl() {
> ...
> if (dl_time_before(old_deadline, p->dl.deadline))
> deadline_queue_pull_task(rq);
> ...
> }
>
Thank you for your analysis.
--
Thanks and Regards,
Prateek
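To make the comparison Pierre quotes above concrete, here is a standalone C
sketch of the decision prio_changed_dl() now makes: a pull is only queued when
the deadline moved later. It assumes dl_time_before() is the usual wrap-safe
signed compare and redefines it locally; this is illustrative only, not kernel
code.

/* Toy illustration of the old_deadline vs. new deadline check. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t old_deadline = 1000;	/* snapshot taken at sched_change_begin() */
	uint64_t new_deadline = 2000;	/* replenished during enqueue: pushed out */

	if (new_deadline == old_deadline)
		printf("deadline unchanged, nothing to do\n");
	else if (dl_time_before(old_deadline, new_deadline))
		printf("deadline moved later -> deadline_queue_pull_task()\n");
	else
		printf("deadline moved earlier -> no pull queued\n");

	return 0;
}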
On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:

> Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
> hitting this warning?

setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
in this case.
On Tue, Jan 13, 2026 at 12:53:09PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:
>
> > Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
> > hitting this warning?
>
> setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
> in this case.
Something like so to fix: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(s
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- update_rq_clock(rq);
-
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1834,6 +1832,7 @@ void sched_init_dl_servers(void)
rq = cpu_rq(cpu);
guard(rq_lock_irq)(rq);
+ update_rq_clock(rq);
dl_se = &rq->fair_server;
On 1/13/26 12:56, Peter Zijlstra wrote:
> On Tue, Jan 13, 2026 at 12:53:09PM +0100, Peter Zijlstra wrote:
>> On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:
>>
>>> Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
>>> hitting this warning?
>> setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
>> in this case.
> Something like so to fix: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
>
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(s
> struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> struct rq *rq = rq_of_dl_rq(dl_rq);
>
> - update_rq_clock(rq);
> -
> WARN_ON(is_dl_boosted(dl_se));
> WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
>
> @@ -1834,6 +1832,7 @@ void sched_init_dl_servers(void)
> rq = cpu_rq(cpu);
>
> guard(rq_lock_irq)(rq);
> + update_rq_clock(rq);
>
> dl_se = &rq->fair_server;
>
Yes right, enabling WARN_DOUBLE_CLOCK detects the double clock update
and this fixes it.
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 4de9ff76067b40c3660df73efaea57389e62ea7a
Gitweb: https://git.kernel.org/tip/4de9ff76067b40c3660df73efaea57389e62ea7a
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 13 Jan 2026 12:57:14 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:52 +01:00
sched/deadline: Avoid double update_rq_clock()
When setup_new_dl_entity() is called from enqueue_task_dl() ->
enqueue_dl_entity(), the rq-clock should already be updated, and
calling update_rq_clock() again is not right.
Move the update_rq_clock() to the one other caller of
setup_new_dl_entity(): sched_init_dl_servers().
Fixes: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
Reported-by: Pierre Gondois <pierre.gondois@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Link: https://patch.msgid.link/20260113115622.GA831285@noisy.programming.kicks-ass.net
---
kernel/sched/deadline.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b7acf74..5d6f3cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- update_rq_clock(rq);
-
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1839,6 +1837,7 @@ void sched_init_dl_servers(void)
rq = cpu_rq(cpu);
guard(rq_lock_irq)(rq);
+ update_rq_clock(rq);
dl_se = &rq->fair_server;
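As a rough standalone illustration of the reasoning in this commit, the toy C
model below mimics the WARN_DOUBLE_CLOCK idea: the clock is meant to be updated
once per rq-lock section, so the old setup_new_dl_entity() tripped the warning
on the enqueue path (which had already updated the clock), while after the fix
it relies on the caller having done so. All names are simplified stand-ins,
not the kernel's.

/* Toy model of the double clock-update warning. */
#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
	unsigned long long clock;
	bool clock_updated;	/* cleared whenever the rq lock is (re)taken */
};

static void toy_update_rq_clock(struct toy_rq *rq)
{
	if (rq->clock_updated) {
		fprintf(stderr, "WARN: double rq clock update\n");
		return;
	}
	rq->clock++;		/* stand-in for reading the real clock source */
	rq->clock_updated = true;
}

/* Before the fix: updated the clock itself, redundant on the enqueue path. */
static void toy_setup_new_dl_entity_old(struct toy_rq *rq)
{
	toy_update_rq_clock(rq);
}

/* After the fix: relies on the caller having updated the clock already. */
static void toy_setup_new_dl_entity_new(struct toy_rq *rq)
{
	(void)rq;
}

int main(void)
{
	struct toy_rq rq = { 0 };

	/* enqueue path: clock already updated once under this rq lock */
	toy_update_rq_clock(&rq);
	toy_setup_new_dl_entity_old(&rq);	/* trips the warning */

	rq.clock_updated = false;		/* new lock section */
	toy_update_rq_clock(&rq);		/* now done by the init caller */
	toy_setup_new_dl_entity_new(&rq);	/* fine */
	return 0;
}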
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 6455ad5346c9cf755fa9dda6e326c4028fb3c853
Gitweb: https://git.kernel.org/tip/6455ad5346c9cf755fa9dda6e326c4028fb3c853
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 01 Nov 2024 14:16:10 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:52 +02:00
sched: Move sched_class::prio_changed() into the change pattern
Move sched_class::prio_changed() into the change pattern.
And while there, extend it with sched_class::get_prio() in order to
fix the deadline situation.
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 24 +++++++++++++-----------
kernel/sched/deadline.c | 20 +++++++++++---------
kernel/sched/ext.c | 8 +-------
kernel/sched/fair.c | 8 ++++++--
kernel/sched/idle.c | 5 ++++-
kernel/sched/rt.c | 5 ++++-
kernel/sched/sched.h | 7 ++++---
kernel/sched/stop_task.c | 5 ++++-
kernel/sched/syscalls.c | 9 ---------
9 files changed, 47 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd2c551..4a4dbce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,12 +2169,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
-{
- if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -7400,9 +7394,6 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->sched_class = next_class;
p->prio = prio;
}
-
- if (!(queue_flag & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10855,6 +10846,13 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
.running = task_current_donor(rq, p),
};
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
dequeue_task(rq, p, flags);
if (ctx->running)
@@ -10881,6 +10879,10 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->running)
set_next_task(rq, p);
- if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
- p->sched_class->switched_to(rq, p);
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fd147a7..1f94994 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3042,23 +3042,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
}
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
+}
+
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
if (!task_on_rq_queued(p))
return;
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
if (task_current_donor(rq, p)) {
@@ -3113,6 +3114,7 @@ DEFINE_SCHED_CLASS(dl) = {
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b0a1e2a..ad371b6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2961,7 +2961,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3926,9 +3926,6 @@ static void scx_disable_workfn(struct kthread_work *work)
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4675,9 +4672,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
put_task_struct(p);
}
scx_task_iter_stop(&sti);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6c462e4..77a713e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13150,11 +13150,14 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13166,8 +13169,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f02dced..dee6e01 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -504,8 +504,11 @@ static void switching_to_idle(struct rq *rq, struct task_struct *p)
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6b2e811..c2347e4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e3f4215..bcde43d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2451,8 +2451,10 @@ struct sched_class {
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
+
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ u64 oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
@@ -3877,8 +3879,6 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
-
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
@@ -3899,6 +3899,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
* the task's queueing state is idempotent across the operation.
*/
struct sched_change_ctx {
+ u64 prio;
struct task_struct *p;
int flags;
bool queued;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fcc4c54..73aa8de 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -81,8 +81,11 @@ static void switching_to_stop(struct rq *rq, struct task_struct *p)
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG(); /* how!?, what priority? */
}
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6583faf..20af564 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -95,12 +95,6 @@ void set_user_nice(struct task_struct *p, long nice)
old_prio = p->prio;
p->prio = effective_prio(p);
}
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
}
EXPORT_SYMBOL(set_user_nice);
@@ -706,9 +700,6 @@ change:
}
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
-
/* Avoid rq from going away on us: */
preempt_disable();
head = splice_balance_callbacks(rq);
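As a compact standalone illustration of the change pattern this commit
establishes, the toy C sketch below snapshots a class-specific priority in a
begin helper (via an optional get_prio hook, falling back to p->prio) and
reports it to prio_changed() in the end helper unless the task switched class;
for a deadline-style class the "priority" is the absolute deadline. Everything
here uses simplified stand-in names and types, not the kernel's actual
sched_change_begin()/sched_change_end() machinery.

/* Toy model of the get_prio/prio_changed flow in the change pattern. */
#include <stdint.h>
#include <stdio.h>

struct toy_task;

struct toy_class {
	uint64_t (*get_prio)(struct toy_task *p);	/* optional */
	void (*prio_changed)(struct toy_task *p, uint64_t oldprio);
};

struct toy_task {
	const struct toy_class *cls;
	int prio;
	uint64_t dl_deadline;
};

struct toy_change_ctx {
	struct toy_task *p;
	uint64_t prio;
	int class_change;
};

static void toy_change_begin(struct toy_change_ctx *ctx, struct toy_task *p,
			     int class_change)
{
	ctx->p = p;
	ctx->class_change = class_change;
	if (!class_change)
		ctx->prio = p->cls->get_prio ? p->cls->get_prio(p)
					     : (uint64_t)p->prio;
	/* ... dequeue/put_prev would happen here ... */
}

static void toy_change_end(struct toy_change_ctx *ctx)
{
	/* ... enqueue/set_next would happen here ... */
	if (!ctx->class_change)
		ctx->p->cls->prio_changed(ctx->p, ctx->prio);
}

/* Deadline-style hooks: the "priority" is the absolute deadline. */
static uint64_t toy_get_prio_dl(struct toy_task *p)
{
	return p->dl_deadline;
}

static void toy_prio_changed_dl(struct toy_task *p, uint64_t old_deadline)
{
	if (p->dl_deadline == old_deadline)
		return;
	printf("deadline changed: %llu -> %llu\n",
	       (unsigned long long)old_deadline,
	       (unsigned long long)p->dl_deadline);
}

int main(void)
{
	static const struct toy_class dl_class = {
		.get_prio = toy_get_prio_dl,
		.prio_changed = toy_prio_changed_dl,
	};
	struct toy_task p = { .cls = &dl_class, .prio = -1, .dl_deadline = 100 };
	struct toy_change_ctx ctx;

	toy_change_begin(&ctx, &p, /*class_change=*/0);
	p.dl_deadline = 200;	/* e.g. parameters changed while dequeued */
	toy_change_end(&ctx);	/* reports old deadline 100 vs. new 200 */
	return 0;
}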