[PATCH 01/14] sched: Employ sched_change guards

Posted by Peter Zijlstra 5 months ago, as patch 1 of a 14-patch series
As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.
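
Roughly, the conversion looks like this (a sketch of the pattern, not
the exact macro expansion):

	/* before */
	queued = task_on_rq_queued(p);
	running = task_current_donor(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);

	/* ... change task properties ... */

	if (queued)
		enqueue_task(rq, p, queue_flags);
	if (running)
		set_next_task(rq, p);

	/* after */
	scoped_guard (sched_change, p, queue_flags) {
		/* ... change task properties ... */
	}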

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cleanup.h |    5 +
 kernel/sched/core.c     |  156 +++++++++++++++++-------------------------------
 kernel/sched/ext.c      |   39 +++++-------
 kernel/sched/sched.h    |   21 +++---
 kernel/sched/syscalls.c |   65 +++++++-------------
 5 files changed, 114 insertions(+), 172 deletions(-)

--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label:
 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond)	\
 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name)		\
+	__DEFINE_CLASS_IS_CONDITIONAL(_name, false);	\
+	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	{ return (void *)1; }
+
 #define __GUARD_IS_ERR(_ptr)                                       \
 	({                                                         \
 		unsigned long _rc = (__force unsigned long)(_ptr); \
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7361,7 +7361,7 @@ void rt_mutex_post_schedule(void)
  */
 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int prio, oldprio, queued, running, queue_flag =
+	int prio, oldprio, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	const struct sched_class *prev_class, *next_class;
 	struct rq_flags rf;
@@ -7426,52 +7426,42 @@ void rt_mutex_setprio(struct task_struct
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-	if (queued)
-		dequeue_task(rq, p, queue_flag);
-	if (running)
-		put_prev_task(rq, p);
-
-	/*
-	 * Boosting condition are:
-	 * 1. -rt task is running and holds mutex A
-	 *      --> -dl task blocks on mutex A
-	 *
-	 * 2. -dl task is running and holds mutex A
-	 *      --> -dl task blocks on mutex A and could preempt the
-	 *          running task
-	 */
-	if (dl_prio(prio)) {
-		if (!dl_prio(p->normal_prio) ||
-		    (pi_task && dl_prio(pi_task->prio) &&
-		     dl_entity_preempt(&pi_task->dl, &p->dl))) {
-			p->dl.pi_se = pi_task->dl.pi_se;
-			queue_flag |= ENQUEUE_REPLENISH;
+	scoped_guard (sched_change, p, queue_flag) {
+		/*
+		 * Boosting condition are:
+		 * 1. -rt task is running and holds mutex A
+		 *      --> -dl task blocks on mutex A
+		 *
+		 * 2. -dl task is running and holds mutex A
+		 *      --> -dl task blocks on mutex A and could preempt the
+		 *          running task
+		 */
+		if (dl_prio(prio)) {
+			if (!dl_prio(p->normal_prio) ||
+			    (pi_task && dl_prio(pi_task->prio) &&
+			     dl_entity_preempt(&pi_task->dl, &p->dl))) {
+				p->dl.pi_se = pi_task->dl.pi_se;
+				scope->flags |= ENQUEUE_REPLENISH;
+			} else {
+				p->dl.pi_se = &p->dl;
+			}
+		} else if (rt_prio(prio)) {
+			if (dl_prio(oldprio))
+				p->dl.pi_se = &p->dl;
+			if (oldprio < prio)
+				scope->flags |= ENQUEUE_HEAD;
 		} else {
-			p->dl.pi_se = &p->dl;
+			if (dl_prio(oldprio))
+				p->dl.pi_se = &p->dl;
+			if (rt_prio(oldprio))
+				p->rt.timeout = 0;
 		}
-	} else if (rt_prio(prio)) {
-		if (dl_prio(oldprio))
-			p->dl.pi_se = &p->dl;
-		if (oldprio < prio)
-			queue_flag |= ENQUEUE_HEAD;
-	} else {
-		if (dl_prio(oldprio))
-			p->dl.pi_se = &p->dl;
-		if (rt_prio(oldprio))
-			p->rt.timeout = 0;
-	}
 
-	p->sched_class = next_class;
-	p->prio = prio;
+		p->sched_class = next_class;
+		p->prio = prio;
 
-	check_class_changing(rq, p, prev_class);
-
-	if (queued)
-		enqueue_task(rq, p, queue_flag);
-	if (running)
-		set_next_task(rq, p);
+		check_class_changing(rq, p, prev_class);
+	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -8119,26 +8109,9 @@ int migrate_task_to(struct task_struct *
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
-	bool queued, running;
-	struct rq_flags rf;
-	struct rq *rq;
-
-	rq = task_rq_lock(p, &rf);
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-
-	if (queued)
-		dequeue_task(rq, p, DEQUEUE_SAVE);
-	if (running)
-		put_prev_task(rq, p);
-
-	p->numa_preferred_nid = nid;
-
-	if (queued)
-		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
-	if (running)
-		set_next_task(rq, p);
-	task_rq_unlock(rq, p, &rf);
+	guard(task_rq_lock)(p);
+	scoped_guard (sched_change, p, DEQUEUE_SAVE)
+		p->numa_preferred_nid = nid;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
  */
 void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 {
-	int queued, running, queue_flags =
+	unsigned int queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	bool resched = false;
 	struct rq *rq;
 
 	CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
 
 	update_rq_clock(rq);
 
-	running = task_current_donor(rq, tsk);
-	queued = task_on_rq_queued(tsk);
-
-	if (queued)
-		dequeue_task(rq, tsk, queue_flags);
-	if (running)
-		put_prev_task(rq, tsk);
-
-	sched_change_group(tsk);
-	if (!for_autogroup)
-		scx_cgroup_move_task(tsk);
-
-	if (queued)
-		enqueue_task(rq, tsk, queue_flags);
-	if (running) {
-		set_next_task(rq, tsk);
-		/*
-		 * After changing group, the running task may have joined a
-		 * throttled one but it's still the running task. Trigger a
-		 * resched to make sure that task can still run.
-		 */
-		resched_curr(rq);
+	scoped_guard (sched_change, tsk, queue_flags) {
+		sched_change_group(tsk);
+		if (!for_autogroup)
+			scx_cgroup_move_task(tsk);
+		if (scope->running)
+			resched = true;
 	}
 }
 
@@ -10929,37 +10887,39 @@ void sched_mm_cid_fork(struct task_struc
 }
 #endif /* CONFIG_SCHED_MM_CID */
 
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-			    struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
 {
+	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
 	struct rq *rq = task_rq(p);
 
 	lockdep_assert_rq_held(rq);
 
-	*ctx = (struct sched_enq_and_set_ctx){
+	*ctx = (struct sched_change_ctx){
 		.p = p,
-		.queue_flags = queue_flags,
+		.flags = flags,
 		.queued = task_on_rq_queued(p),
 		.running = task_current(rq, p),
 	};
 
-	update_rq_clock(rq);
 	if (ctx->queued)
-		dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+		dequeue_task(rq, p, flags);
 	if (ctx->running)
 		put_prev_task(rq, p);
+
+	return ctx;
 }
 
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
 {
-	struct rq *rq = task_rq(ctx->p);
+	struct task_struct *p = ctx->p;
+	struct rq *rq = task_rq(p);
 
 	lockdep_assert_rq_held(rq);
 
 	if (ctx->queued)
-		enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
 	if (ctx->running)
-		set_next_task(rq, ctx->p);
+		set_next_task(rq, p);
 }
-#endif /* CONFIG_SCHED_CLASS_EXT */
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4867,11 +4867,10 @@ static void scx_bypass(bool bypass)
 		 */
 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
 						 scx.runnable_node) {
-			struct sched_enq_and_set_ctx ctx;
-
 			/* cycling deq/enq is enough, see the function comment */
-			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-			sched_enq_and_set_task(&ctx);
+			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+				/* nothing */ ;
+			}
 		}
 
 		/* resched to restore ticks and idle state */
@@ -5003,17 +5002,16 @@ static void scx_disable_workfn(struct kt
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
-		struct sched_enq_and_set_ctx ctx;
-
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
 
-		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+		update_rq_clock(task_rq(p));
 
-		p->sched_class = new_class;
-		check_class_changing(task_rq(p), p, old_class);
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		sched_enq_and_set_task(&ctx);
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+			p->sched_class = new_class;
+			check_class_changing(task_rq(p), p, old_class);
+		}
 
 		check_class_changed(task_rq(p), p, old_class, p->prio);
 		scx_exit_task(p);
@@ -5747,21 +5745,20 @@ static int scx_enable(struct sched_ext_o
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
-		struct sched_enq_and_set_ctx ctx;
 
 		if (!tryget_task_struct(p))
 			continue;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
-		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+		update_rq_clock(task_rq(p));
 
-		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = new_class;
-		check_class_changing(task_rq(p), p, old_class);
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		sched_enq_and_set_task(&ctx);
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+			p->scx.slice = SCX_SLICE_DFL;
+			p->sched_class = new_class;
+			check_class_changing(task_rq(p), p, old_class);
+		}
 
 		check_class_changed(task_rq(p), p, old_class, p->prio);
 		put_task_struct(p);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
 
-#ifdef CONFIG_SCHED_CLASS_EXT
-/*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
- */
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
 	struct task_struct	*p;
-	int			queue_flags;
+	int			flags;
 	bool			queued;
 	bool			running;
 };
 
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
-			    struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
 
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+	     sched_change_end(_T),
+	     sched_change_begin(p, flags),
+	     struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
 
 #include "ext.h"
 
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_st
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	bool queued, running;
 	struct rq *rq;
 	int old_prio;
 
@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p
 		return;
 	}
 
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-	if (queued)
-		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
-	if (running)
-		put_prev_task(rq, p);
-
-	p->static_prio = NICE_TO_PRIO(nice);
-	set_load_weight(p, true);
-	old_prio = p->prio;
-	p->prio = effective_prio(p);
-
-	if (queued)
-		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
-	if (running)
-		set_next_task(rq, p);
+	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		set_load_weight(p, true);
+		old_prio = p->prio;
+		p->prio = effective_prio(p);
+	}
 
 	/*
 	 * If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_str
 			 bool user, bool pi)
 {
 	int oldpolicy = -1, policy = attr->sched_policy;
-	int retval, oldprio, newprio, queued, running;
+	int retval, oldprio, newprio;
 	const struct sched_class *prev_class, *next_class;
 	struct balance_callback *head;
 	struct rq_flags rf;
@@ -698,33 +687,25 @@ int __sched_setscheduler(struct task_str
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-	queued = task_on_rq_queued(p);
-	running = task_current_donor(rq, p);
-	if (queued)
-		dequeue_task(rq, p, queue_flags);
-	if (running)
-		put_prev_task(rq, p);
-
-	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
-		__setscheduler_params(p, attr);
-		p->sched_class = next_class;
-		p->prio = newprio;
-	}
-	__setscheduler_uclamp(p, attr);
-	check_class_changing(rq, p, prev_class);
+	scoped_guard (sched_change, p, queue_flags) {
 
-	if (queued) {
-		/*
-		 * We enqueue to tail when the priority of a task is
-		 * increased (user space view).
-		 */
-		if (oldprio < p->prio)
-			queue_flags |= ENQUEUE_HEAD;
+		if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+			__setscheduler_params(p, attr);
+			p->sched_class = next_class;
+			p->prio = newprio;
+		}
+		__setscheduler_uclamp(p, attr);
+		check_class_changing(rq, p, prev_class);
 
-		enqueue_task(rq, p, queue_flags);
+		if (scope->queued) {
+			/*
+			 * We enqueue to tail when the priority of a task is
+			 * increased (user space view).
+			 */
+			if (oldprio < p->prio)
+				scope->flags |= ENQUEUE_HEAD;
+		}
 	}
-	if (running)
-		set_next_task(rq, p);
 
 	check_class_changed(rq, p, prev_class, oldprio);
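
A note on the cleanup.h hunk: DEFINE_CLASS_IS_UNCONDITIONAL marks a guard
class whose constructor cannot fail, by giving it a lock_ptr() that
returns a non-NULL cookie. Conceptually (a sketch of the idea, not the
exact cleanup.h expansion), scoped_guard() only skips its body for
conditional classes whose lock pointer came back NULL:

	/* the scope body runs when: */
	if (class_sched_change_lock_ptr(&scope) ||
	    !class_sched_change_is_conditional) {
		/* ... body ... */
	}

Since sched_change_begin() always returns a valid per-CPU context, the
sched_change body is always entered.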
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by Shrikanth Hegde 4 months ago

On 9/10/25 9:14 PM, Peter Zijlstra wrote:
> As proposed a long while ago -- and half done by scx -- wrap the
> scheduler's 'change' pattern in a guard helper.
> 
[...]
>   		put_task_struct(p);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
>   extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
>   extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>   
> -#ifdef CONFIG_SCHED_CLASS_EXT
> -/*
> - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> - * and establish invariants.
> - */
> -struct sched_enq_and_set_ctx {
> +struct sched_change_ctx {
>   	struct task_struct	*p;
> -	int			queue_flags;
> +	int			flags;
>   	bool			queued;
>   	bool			running;
>   };
>   
> -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> -			    struct sched_enq_and_set_ctx *ctx);
> -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
> +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
> +void sched_change_end(struct sched_change_ctx *ctx);
>   
> -#endif /* CONFIG_SCHED_CLASS_EXT */
> +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
> +	     sched_change_end(_T),
> +	     sched_change_begin(p, flags),
> +	     struct task_struct *p, unsigned int flags)
> +
> +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
>   
>   #include "ext.h"
>   
could you please add a comment here on how the dequeue and enqueue flags
match?

Since ctx->flags doesn't get cleared, one could be left wondering how the
enqueue happens (e.g. ENQUEUE_RESTORE) until they see that it works
because the flag values match.
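
(For reference, what makes the flag reuse work: the matched dequeue and
enqueue flags share bit values. An illustrative sketch of the relevant
definitions, as in kernel/sched/sched.h around this time:

	#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
	#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
	#define DEQUEUE_NOCLOCK		0x08 /* matches ENQUEUE_NOCLOCK */

	#define ENQUEUE_RESTORE		0x02
	#define ENQUEUE_MOVE		0x04
	#define ENQUEUE_NOCLOCK		0x08

So a guard entered with DEQUEUE_SAVE hands the same bits to
enqueue_task(), where they read as ENQUEUE_RESTORE.)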
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by Peter Zijlstra 4 months ago
On Mon, Oct 06, 2025 at 08:51:27PM +0530, Shrikanth Hegde wrote:
> 
> 
> On 9/10/25 9:14 PM, Peter Zijlstra wrote:
> > As proposed a long while ago -- and half done by scx -- wrap the
> > scheduler's 'change' pattern in a guard helper.
> > 
> [...]
> >   		put_task_struct(p);
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
> >   extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> >   extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
> > -#ifdef CONFIG_SCHED_CLASS_EXT
> > -/*
> > - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> > - * and establish invariants.
> > - */
> > -struct sched_enq_and_set_ctx {
> > +struct sched_change_ctx {
> >   	struct task_struct	*p;
> > -	int			queue_flags;
> > +	int			flags;
> >   	bool			queued;
> >   	bool			running;
> >   };
> > -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> > -			    struct sched_enq_and_set_ctx *ctx);
> > -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
> > +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
> > +void sched_change_end(struct sched_change_ctx *ctx);
> > -#endif /* CONFIG_SCHED_CLASS_EXT */
> > +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
> > +	     sched_change_end(_T),
> > +	     sched_change_begin(p, flags),
> > +	     struct task_struct *p, unsigned int flags)
> > +
> > +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
> >   #include "ext.h"
> could you please add a comment on matching flags on dequeue/enqueue
> here?

Would something like so be okay? This assumes at least the second patch
is applied as well.

---

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10783,6 +10783,12 @@ struct sched_change_ctx *sched_change_be
 	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
 	struct rq *rq = task_rq(p);
 
+	/*
+	 * Must exclusively use matched flags since this is both dequeue and
+	 * enqueue.
+	 */
+	WARN_ON_ONCE(flags & 0xFFFF0000);
+
 	lockdep_assert_rq_held(rq);
 
 	if (!(flags & DEQUEUE_NOCLOCK)) {
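
(The 0xFFFF0000 mask assumes the flag layout from later in the series:
matched dequeue/enqueue bits stay in the low 16 bits, while the
enqueue-only flags move to the high bits, illustratively:

	#define ENQUEUE_HEAD		0x00010000
	#define ENQUEUE_REPLENISH	0x00020000

so any caller passing an unmatched, enqueue-only flag into
sched_change_begin() trips the warning.)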
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by Shrikanth Hegde 4 months ago

On 10/6/25 11:44 PM, Peter Zijlstra wrote:
> On Mon, Oct 06, 2025 at 08:51:27PM +0530, Shrikanth Hegde wrote:
>>
>>
>> On 9/10/25 9:14 PM, Peter Zijlstra wrote:
>>> As proposed a long while ago -- and half done by scx -- wrap the
>>> scheduler's 'change' pattern in a guard helper.
>>>
>> [...]
>>>   		put_task_struct(p);
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
>>>    extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
>>>    extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>>> -#ifdef CONFIG_SCHED_CLASS_EXT
>>> -/*
>>> - * Used by SCX in the enable/disable paths to move tasks between sched_classes
>>> - * and establish invariants.
>>> - */
>>> -struct sched_enq_and_set_ctx {
>>> +struct sched_change_ctx {
>>>    	struct task_struct	*p;
>>> -	int			queue_flags;
>>> +	int			flags;
>>>    	bool			queued;
>>>    	bool			running;
>>>    };
>>> -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
>>> -			    struct sched_enq_and_set_ctx *ctx);
>>> -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
>>> +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
>>> +void sched_change_end(struct sched_change_ctx *ctx);
>>> -#endif /* CONFIG_SCHED_CLASS_EXT */
>>> +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
>>> +	     sched_change_end(_T),
>>> +	     sched_change_begin(p, flags),
>>> +	     struct task_struct *p, unsigned int flags)
>>> +
>>> +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
>>>    #include "ext.h"
>> could you please add a comment on matching flags on dequeue/enqueue
>> here?
> 
> Would something like so be okay? This assumes at least the second patch
> is applied as well.
> 
> ---
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10783,6 +10783,12 @@ struct sched_change_ctx *sched_change_be
>   	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
>   	struct rq *rq = task_rq(p);
>   
> +	/*
> +	 * Must exclusively use matched flags since this is both dequeue and
> +	 * enqueue.
> +	 */

Yes, something like that. Unless call sites explicitly change the flags
via the scope, the enqueue will happen with matching flags.

> +	WARN_ON_ONCE(flags & 0xFFFF0000);
> +

A mythical example:

scoped_guard (sched_change, p, DEQUEUE_THROTTLE) {
	scope->flags &= ~DEQUEUE_THROTTLE;
	scope->flags |= ENQUEUE_HEAD;
}

But one could still do this, right? For such users the warning may be
wrong.

>   	lockdep_assert_rq_held(rq);
>   
>   	if (!(flags & DEQUEUE_NOCLOCK)) {
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by Peter Zijlstra 4 months ago
On Tue, Oct 07, 2025 at 10:42:29AM +0530, Shrikanth Hegde wrote:
> On 10/6/25 11:44 PM, Peter Zijlstra wrote:

> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -10783,6 +10783,12 @@ struct sched_change_ctx *sched_change_be
> >   	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
> >   	struct rq *rq = task_rq(p);
> > +	/*
> > +	 * Must exclusively use matched flags since this is both dequeue and
> > +	 * enqueue.
> > +	 */
> 
> yes. Something like that. Unless callsites explicitly change the flags using
> the scope, enqueue will happen with matching flags.
> 
> > +	WARN_ON_ONCE(flags & 0xFFFF0000);
> > +
> 
> A mythical example:
> scope_guard(sched_change, p, DEQUEUE_THROTTLE)
> 	scope->flags &= ~DEQUEUE_THROTTLE;
> 	scope->flags |= ENQUEUE_HEAD;
> 
> But, One could still do this right? for such users the warning may be wrong.

Right, I suppose this would be possible. Let's worry about it if/when it
ever comes up though.
[tip: sched/core] sched: Mandate shared flags for sched_change
Posted by tip-bot2 for Peter Zijlstra 3 months, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     73ec89a1ce4bce98f74b6520a95e64cd9986aae5
Gitweb:        https://git.kernel.org/tip/73ec89a1ce4bce98f74b6520a95e64cd9986aae5
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Mon, 06 Oct 2025 20:12:34 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:54 +02:00

sched: Mandate shared flags for sched_change

Shrikanth noted that sched_change pattern relies on using shared
flags.

Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d5659f..e2199e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10781,6 +10781,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int 
 	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
 	struct rq *rq = task_rq(p);
 
+	/*
+	 * Must exclusively use matched flags since this is both dequeue and
+	 * enqueue.
+	 */
+	WARN_ON_ONCE(flags & 0xFFFF0000);
+
 	lockdep_assert_rq_held(rq);
 
 	if (!(flags & DEQUEUE_NOCLOCK)) {
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by K Prateek Nayak 5 months ago
Hello Peter,

On 9/10/2025 9:14 PM, Peter Zijlstra wrote:
> @@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
>   */
>  void sched_move_task(struct task_struct *tsk, bool for_autogroup)
>  {
> -	int queued, running, queue_flags =
> +	unsigned int queue_flags =
>  		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;

nit.

Since we don't do a complete dequeue for a delayed task in
sched_move_task(), can we get rid of that DEQUEUE_NOCLOCK and ...

> +	bool resched = false;
>  	struct rq *rq;
>  
>  	CLASS(task_rq_lock, rq_guard)(tsk);
> @@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
>  
>  	update_rq_clock(rq);

... this clock update and instead rely on sched_change_begin() to
handle it within the guard?

>  
> -	running = task_current_donor(rq, tsk);
> -	queued = task_on_rq_queued(tsk);
> -
> -	if (queued)
> -		dequeue_task(rq, tsk, queue_flags);
> -	if (running)
> -		put_prev_task(rq, tsk);
> -
> -	sched_change_group(tsk);
> -	if (!for_autogroup)
> -		scx_cgroup_move_task(tsk);
> -
> -	if (queued)
> -		enqueue_task(rq, tsk, queue_flags);
> -	if (running) {
> -		set_next_task(rq, tsk);
> -		/*
> -		 * After changing group, the running task may have joined a
> -		 * throttled one but it's still the running task. Trigger a
> -		 * resched to make sure that task can still run.
> -		 */
> -		resched_curr(rq);
> +	scoped_guard (sched_change, tsk, queue_flags) {
> +		sched_change_group(tsk);
> +		if (!for_autogroup)
> +			scx_cgroup_move_task(tsk);
> +		if (scope->running)
> +			resched = true;
>  	}

Also, are we missing a:

	if (resched)
		resched_curr(rq);

here after the guard? I don't see anything in sched_change_end() at this
point that would trigger a resched.

>  }
-- 
Thanks and Regards,
Prateek
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by Peter Zijlstra 5 months ago
On Thu, Sep 11, 2025 at 02:36:21PM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 9/10/2025 9:14 PM, Peter Zijlstra wrote:
> > @@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
> >   */
> >  void sched_move_task(struct task_struct *tsk, bool for_autogroup)
> >  {
> > -	int queued, running, queue_flags =
> > +	unsigned int queue_flags =
> >  		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
> 
> nit.
> 
> Since we don't do a complete dequeue for delayed task in
> sched_move_task(), can we get rid of that DEQUEUE_NOCLOCK and ...
> 
> > +	bool resched = false;
> >  	struct rq *rq;
> >  
> >  	CLASS(task_rq_lock, rq_guard)(tsk);
> > @@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
> >  
> >  	update_rq_clock(rq);
> 
> ... this clock update and instead rely on sched_change_begin() to
> handle it within the guard?

Yeah, I suppose we could. But let me try and do that in a later patch,
on top of all this.

> > -	running = task_current_donor(rq, tsk);
> > -	queued = task_on_rq_queued(tsk);
> > -
> > -	if (queued)
> > -		dequeue_task(rq, tsk, queue_flags);
> > -	if (running)
> > -		put_prev_task(rq, tsk);
> > -
> > -	sched_change_group(tsk);
> > -	if (!for_autogroup)
> > -		scx_cgroup_move_task(tsk);
> > -
> > -	if (queued)
> > -		enqueue_task(rq, tsk, queue_flags);
> > -	if (running) {
> > -		set_next_task(rq, tsk);
> > -		/*
> > -		 * After changing group, the running task may have joined a
> > -		 * throttled one but it's still the running task. Trigger a
> > -		 * resched to make sure that task can still run.
> > -		 */
> > -		resched_curr(rq);
> > +	scoped_guard (sched_change, tsk, queue_flags) {
> > +		sched_change_group(tsk);
> > +		if (!for_autogroup)
> > +			scx_cgroup_move_task(tsk);
> > +		if (scope->running)
> > +			resched = true;
> >  	}
> 
> Also, are we missing a:
> 
> 	if (resched)
> 		resched_curr(rq);
> 
> here after the guard? I don't see anything in sched_change_end() at this
> point that would trigger a resched.

Bah, yes. That hunk must've gone missing in one of the many rebases I
did while folding back fixes :/
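
(For completeness, the missing tail would presumably be, after the
scoped_guard in sched_move_task(), exactly what Prateek suggested:

	if (resched)
		resched_curr(rq);

as nothing in sched_change_end() at this point issues the resched.)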
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by Peter Zijlstra 5 months ago
On Thu, Sep 11, 2025 at 11:55:23AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 11, 2025 at 02:36:21PM +0530, K Prateek Nayak wrote:
> > Hello Peter,
> > 
> > On 9/10/2025 9:14 PM, Peter Zijlstra wrote:
> > > @@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
> > >   */
> > >  void sched_move_task(struct task_struct *tsk, bool for_autogroup)
> > >  {
> > > -	int queued, running, queue_flags =
> > > +	unsigned int queue_flags =
> > >  		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
> > 
> > nit.
> > 
> > Since we don't do a complete dequeue for delayed task in
> > sched_move_task(), can we get rid of that DEQUEUE_NOCLOCK and ...
> > 
> > > +	bool resched = false;
> > >  	struct rq *rq;
> > >  
> > >  	CLASS(task_rq_lock, rq_guard)(tsk);
> > > @@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
> > >  
> > >  	update_rq_clock(rq);
> > 
> > ... this clock update and instead rely on sched_change_begin() to
> > handle it within the guard?
> 
> Yeah, I suppose we could. But let me try and do that in a later patch,
> on top of all this.

Something like so?

---
 core.c     |   33 +++++++++++----------------------
 ext.c      |   12 ++++--------
 syscalls.c |    4 +---
 3 files changed, 16 insertions(+), 33 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2359,10 +2359,8 @@ static void migrate_disable_switch(struc
 	if (p->cpus_ptr != &p->cpus_mask)
 		return;
 
-	scoped_guard (task_rq_lock, p) {
-		update_rq_clock(scope.rq);
+	scoped_guard (task_rq_lock, p)
 		do_set_cpus_allowed(p, &ac);
-	}
 }
 
 void migrate_disable(void)
@@ -2716,9 +2714,7 @@ void set_cpus_allowed_common(struct task
 static void
 do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
 {
-	u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
-
-	scoped_guard (sched_change, p, flags) {
+	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED) {
 		p->sched_class->set_cpus_allowed(p, ctx);
 		mm_set_cpus_allowed(p->mm, ctx->new_mask);
 	}
@@ -2740,10 +2736,8 @@ void set_cpus_allowed_force(struct task_
 		struct rcu_head rcu;
 	};
 
-	scoped_guard (__task_rq_lock, p) {
-		update_rq_clock(scope.rq);
+	scoped_guard (__task_rq_lock, p)
 		do_set_cpus_allowed(p, &ac);
-	}
 
 	/*
 	 * Because this is called with p->pi_lock held, it is not possible
@@ -9159,16 +9153,13 @@ static void sched_change_group(struct ta
  */
 void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 {
-	unsigned int queue_flags =
-		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
+	unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_LOCKED;
 	bool resched = false;
 	struct rq *rq;
 
 	CLASS(task_rq_lock, rq_guard)(tsk);
 	rq = rq_guard.rq;
 
-	update_rq_clock(rq);
-
 	scoped_guard (sched_change, tsk, queue_flags) {
 		sched_change_group(tsk);
 		if (!for_autogroup)
@@ -10852,19 +10843,17 @@ struct sched_change_ctx *sched_change_be
 	}
 #endif
 
+	if (!(flags & DEQUEUE_NOCLOCK)) {
+		update_rq_clock(rq);
+		flags |= DEQUEUE_NOCLOCK;
+	}
+
 	if (flags & DEQUEUE_CLASS) {
 		if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
 			flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
 
-		if (p->sched_class->switching_from) {
-			/*
-			 * switching_from_fair() assumes CLASS implies NOCLOCK;
-			 * fixing this assumption would mean switching_from()
-			 * would need to be able to change flags.
-			 */
-			WARN_ON(!(flags & DEQUEUE_NOCLOCK));
+		if (p->sched_class->switching_from)
 			p->sched_class->switching_from(rq, p);
-		}
 	}
 
 	*ctx = (struct sched_change_ctx){
@@ -10915,7 +10904,7 @@ void sched_change_end(struct sched_chang
 		p->sched_class->switching_to(rq, p);
 
 	if (ctx->queued)
-		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
+		enqueue_task(rq, p, ctx->flags);
 	if (ctx->running)
 		set_next_task(rq, p, ctx->flags);
 
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5018,14 +5018,12 @@ static void scx_disable_workfn(struct kt
 
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
-		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE |
-					   DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
+		unsigned int queue_flags =
+			DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_LOCKED;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
 
-		update_rq_clock(task_rq(p));
-
 		if (old_class != new_class) {
 			queue_flags |= DEQUEUE_CLASS;
 			queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
@@ -5763,8 +5761,8 @@ static int scx_enable(struct sched_ext_o
 	percpu_down_write(&scx_fork_rwsem);
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
-		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE |
-					   DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
+		unsigned int queue_flags =
+			DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_LOCKED;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
@@ -5772,8 +5770,6 @@ static int scx_enable(struct sched_ext_o
 		if (!tryget_task_struct(p))
 			continue;
 
-		update_rq_clock(task_rq(p));
-
 		if (old_class != new_class) {
 			queue_flags |= DEQUEUE_CLASS;
 			queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -76,8 +76,6 @@ void set_user_nice(struct task_struct *p
 	CLASS(task_rq_lock, rq_guard)(p);
 	rq = rq_guard.rq;
 
-	update_rq_clock(rq);
-
 	/*
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
@@ -89,7 +87,7 @@ void set_user_nice(struct task_struct *p
 		return;
 	}
 
-	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED) {
+	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		set_load_weight(p, true);
 		old_prio = p->prio;
Re: [PATCH 01/14] sched: Employ sched_change guards
Posted by K Prateek Nayak 5 months ago
Hello Peter,

On 9/11/2025 3:40 PM, Peter Zijlstra wrote:
>> Yeah, I suppose we could. But let me try and do that in a later patch,
>> on-top of all this.

Sure thing.

> 
> Something like so?

Yup! That whole lot looks better. Thank you.

> 
>  [..snip..]
-- 
Thanks and Regards,
Prateek