As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/cleanup.h | 5 +
kernel/sched/core.c | 156 +++++++++++++++++-------------------------------
kernel/sched/ext.c | 39 +++++-------
kernel/sched/sched.h | 21 +++---
kernel/sched/syscalls.c | 65 +++++++-------------
5 files changed, 114 insertions(+), 172 deletions(-)
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label:
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
+ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
+ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ { return (void *)1; }
+
#define __GUARD_IS_ERR(_ptr) \
({ \
unsigned long _rc = (__force unsigned long)(_ptr); \
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7361,7 +7361,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7426,52 +7426,42 @@ void rt_mutex_setprio(struct task_struct
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
-
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
- p->sched_class = next_class;
- p->prio = prio;
+ p->sched_class = next_class;
+ p->prio = prio;
- check_class_changing(rq, p, prev_class);
-
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
+ check_class_changing(rq, p, prev_class);
+ }
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -8119,26 +8109,9 @@ int migrate_task_to(struct task_struct *
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
+ unsigned int queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
update_rq_clock(rq);
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
-
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
- resched_curr(rq);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
}
}
@@ -10929,37 +10887,39 @@ void sched_mm_cid_fork(struct task_struc
}
#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current(rq, p),
};
- update_rq_clock(rq);
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4867,11 +4867,10 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
- struct sched_enq_and_set_ctx ctx;
-
/* cycling deq/enq is enough, see the function comment */
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
}
/* resched to restore ticks and idle state */
@@ -5003,17 +5002,16 @@ static void scx_disable_workfn(struct kt
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
-
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ update_rq_clock(task_rq(p));
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+ }
check_class_changed(task_rq(p), p, old_class, p->prio);
scx_exit_task(p);
@@ -5747,21 +5745,20 @@ static int scx_enable(struct sched_ext_o
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
if (!tryget_task_struct(p))
continue;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ update_rq_clock(task_rq(p));
- p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+ }
check_class_changed(task_rq(p), p, old_class, p->prio);
put_task_struct(p);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#ifdef CONFIG_SCHED_CLASS_EXT
-/*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
- */
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
struct task_struct *p;
- int queue_flags;
+ int flags;
bool queued;
bool running;
};
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+ sched_change_end(_T),
+ sched_change_begin(p, flags),
+ struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
#include "ext.h"
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_st
void set_user_nice(struct task_struct *p, long nice)
{
- bool queued, running;
struct rq *rq;
int old_prio;
@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p
return;
}
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ }
/*
* If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_str
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
+ int retval, oldprio, newprio;
const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
@@ -698,33 +687,25 @@ int __sched_setscheduler(struct task_str
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- p->sched_class = next_class;
- p->prio = newprio;
- }
- __setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
+ scoped_guard (sched_change, p, queue_flags) {
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ p->sched_class = next_class;
+ p->prio = newprio;
+ }
+ __setscheduler_uclamp(p, attr);
+ check_class_changing(rq, p, prev_class);
- enqueue_task(rq, p, queue_flags);
+ if (scope->queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ scope->flags |= ENQUEUE_HEAD;
+ }
}
- if (running)
- set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
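For readers unfamiliar with the cleanup.h machinery: the DEFINE_CLASS(sched_change, ...)
in sched.h above turns sched_change_begin()/sched_change_end() into a
constructor/destructor pair, and scoped_guard() exposes the returned context as
'scope' inside the body. A scoped_guard therefore boils down to roughly the
following; this is only a sketch of the ordering, the real expansion goes through
cleanup.h's for-loop and __attribute__((cleanup)) plumbing:

/*
 * Rough shape of:
 *
 *	scoped_guard (sched_change, p, queue_flags) {
 *		... body ...
 *	}
 */
struct sched_change_ctx *scope = sched_change_begin(p, queue_flags);
/* p is now dequeued (if it was queued) and put (if it was running) */

... body ...			/* may tweak scope->flags for the enqueue side */

sched_change_end(scope);	/* re-enqueue / set_next_task using scope->flags */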
On 9/10/25 9:14 PM, Peter Zijlstra wrote:
> As proposed a long while ago -- and half done by scx -- wrap the
> scheduler's 'change' pattern in a guard helper.
>
[...]
> put_task_struct(p);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>
> -#ifdef CONFIG_SCHED_CLASS_EXT
> -/*
> - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> - * and establish invariants.
> - */
> -struct sched_enq_and_set_ctx {
> +struct sched_change_ctx {
> struct task_struct *p;
> - int queue_flags;
> + int flags;
> bool queued;
> bool running;
> };
>
> -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> - struct sched_enq_and_set_ctx *ctx);
> -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
> +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
> +void sched_change_end(struct sched_change_ctx *ctx);
>
> -#endif /* CONFIG_SCHED_CLASS_EXT */
> +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
> + sched_change_end(_T),
> + sched_change_begin(p, flags),
> + struct task_struct *p, unsigned int flags)
> +
> +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
>
> #include "ext.h"
>
Could you please add a comment here about the flags having to match between
dequeue and enqueue?

Since ctx->flags doesn't get cleared, one could be left wondering how the
enqueue happens (e.g. ENQUEUE_RESTORE) until they see that it works because
the flag values match.
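For reference, the matching being relied on here: the low DEQUEUE_* and
ENQUEUE_* flag values are kept numerically identical in kernel/sched/sched.h,
so a flags word built from DEQUEUE_* bits can be fed straight back into
enqueue_task(). A sketch (excerpt as I read sched.h; exact values may differ
between kernel versions):

#define DEQUEUE_SAVE		0x02	/* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE		0x04	/* Matches ENQUEUE_MOVE    */
#define DEQUEUE_NOCLOCK		0x08	/* Matches ENQUEUE_NOCLOCK */

#define ENQUEUE_RESTORE		0x02
#define ENQUEUE_MOVE		0x04
#define ENQUEUE_NOCLOCK		0x08

/*
 * So in sched_change_end():
 *
 *	enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
 *
 * with ctx->flags == DEQUEUE_SAVE the enqueue side sees ENQUEUE_RESTORE --
 * the same bit under a different name.
 */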
On Mon, Oct 06, 2025 at 08:51:27PM +0530, Shrikanth Hegde wrote:
>
>
> On 9/10/25 9:14 PM, Peter Zijlstra wrote:
> > As proposed a long while ago -- and half done by scx -- wrap the
> > scheduler's 'change' pattern in a guard helper.
> >
> [...]
> > put_task_struct(p);
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
> > extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> > extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
> > -#ifdef CONFIG_SCHED_CLASS_EXT
> > -/*
> > - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> > - * and establish invariants.
> > - */
> > -struct sched_enq_and_set_ctx {
> > +struct sched_change_ctx {
> > struct task_struct *p;
> > - int queue_flags;
> > + int flags;
> > bool queued;
> > bool running;
> > };
> > -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> > - struct sched_enq_and_set_ctx *ctx);
> > -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
> > +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
> > +void sched_change_end(struct sched_change_ctx *ctx);
> > -#endif /* CONFIG_SCHED_CLASS_EXT */
> > +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
> > + sched_change_end(_T),
> > + sched_change_begin(p, flags),
> > + struct task_struct *p, unsigned int flags)
> > +
> > +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
> > #include "ext.h"
> could you please add a comment on matching flags on dequeue/enqueue
> here?
Would something like so be okay? This assumes at least the second patch
is applied as well.
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10783,6 +10783,12 @@ struct sched_change_ctx *sched_change_be
struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
+ /*
+ * Must exclusively use matched flags since this is both dequeue and
+ * enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
lockdep_assert_rq_held(rq);
if (!(flags & DEQUEUE_NOCLOCK)) {
On 10/6/25 11:44 PM, Peter Zijlstra wrote:
> On Mon, Oct 06, 2025 at 08:51:27PM +0530, Shrikanth Hegde wrote:
>>
>>
>> On 9/10/25 9:14 PM, Peter Zijlstra wrote:
>>> As proposed a long while ago -- and half done by scx -- wrap the
>>> scheduler's 'change' pattern in a guard helper.
>>>
>> [...]
>> > put_task_struct(p);
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -3860,23 +3860,22 @@ extern void check_class_changed(struct r
>>> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
>>> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>>> -#ifdef CONFIG_SCHED_CLASS_EXT
>>> -/*
>>> - * Used by SCX in the enable/disable paths to move tasks between sched_classes
>>> - * and establish invariants.
>>> - */
>>> -struct sched_enq_and_set_ctx {
>>> +struct sched_change_ctx {
>>> struct task_struct *p;
>>> - int queue_flags;
>>> + int flags;
>>> bool queued;
>>> bool running;
>>> };
>>> -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
>>> - struct sched_enq_and_set_ctx *ctx);
>>> -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
>>> +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
>>> +void sched_change_end(struct sched_change_ctx *ctx);
>>> -#endif /* CONFIG_SCHED_CLASS_EXT */
>>> +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
>>> + sched_change_end(_T),
>>> + sched_change_begin(p, flags),
>>> + struct task_struct *p, unsigned int flags)
>>> +
>>> +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
>>> #include "ext.h"
>> could you please add a comment on matching flags on dequeue/enqueue
>> here?
>
> Would something like so be okay? This assumes at least the second patch
> is applied as well.
>
> ---
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10783,6 +10783,12 @@ struct sched_change_ctx *sched_change_be
> struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
> struct rq *rq = task_rq(p);
>
> + /*
> + * Must exclusively use matched flags since this is both dequeue and
> + * enqueue.
> + */
Yes, something like that. Unless call sites explicitly change the flags via
the scope, the enqueue will happen with matching flags.
> + WARN_ON_ONCE(flags & 0xFFFF0000);
> +
A mythical example:

	scoped_guard (sched_change, p, DEQUEUE_THROTTLE) {
		scope->flags &= ~DEQUEUE_THROTTLE;
		scope->flags |= ENQUEUE_HEAD;
	}

But one could still do this, right? For such users the warning may be wrong.
> lockdep_assert_rq_held(rq);
>
> if (!(flags & DEQUEUE_NOCLOCK)) {
On Tue, Oct 07, 2025 at 10:42:29AM +0530, Shrikanth Hegde wrote:
> On 10/6/25 11:44 PM, Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -10783,6 +10783,12 @@ struct sched_change_ctx *sched_change_be
> >  	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
> >  	struct rq *rq = task_rq(p);
> >
> > +	/*
> > +	 * Must exclusively use matched flags since this is both dequeue and
> > +	 * enqueue.
> > +	 */
>
> Yes, something like that. Unless call sites explicitly change the flags via
> the scope, the enqueue will happen with matching flags.
>
> > +	WARN_ON_ONCE(flags & 0xFFFF0000);
> > +
>
> A mythical example:
>
> 	scoped_guard (sched_change, p, DEQUEUE_THROTTLE) {
> 		scope->flags &= ~DEQUEUE_THROTTLE;
> 		scope->flags |= ENQUEUE_HEAD;
> 	}
>
> But one could still do this, right? For such users the warning may be wrong.

Right, I suppose this would be possible. Let's worry about it if/when it ever
comes up though.
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 73ec89a1ce4bce98f74b6520a95e64cd9986aae5
Gitweb: https://git.kernel.org/tip/73ec89a1ce4bce98f74b6520a95e64cd9986aae5
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Mon, 06 Oct 2025 20:12:34 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:54 +02:00
sched: Mandate shared flags for sched_change
Shrikanth noted that the sched_change pattern relies on using shared
flags.
Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d5659f..e2199e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10781,6 +10781,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
+ /*
+ * Must exclusively use matched flags since this is both dequeue and
+ * enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
lockdep_assert_rq_held(rq);
if (!(flags & DEQUEUE_NOCLOCK)) {
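As an illustration of what the new check enforces -- a sketch, not taken from
the series; it presumes the follow-up patch that keeps only matched flags in
the lower 16 bits, and SOME_DEQUEUE_ONLY_FLAG is a made-up stand-in for any
unmatched flag:

/* Fine: every bit means the same thing on dequeue and on enqueue. */
scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
	/* ... */
}

/*
 * Trips WARN_ON_ONCE(flags & 0xFFFF0000): a dequeue-only flag has no
 * enqueue-side twin, so blindly feeding it back into enqueue_task() on
 * scope exit would be meaningless or wrong.
 */
scoped_guard (sched_change, p, DEQUEUE_SAVE | SOME_DEQUEUE_ONLY_FLAG) {
	/* ... */
}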
Hello Peter,
On 9/10/2025 9:14 PM, Peter Zijlstra wrote:
> @@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
> */
> void sched_move_task(struct task_struct *tsk, bool for_autogroup)
> {
> - int queued, running, queue_flags =
> + unsigned int queue_flags =
> DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
nit.
Since we don't do a complete dequeue for delayed task in
sched_move_task(), can we get rid of that DEQUEUE_NOCLOCK and ...
> + bool resched = false;
> struct rq *rq;
>
> CLASS(task_rq_lock, rq_guard)(tsk);
> @@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
>
> update_rq_clock(rq);
... this clock update and instead rely on sched_change_begin() to
handle it within the guard?
>
> - running = task_current_donor(rq, tsk);
> - queued = task_on_rq_queued(tsk);
> -
> - if (queued)
> - dequeue_task(rq, tsk, queue_flags);
> - if (running)
> - put_prev_task(rq, tsk);
> -
> - sched_change_group(tsk);
> - if (!for_autogroup)
> - scx_cgroup_move_task(tsk);
> -
> - if (queued)
> - enqueue_task(rq, tsk, queue_flags);
> - if (running) {
> - set_next_task(rq, tsk);
> - /*
> - * After changing group, the running task may have joined a
> - * throttled one but it's still the running task. Trigger a
> - * resched to make sure that task can still run.
> - */
> - resched_curr(rq);
> + scoped_guard (sched_change, tsk, queue_flags) {
> + sched_change_group(tsk);
> + if (!for_autogroup)
> + scx_cgroup_move_task(tsk);
> + if (scope->running)
> + resched = true;
> }
Also, are we missing a:
if (resched)
resched_curr(rq);
here after the guard? I don't see anything in sched_change_end() at this
point that would trigger a resched.
> }
--
Thanks and Regards,
Prateek
On Thu, Sep 11, 2025 at 02:36:21PM +0530, K Prateek Nayak wrote:
> Hello Peter,
>
> On 9/10/2025 9:14 PM, Peter Zijlstra wrote:
> > @@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
> > */
> > void sched_move_task(struct task_struct *tsk, bool for_autogroup)
> > {
> > - int queued, running, queue_flags =
> > + unsigned int queue_flags =
> > DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
>
> nit.
>
> Since we don't do a complete dequeue for delayed task in
> sched_move_task(), can we get rid of that DEQUEUE_NOCLOCK and ...
>
> > + bool resched = false;
> > struct rq *rq;
> >
> > CLASS(task_rq_lock, rq_guard)(tsk);
> > @@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
> >
> > update_rq_clock(rq);
>
> ... this clock update and instead rely on sched_change_begin() to
> handle it within the guard?
Yeah, I suppose we could. But let me try and do that in a later patch,
on-top of all this.
> > - running = task_current_donor(rq, tsk);
> > - queued = task_on_rq_queued(tsk);
> > -
> > - if (queued)
> > - dequeue_task(rq, tsk, queue_flags);
> > - if (running)
> > - put_prev_task(rq, tsk);
> > -
> > - sched_change_group(tsk);
> > - if (!for_autogroup)
> > - scx_cgroup_move_task(tsk);
> > -
> > - if (queued)
> > - enqueue_task(rq, tsk, queue_flags);
> > - if (running) {
> > - set_next_task(rq, tsk);
> > - /*
> > - * After changing group, the running task may have joined a
> > - * throttled one but it's still the running task. Trigger a
> > - * resched to make sure that task can still run.
> > - */
> > - resched_curr(rq);
> > + scoped_guard (sched_change, tsk, queue_flags) {
> > + sched_change_group(tsk);
> > + if (!for_autogroup)
> > + scx_cgroup_move_task(tsk);
> > + if (scope->running)
> > + resched = true;
> > }
>
> Also, are we missing a:
>
> if (resched)
> resched_curr(rq);
>
> here after the guard? I don't see anything in sched_change_end() at this
> point that would trigger a resched.
Bah, yes. That hunk must've gone missing in one of the many rebases I
did while folding back fixes :/
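For reference, the tail of sched_move_task() with the lost hunk restored would
look roughly like this (reconstructed from the patch and the comment in the
removed code; a sketch, not a quoted diff):

	scoped_guard (sched_change, tsk, queue_flags) {
		sched_change_group(tsk);
		if (!for_autogroup)
			scx_cgroup_move_task(tsk);
		if (scope->running)
			resched = true;
	}

	/*
	 * After changing group, the running task may have joined a
	 * throttled one but it's still the running task. Trigger a
	 * resched to make sure that task can still run.
	 */
	if (resched)
		resched_curr(rq);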
On Thu, Sep 11, 2025 at 11:55:23AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 11, 2025 at 02:36:21PM +0530, K Prateek Nayak wrote:
> > Hello Peter,
> >
> > On 9/10/2025 9:14 PM, Peter Zijlstra wrote:
> > > @@ -9240,8 +9213,9 @@ static void sched_change_group(struct ta
> > > */
> > > void sched_move_task(struct task_struct *tsk, bool for_autogroup)
> > > {
> > > - int queued, running, queue_flags =
> > > + unsigned int queue_flags =
> > > DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
> >
> > nit.
> >
> > Since we don't do a complete dequeue for delayed task in
> > sched_move_task(), can we get rid of that DEQUEUE_NOCLOCK and ...
> >
> > > + bool resched = false;
> > > struct rq *rq;
> > >
> > > CLASS(task_rq_lock, rq_guard)(tsk);
> > > @@ -9249,28 +9223,12 @@ void sched_move_task(struct task_struct
> > >
> > > update_rq_clock(rq);
> >
> > ... this clock update and instead rely on sched_change_begin() to
> > handle it within the guard?
>
> Yeah, I suppose we could. But let me try and do that in a later patch,
> on-top of all this.
Something like so?
---
core.c | 33 +++++++++++----------------------
ext.c | 12 ++++--------
syscalls.c | 4 +---
3 files changed, 16 insertions(+), 33 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2359,10 +2359,8 @@ static void migrate_disable_switch(struc
if (p->cpus_ptr != &p->cpus_mask)
return;
- scoped_guard (task_rq_lock, p) {
- update_rq_clock(scope.rq);
+ scoped_guard (task_rq_lock, p)
do_set_cpus_allowed(p, &ac);
- }
}
void migrate_disable(void)
@@ -2716,9 +2714,7 @@ void set_cpus_allowed_common(struct task
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
-
- scoped_guard (sched_change, p, flags) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED) {
p->sched_class->set_cpus_allowed(p, ctx);
mm_set_cpus_allowed(p->mm, ctx->new_mask);
}
@@ -2740,10 +2736,8 @@ void set_cpus_allowed_force(struct task_
struct rcu_head rcu;
};
- scoped_guard (__task_rq_lock, p) {
- update_rq_clock(scope.rq);
+ scoped_guard (__task_rq_lock, p)
do_set_cpus_allowed(p, &ac);
- }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -9159,16 +9153,13 @@ static void sched_change_group(struct ta
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- unsigned int queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_LOCKED;
bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
scoped_guard (sched_change, tsk, queue_flags) {
sched_change_group(tsk);
if (!for_autogroup)
@@ -10852,19 +10843,17 @@ struct sched_change_ctx *sched_change_be
}
#endif
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
if (flags & DEQUEUE_CLASS) {
if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (p->sched_class->switching_from) {
- /*
- * switching_from_fair() assumes CLASS implies NOCLOCK;
- * fixing this assumption would mean switching_from()
- * would need to be able to change flags.
- */
- WARN_ON(!(flags & DEQUEUE_NOCLOCK));
+ if (p->sched_class->switching_from)
p->sched_class->switching_from(rq, p);
- }
}
*ctx = (struct sched_change_ctx){
@@ -10915,7 +10904,7 @@ void sched_change_end(struct sched_chang
p->sched_class->switching_to(rq, p);
if (ctx->queued)
- enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
set_next_task(rq, p, ctx->flags);
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5018,14 +5018,12 @@ static void scx_disable_workfn(struct kt
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
- unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE |
- DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
+ unsigned int queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_LOCKED;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- update_rq_clock(task_rq(p));
-
if (old_class != new_class) {
queue_flags |= DEQUEUE_CLASS;
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
@@ -5763,8 +5761,8 @@ static int scx_enable(struct sched_ext_o
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
- unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE |
- DEQUEUE_NOCLOCK | DEQUEUE_LOCKED;
+ unsigned int queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_LOCKED;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
@@ -5772,8 +5770,6 @@ static int scx_enable(struct sched_ext_o
if (!tryget_task_struct(p))
continue;
- update_rq_clock(task_rq(p));
-
if (old_class != new_class) {
queue_flags |= DEQUEUE_CLASS;
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -76,8 +76,6 @@ void set_user_nice(struct task_struct *p
CLASS(task_rq_lock, rq_guard)(p);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -89,7 +87,7 @@ void set_user_nice(struct task_struct *p
return;
}
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK | DEQUEUE_LOCKED) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED) {
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
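Putting the pieces together, a caller under this follow-up would look roughly
like the sketch below. example_change() is purely hypothetical, and
DEQUEUE_LOCKED is taken from the diff above (it belongs to other patches in
the series); the point is that the clock update has moved into
sched_change_begin(), so DEQUEUE_NOCLOCK only needs to be passed when the
caller has already updated the clock itself:

static void example_change(struct task_struct *p)
{
	CLASS(task_rq_lock, rq_guard)(p);	/* p->pi_lock + rq->lock */

	/* No update_rq_clock() here: sched_change_begin() handles it. */
	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_LOCKED) {
		/* p is dequeued/put at this point; safe to change its state. */
		if (scope->queued)
			scope->flags |= ENQUEUE_HEAD;	/* tweak the enqueue side */
	}
	/* On scope exit p is re-enqueued / made next again with scope->flags. */
}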
Hello Peter,

On 9/11/2025 3:40 PM, Peter Zijlstra wrote:
>> Yeah, I suppose we could. But let me try and do that in a later patch,
>> on-top of all this.

Sure thing.

>
> Something like so?

Yup! That whole lot looks better. Thank you.

>
> [..snip..]

--
Thanks and Regards,
Prateek