Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
enables easy tracking of which runqueues are modified over a
lock-break.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 2 ++
kernel/sched/deadline.c | 2 ++
kernel/sched/ext.c | 2 ++
kernel/sched/fair.c | 7 +++++--
kernel/sched/idle.c | 2 ++
kernel/sched/rt.c | 2 ++
kernel/sched/sched.h | 10 ++++++++++
kernel/sched/stop_task.c | 2 ++
8 files changed, 27 insertions(+), 2 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2089,6 +2089,7 @@ void enqueue_task(struct rq *rq, struct
*/
uclamp_rq_inc(rq, p, flags);
+ rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2121,6 +2122,7 @@ inline bool dequeue_task(struct rq *rq,
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3094,6 +3094,8 @@ static int task_is_throttled_dl(struct t
DEFINE_SCHED_CLASS(dl) = {
+ .queue_mask = 8,
+
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3234,6 +3234,8 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
+ .queue_mask = 1,
+
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12830,6 +12830,7 @@ static int sched_balance_newidle(struct
}
rcu_read_unlock();
+ this_rq->queue_mask = 0;
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
@@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+ /* If a higher prio class was modified, restart the pick */
+ if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
pulled_task = -1;
out:
@@ -13623,6 +13624,8 @@ static unsigned int get_rr_interval_fair
*/
DEFINE_SCHED_CLASS(fair) = {
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -522,6 +522,8 @@ static void update_curr_idle(struct rq *
*/
DEFINE_SCHED_CLASS(idle) = {
+ .queue_mask = 0,
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2571,6 +2571,8 @@ static int task_is_throttled_rt(struct t
DEFINE_SCHED_CLASS(rt) = {
+ .queue_mask = 4,
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,6 +1118,7 @@ struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;
+ unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -2414,6 +2415,15 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
+ /*
+ * idle: 0
+ * ext: 1
+ * fair: 2
+ * rt: 4
+ * dl: 8
+ * stop: 16
+ */
+ unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -99,6 +99,8 @@ static void update_curr_stop(struct rq *
*/
DEFINE_SCHED_CLASS(stop) = {
+ .queue_mask = 16,
+
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
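As an aside, the mechanism is small enough to model in a few lines of plain C.
The sketch below is only an illustration of the hunks above (the trimmed-down
struct definitions, the rt/fair constants and main() are stand-ins, not kernel
code); it shows how the bit set by enqueue_task()/dequeue_task() is consumed
across the newidle-balance lock-break:

#include <stdio.h>

/* Per-class bits as listed in the sched.h hunk: ext=1, fair=2, rt=4, dl=8, stop=16. */
struct sched_class { unsigned int queue_mask; };
struct rq { unsigned int queue_mask; };

static const struct sched_class fair_sched_class = { .queue_mask = 2 };
static const struct sched_class rt_sched_class   = { .queue_mask = 4 };

int main(void)
{
	struct rq this_rq = { .queue_mask = 0 };
	int pulled_task = 0;

	this_rq.queue_mask = 0;			/* cleared while rq->lock is still held */

	/* lock-break: rq->lock is dropped while load balancing runs ... */
	this_rq.queue_mask |= rt_sched_class.queue_mask;	/* enqueue_task() of an RT task */
	/* ... rq->lock re-acquired */

	/* Same check as the fair.c hunk: did anything above fair touch the runqueue? */
	if (this_rq.queue_mask & ~((fair_sched_class.queue_mask << 1) - 1))
		pulled_task = -1;

	printf("pulled_task = %d\n", pulled_task);	/* prints -1: restart the pick */
	return 0;
}

Because both enqueue_task() and dequeue_task() in the core.c hunk OR the class
bit in unconditionally, the check catches a modification made by whichever CPU
held the lock during the break.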
Hi Peter,
On 06/10/25 12:46, Peter Zijlstra wrote:
> Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
> enables easy tracking of which runqueues are modified over a
> lock-break.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
Nice.
> @@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
> if (this_rq->cfs.h_nr_queued && !pulled_task)
> pulled_task = 1;
>
> - /* Is there a task of a high priority class? */
> - if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
> + /* If a higher prio class was modified, restart the pick */
> + if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
> pulled_task = -1;
Does this however want a self-documenting inline helper or macro to make
it even more clear? If this is always going to be the only caller maybe
not so much.
Thanks,
Juri
On Tue, Oct 07, 2025 at 12:08:03PM +0200, Juri Lelli wrote:
> Hi Peter,
>
> On 06/10/25 12:46, Peter Zijlstra wrote:
> > Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
> > enables easy tracking of which runqueues are modified over a
> > lock-break.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
>
> Nice.
>
> > @@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
> > if (this_rq->cfs.h_nr_queued && !pulled_task)
> > pulled_task = 1;
> >
> > - /* Is there a task of a high priority class? */
> > - if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
> > + /* If a higher prio class was modified, restart the pick */
> > + if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
> > pulled_task = -1;
>
> Does this however want a self-documenting inline helper or macro to make
> it even more clear? If this is always going to be the only caller maybe
> not so much.
There's another one in patch 3. I suppose we can do that. Maybe
something like:
static inline bool rq_modified_above(struct rq *rq, struct sched_class *class)
{
	unsigned int mask = class->queue_mask;
	return rq->queue_mask & ~((mask << 1) - 1);
}
This then writes the above like:
if (rq_modified_above(this_rq, &fair_sched_class))
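(Worked example, for reference: with the bit values from the patch,
&fair_sched_class has queue_mask = 2, so (2 << 1) - 1 = 3 and the helper
ignores the ext (1) and fair (2) bits, reporting only rt (4), dl (8) or
stop (16) activity. At the edges, stop (16) masks out every defined bit
((16 << 1) - 1 = 31), and idle (0) wraps the unsigned subtraction to all
ones, so the helper can never return true for those two classes.)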
On 07/10/25 12:16, Peter Zijlstra wrote:
> On Tue, Oct 07, 2025 at 12:08:03PM +0200, Juri Lelli wrote:
> > Hi Peter,
> >
> > On 06/10/25 12:46, Peter Zijlstra wrote:
> > > Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
> > > enables easy tracking of which runqueues are modified over a
> > > lock-break.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> >
> > Nice.
> >
> > > @@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
> > > if (this_rq->cfs.h_nr_queued && !pulled_task)
> > > pulled_task = 1;
> > >
> > > - /* Is there a task of a high priority class? */
> > > - if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
> > > + /* If a higher prio class was modified, restart the pick */
> > > + if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
> > > pulled_task = -1;
> >
> > Does this however want a self-documenting inline helper or macro to make
> > it even more clear? If this is always going to be the only caller maybe
> > not so much.
>
> There's another one in patch 3. I suppose we can do that. Maybe
> something like:
>
> static inline bool rq_modified_above(struct rq *rq, struct sched_class *class)
> {
> unsigned int mask = class->queue_mask;
> return rq->queue_mask & ~((mask << 1) - 1);
> }
>
> This then writes the above like:
>
> if (rq_modified_above(this_rq, &fair_sched_class))
>
Yeah. Maybe also add a "check rq::queue_mask comment for additional
details" or something like this.
Thanks!
Juri
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 1e900f415c6082cd4bcdae4c92515d21fb389473
Gitweb: https://git.kernel.org/tip/1e900f415c6082cd4bcdae4c92515d21fb389473
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 01 Oct 2025 15:50:15 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:55 +02:00
sched: Detect per-class runqueue changes
Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
enables easy tracking of which runqueues are modified over a
lock-break.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 2 ++
kernel/sched/deadline.c | 2 ++
kernel/sched/ext.c | 2 ++
kernel/sched/fair.c | 7 +++++--
kernel/sched/idle.c | 2 ++
kernel/sched/rt.c | 2 ++
kernel/sched/sched.h | 25 +++++++++++++++++++++++++
kernel/sched/stop_task.c | 2 ++
8 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e2199e4..9fc990f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2089,6 +2089,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
+ rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2121,6 +2122,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1f94994..83e6175 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3092,6 +3092,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
DEFINE_SCHED_CLASS(dl) = {
+ .queue_mask = 8,
+
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5717042..949c3a6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3234,6 +3234,8 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
+ .queue_mask = 1,
+
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77a713e..23ac05c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12841,6 +12841,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
}
rcu_read_unlock();
+ rq_modified_clear(this_rq);
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
@@ -12898,8 +12899,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+ /* If a higher prio class was modified, restart the pick */
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13633,6 +13634,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
*/
DEFINE_SCHED_CLASS(fair) = {
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index dee6e01..055b0dd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -521,6 +521,8 @@ static void update_curr_idle(struct rq *rq)
*/
DEFINE_SCHED_CLASS(idle) = {
+ .queue_mask = 0,
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c2347e4..9bc828d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2569,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
DEFINE_SCHED_CLASS(rt) = {
+ .queue_mask = 4,
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e3d2710..f4a3230 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,6 +1118,8 @@ struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;
+ /* Per class runqueue modification mask; bits in class order. */
+ unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -2414,6 +2416,15 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
+ /*
+ * idle: 0
+ * ext: 1
+ * fair: 2
+ * rt: 4
+ * dl: 8
+ * stop: 16
+ */
+ unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2571,6 +2582,20 @@ struct sched_class {
#endif
};
+/*
+ * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
+ */
+static inline void rq_modified_clear(struct rq *rq)
+{
+ rq->queue_mask = 0;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
+{
+ unsigned int mask = class->queue_mask;
+ return rq->queue_mask & ~((mask << 1) - 1);
+}
+
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 73aa8de..d98c453 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -98,6 +98,8 @@ static void update_curr_stop(struct rq *rq)
*/
DEFINE_SCHED_CLASS(stop) = {
+ .queue_mask = 16,
+
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,