[RFC][PATCH 1/3] sched: Detect per-class runqueue changes

Peter Zijlstra posted 3 patches 4 months ago
[RFC][PATCH 1/3] sched: Detect per-class runqueue changes
Posted by Peter Zijlstra 4 months ago
Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
enables easy tracking of which runqueues are modified over a
lock-break.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c      |    2 ++
 kernel/sched/deadline.c  |    2 ++
 kernel/sched/ext.c       |    2 ++
 kernel/sched/fair.c      |    7 +++++--
 kernel/sched/idle.c      |    2 ++
 kernel/sched/rt.c        |    2 ++
 kernel/sched/sched.h     |   10 ++++++++++
 kernel/sched/stop_task.c |    2 ++
 8 files changed, 27 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2089,6 +2089,7 @@ void enqueue_task(struct rq *rq, struct
 	 */
 	uclamp_rq_inc(rq, p, flags);
 
+	rq->queue_mask |= p->sched_class->queue_mask;
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
@@ -2121,6 +2122,7 @@ inline bool dequeue_task(struct rq *rq,
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
+	rq->queue_mask |= p->sched_class->queue_mask;
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3094,6 +3094,8 @@ static int task_is_throttled_dl(struct t
 
 DEFINE_SCHED_CLASS(dl) = {
 
+	.queue_mask		= 8,
+
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3234,6 +3234,8 @@ static void scx_cgroup_unlock(void) {}
  *   their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
+	.queue_mask		= 1,
+
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12830,6 +12830,7 @@ static int sched_balance_newidle(struct
 	}
 	rcu_read_unlock();
 
+	this_rq->queue_mask = 0;
 	raw_spin_rq_unlock(this_rq);
 
 	t0 = sched_clock_cpu(this_cpu);
@@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
 	if (this_rq->cfs.h_nr_queued && !pulled_task)
 		pulled_task = 1;
 
-	/* Is there a task of a high priority class? */
-	if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+	/* If a higher prio class was modified, restart the pick */
+	if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
 		pulled_task = -1;
 
 out:
@@ -13623,6 +13624,8 @@ static unsigned int get_rr_interval_fair
  */
 DEFINE_SCHED_CLASS(fair) = {
 
+	.queue_mask		= 2,
+
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -522,6 +522,8 @@ static void update_curr_idle(struct rq *
  */
 DEFINE_SCHED_CLASS(idle) = {
 
+	.queue_mask		= 0,
+
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2571,6 +2571,8 @@ static int task_is_throttled_rt(struct t
 
 DEFINE_SCHED_CLASS(rt) = {
 
+	.queue_mask		= 4,
+
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,6 +1118,7 @@ struct rq {
 	/* runqueue lock: */
 	raw_spinlock_t		__lock;
 
+	unsigned int		queue_mask;
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -2414,6 +2415,15 @@ struct sched_class {
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
 #endif
+	/*
+	 * idle:  0
+	 * ext:   1
+	 * fair:  2
+	 * rt:    4
+	 * dl:    8
+	 * stop: 16
+	 */
+	unsigned int queue_mask;
 
 	/*
 	 * move_queued_task/activate_task/enqueue_task: rq->lock
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -99,6 +99,8 @@ static void update_curr_stop(struct rq *
  */
 DEFINE_SCHED_CLASS(stop) = {
 
+	.queue_mask		= 16,
+
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
Re: [RFC][PATCH 1/3] sched: Detect per-class runqueue changes
Posted by Juri Lelli 4 months ago
Hi Peter,

On 06/10/25 12:46, Peter Zijlstra wrote:
> Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
> enables easy tracking of which runqueues are modified over a
> lock-break.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---

Nice.

> @@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
>  	if (this_rq->cfs.h_nr_queued && !pulled_task)
>  		pulled_task = 1;
>  
> -	/* Is there a task of a high priority class? */
> -	if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
> +	/* If a higher prio class was modified, restart the pick */
> +	if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
>  		pulled_task = -1;

Does this however want a self-documenting inline helper or macro to make
it even more clear? If this is always going to be the only caller maybe
not so much.

Thanks,
Juri
Re: [RFC][PATCH 1/3] sched: Detect per-class runqueue changes
Posted by Peter Zijlstra 4 months ago
On Tue, Oct 07, 2025 at 12:08:03PM +0200, Juri Lelli wrote:
> Hi Peter,
> 
> On 06/10/25 12:46, Peter Zijlstra wrote:
> > Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
> > enables easy tracking of which runqueues are modified over a
> > lock-break.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> 
> Nice.
> 
> > @@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
> >  	if (this_rq->cfs.h_nr_queued && !pulled_task)
> >  		pulled_task = 1;
> >  
> > -	/* Is there a task of a high priority class? */
> > -	if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
> > +	/* If a higher prio class was modified, restart the pick */
> > +	if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
> >  		pulled_task = -1;
> 
> Does this however want a self-documenting inline helper or macro to make
> it even more clear? If this is always going to be the only caller maybe
> not so much.

There's another one in patch 3. I suppose we can do that. Maybe
something like:

static inline bool rq_modified_above(struct rq *rq, struct sched_class *class)
{
	unsigned int mask = class->queue_mask;
	return rq->queue_mask & ~((mask << 1) - 1);
}

This then writes the above like:

	if (rq_modified_above(this_rq, &fair_sched_class))
Re: [RFC][PATCH 1/3] sched: Detect per-class runqueue changes
Posted by Juri Lelli 4 months ago
On 07/10/25 12:16, Peter Zijlstra wrote:
> On Tue, Oct 07, 2025 at 12:08:03PM +0200, Juri Lelli wrote:
> > Hi Peter,
> > 
> > On 06/10/25 12:46, Peter Zijlstra wrote:
> > > Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
> > > enables easy tracking of which runqueues are modified over a
> > > lock-break.
> > > 
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > 
> > Nice.
> > 
> > > @@ -12887,8 +12888,8 @@ static int sched_balance_newidle(struct
> > >  	if (this_rq->cfs.h_nr_queued && !pulled_task)
> > >  		pulled_task = 1;
> > >  
> > > -	/* Is there a task of a high priority class? */
> > > -	if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
> > > +	/* If a higher prio class was modified, restart the pick */
> > > +	if (this_rq->queue_mask & ~((fair_sched_class.queue_mask << 1)-1))
> > >  		pulled_task = -1;
> > 
> > Does this however want a self-documenting inline helper or macro to make
> > it even more clear? If this is always going to be the only caller maybe
> > not so much.
> 
> There's another one in patch 3. I suppose we can do that. Maybe
> something like:
> 
> static inline bool rq_modified_above(struct rq *rq, struct sched_class *class)
> {
> 	unsigned int mask = class->queue_mask;
> 	return rq->queue_mask & ~((mask << 1) - 1);
> }
> 
> This then writes the above like:
> 
> 	if (rq_modified_above(this_rq, &fair_sched_class))
> 

Yeah. Maybe also add a "check rq::queue_mask comment for additional
details" or something like this.

Thanks!
Juri
[tip: sched/core] sched: Detect per-class runqueue changes
Posted by tip-bot2 for Peter Zijlstra 3 months, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     1e900f415c6082cd4bcdae4c92515d21fb389473
Gitweb:        https://git.kernel.org/tip/1e900f415c6082cd4bcdae4c92515d21fb389473
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 01 Oct 2025 15:50:15 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:55 +02:00

sched: Detect per-class runqueue changes

Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
enables easy tracking of which runqueues are modified over a
lock-break.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/core.c      |  2 ++
 kernel/sched/deadline.c  |  2 ++
 kernel/sched/ext.c       |  2 ++
 kernel/sched/fair.c      |  7 +++++--
 kernel/sched/idle.c      |  2 ++
 kernel/sched/rt.c        |  2 ++
 kernel/sched/sched.h     | 25 +++++++++++++++++++++++++
 kernel/sched/stop_task.c |  2 ++
 8 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e2199e4..9fc990f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2089,6 +2089,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p, flags);
 
+	rq->queue_mask |= p->sched_class->queue_mask;
 	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
@@ -2121,6 +2122,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
+	rq->queue_mask |= p->sched_class->queue_mask;
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1f94994..83e6175 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3092,6 +3092,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
 
 DEFINE_SCHED_CLASS(dl) = {
 
+	.queue_mask		= 8,
+
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5717042..949c3a6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3234,6 +3234,8 @@ static void scx_cgroup_unlock(void) {}
  *   their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
+	.queue_mask		= 1,
+
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77a713e..23ac05c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12841,6 +12841,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	}
 	rcu_read_unlock();
 
+	rq_modified_clear(this_rq);
 	raw_spin_rq_unlock(this_rq);
 
 	t0 = sched_clock_cpu(this_cpu);
@@ -12898,8 +12899,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (this_rq->cfs.h_nr_queued && !pulled_task)
 		pulled_task = 1;
 
-	/* Is there a task of a high priority class? */
-	if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+	/* If a higher prio class was modified, restart the pick */
+	if (rq_modified_above(this_rq, &fair_sched_class))
 		pulled_task = -1;
 
 out:
@@ -13633,6 +13634,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
  */
 DEFINE_SCHED_CLASS(fair) = {
 
+	.queue_mask		= 2,
+
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index dee6e01..055b0dd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -521,6 +521,8 @@ static void update_curr_idle(struct rq *rq)
  */
 DEFINE_SCHED_CLASS(idle) = {
 
+	.queue_mask		= 0,
+
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c2347e4..9bc828d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2569,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
 
 DEFINE_SCHED_CLASS(rt) = {
 
+	.queue_mask		= 4,
+
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e3d2710..f4a3230 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,6 +1118,8 @@ struct rq {
 	/* runqueue lock: */
 	raw_spinlock_t		__lock;
 
+	/* Per class runqueue modification mask; bits in class order. */
+	unsigned int		queue_mask;
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -2414,6 +2416,15 @@ struct sched_class {
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
 #endif
+	/*
+	 * idle:  0
+	 * ext:   1
+	 * fair:  2
+	 * rt:    4
+	 * dl:    8
+	 * stop: 16
+	 */
+	unsigned int queue_mask;
 
 	/*
 	 * move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2571,6 +2582,20 @@ struct sched_class {
 #endif
 };
 
+/*
+ * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
+ */
+static inline void rq_modified_clear(struct rq *rq)
+{
+	rq->queue_mask = 0;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
+{
+	unsigned int mask = class->queue_mask;
+	return rq->queue_mask & ~((mask << 1) - 1);
+}
+
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	WARN_ON_ONCE(rq->donor != prev);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 73aa8de..d98c453 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -98,6 +98,8 @@ static void update_curr_stop(struct rq *rq)
  */
 DEFINE_SCHED_CLASS(stop) = {
 
+	.queue_mask		= 16,
+
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,