Add a cpumask field to struct scx_dispatch_q to track the union of
allowed CPUs for all tasks in the queue. Use this mask to perform an
O(1) check in consume_dispatch_q() before scanning the queue.
When a CPU attempts to consume from a queue, it currently must iterate
through all N tasks to determine if any can run on that CPU. If the
queue contains only tasks pinned to other CPUs (via sched_setaffinity
or cgroups), this O(N) scan finds nothing.
With the cpumask, if the current CPU is not in the allowed set, skip
the entire queue immediately with a single bit test. This changes the
"queue is unsuitable" case from O(N) to O(1).
The mask is updated when tasks are enqueued and cleared when the queue
becomes empty, preventing permanent saturation from transient pinned
tasks.
This benefits large systems with CPU-pinned workloads, where CPUs
frequently scan queues containing no eligible tasks.
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
include/linux/sched/ext.h | 1 +
kernel/sched/ext.c | 21 ++++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index bcb962d5ee7d..f20e57cf53a3 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -79,6 +79,7 @@ struct scx_dispatch_q {
struct rhash_head hash_node;
struct llist_node free_node;
struct rcu_head rcu;
+ struct cpumask *cpus_allowed; /* union of all tasks' allowed cpus */
};
/* scx_entity.flags */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index afe28c04d5aa..5a060c97cd64 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1120,8 +1120,12 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
if (is_local)
local_dsq_post_enq(dsq, p, enq_flags);
- else
+ else {
+ /* Update cpumask to track union of all tasks' allowed CPUs */
+ if (dsq->cpus_allowed)
+ cpumask_or(dsq->cpus_allowed, dsq->cpus_allowed, p->cpus_ptr);
raw_spin_unlock(&dsq->lock);
+ }
}
static void task_unlink_from_dsq(struct task_struct *p,
@@ -1138,6 +1142,10 @@ static void task_unlink_from_dsq(struct task_struct *p,
list_del_init(&p->scx.dsq_list.node);
dsq_mod_nr(dsq, -1);
+ /* Clear cpumask when queue becomes empty to prevent saturation */
+ if (dsq->nr == 0 && dsq->cpus_allowed)
+ cpumask_clear(dsq->cpus_allowed);
+
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
struct task_struct *first_task;
@@ -1897,6 +1905,14 @@ static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
if (list_empty(&dsq->list))
return false;
+ /*
+ * O(1) optimization: Check if any task in the queue can run on this CPU.
+ * If the cpumask is allocated and this CPU is not in the allowed set,
+ * we can skip the entire queue without scanning.
+ */
+ if (dsq->cpus_allowed && !cpumask_test_cpu(cpu_of(rq), dsq->cpus_allowed))
+ return false;
+
raw_spin_lock(&dsq->lock);
nldsq_for_each_task(p, dsq) {
@@ -3397,6 +3413,9 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
raw_spin_lock_init(&dsq->lock);
INIT_LIST_HEAD(&dsq->list);
dsq->id = dsq_id;
+
+ /* Allocate cpumask for tracking allowed CPUs */
+ dsq->cpus_allowed = kzalloc(cpumask_size(), GFP_KERNEL);
}
static void free_dsq_irq_workfn(struct irq_work *irq_work)
--
2.51.0
Hi Qiliang,
On Mon, Feb 02, 2026 at 10:03:46PM -0500, Qiliang Yuan wrote:
> Add a cpumask field to struct scx_dispatch_q to track the union of
> allowed CPUs for all tasks in the queue. Use this mask to perform an
> O(1) check in consume_dispatch_q() before scanning the queue.
>
> When a CPU attempts to consume from a queue, it currently must iterate
> through all N tasks to determine if any can run on that CPU. If the
> queue contains only tasks pinned to other CPUs (via sched_setaffinity
> or cgroups), this O(N) scan finds nothing.
>
> With the cpumask, if the current CPU is not in the allowed set, skip
> the entire queue immediately with a single bit test. This changes the
> "queue is unsuitable" case from O(N) to O(1).
>
> The mask is updated when tasks are enqueued and cleared when the queue
> becomes empty, preventing permanent saturation from transient pinned
> tasks.
>
> This benefits large systems with CPU-pinned workloads, where CPUs
> frequently scan queues containing no eligible tasks.
Did you run some benchmarks / have some numbers?
It's true that we save the O(N) scan when the DSQ has no eligible tasks,
but we're adding cost on every enqueue: cpumask_or() on potentially large
cpumasks can be expensive.
I think this optimization can help when queues frequently contain only
tasks pinned to other CPUs or when the queue has many tasks (N is large).
I have the feeling that for small queues or mixed workloads, the cpumask
overhead probably exceeds the savings...
>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> include/linux/sched/ext.h | 1 +
> kernel/sched/ext.c | 21 ++++++++++++++++++++-
> 2 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> index bcb962d5ee7d..f20e57cf53a3 100644
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -79,6 +79,7 @@ struct scx_dispatch_q {
> struct rhash_head hash_node;
> struct llist_node free_node;
> struct rcu_head rcu;
> + struct cpumask *cpus_allowed; /* union of all tasks' allowed cpus */
> };
>
> /* scx_entity.flags */
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index afe28c04d5aa..5a060c97cd64 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1120,8 +1120,12 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
>
> if (is_local)
> local_dsq_post_enq(dsq, p, enq_flags);
> - else
> + else {
> + /* Update cpumask to track union of all tasks' allowed CPUs */
> + if (dsq->cpus_allowed)
> + cpumask_or(dsq->cpus_allowed, dsq->cpus_allowed, p->cpus_ptr);
> raw_spin_unlock(&dsq->lock);
> + }
> }
The cpumask is only updated during enqueue and cleared when the queue
empties. If a task's affinity changes while it's already in the queue
(i.e., sched_setaffinity()), the cpus_allowed mask becomes stale. This
means: 1) the mask might include CPUs that no task can actually run on
anymore (false positive) or, more critically, 2) if a task's affinity
expands, the mask won't reflect this, causing CPUs to skip a queue that
actually has eligible tasks (false negative).
I think we need to hook something in sched_change to update the mask when
p->cpus_ptr changes.
>
> static void task_unlink_from_dsq(struct task_struct *p,
> @@ -1138,6 +1142,10 @@ static void task_unlink_from_dsq(struct task_struct *p,
> list_del_init(&p->scx.dsq_list.node);
> dsq_mod_nr(dsq, -1);
>
> + /* Clear cpumask when queue becomes empty to prevent saturation */
> + if (dsq->nr == 0 && dsq->cpus_allowed)
> + cpumask_clear(dsq->cpus_allowed);
> +
> if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
> struct task_struct *first_task;
>
> @@ -1897,6 +1905,14 @@ static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
> if (list_empty(&dsq->list))
> return false;
>
> + /*
> + * O(1) optimization: Check if any task in the queue can run on this CPU.
> + * If the cpumask is allocated and this CPU is not in the allowed set,
> + * we can skip the entire queue without scanning.
> + */
> + if (dsq->cpus_allowed && !cpumask_test_cpu(cpu_of(rq), dsq->cpus_allowed))
> + return false;
> +
> raw_spin_lock(&dsq->lock);
>
> nldsq_for_each_task(p, dsq) {
> @@ -3397,6 +3413,9 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
> raw_spin_lock_init(&dsq->lock);
> INIT_LIST_HEAD(&dsq->list);
> dsq->id = dsq_id;
> +
> + /* Allocate cpumask for tracking allowed CPUs */
> + dsq->cpus_allowed = kzalloc(cpumask_size(), GFP_KERNEL);
I don't see the corresponding kfree() in the cleanup path.
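For example (untested, just to show the shape of it -- freeing only the
DSQ is no longer enough once it owns a separate allocation):

	static void free_dsq_rcu_callback(struct rcu_head *rcu)
	{
		struct scx_dispatch_q *dsq =
			container_of(rcu, struct scx_dispatch_q, rcu);

		kfree(dsq->cpus_allowed);
		kfree(dsq);
	}

and the teardown path would then use call_rcu(&dsq->rcu,
free_dsq_rcu_callback) or equivalent instead of freeing the DSQ directly.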
> }
>
> static void free_dsq_irq_workfn(struct irq_work *irq_work)
> --
> 2.51.0
>
Thanks,
-Andrea
Hi Andrea,

I have fixed those issues in v2:
https://lore.kernel.org/all/20260204093435.3915393-1-realwujing@gmail.com/

On Tue, Feb 03, 2026 at 09:37:14AM +0100, Andrea Righi wrote:
> Did you run some benchmarks / have some numbers?

I'm working on collecting more detailed benchmark numbers. However,
theoretically, the bitwise cpumask_or() should be much cheaper than a DSQ
scan that results in multiple cache misses during task structure
dereferencing, even for small queues.

> It's true that we save the O(N) scan when the DSQ has no eligible tasks,
> but we're adding cost on every enqueue: cpumask_or() on potentially large
> cpumasks can be expensive.
> ... for small queues or mixed workloads, the cpumask overhead probably
> exceeds the savings...

To minimize the enqueue overhead, I've optimized the dispatch_enqueue()
path in v2 (rough sketch at the end of this mail):

- Use cpumask_copy() instead of cpumask_or() when the task is the first
  one in the DSQ.
- Skip the cpumask_or() update if the DSQ's cpus_allowed mask is already
  full.

> The cpumask is only updated during enqueue and cleared when the queue
> empties. If a task's affinity changes while it's already in the queue
> (i.e., sched_setaffinity()), the cpus_allowed mask becomes stale.

Fixed in v2. I've added a hook in set_cpus_allowed_scx() to update the
DSQ's cpus_allowed mask whenever a task's affinity changes while it is
enqueued in a DSQ.

> I don't see the corresponding kfree() in the cleanup path.

Fixed in v2. I've added an RCU callback (free_dsq_rcu_callback) to
explicitly free dsq->cpus_allowed before freeing the DSQ structure itself.

Also, I've restricted the cpumask allocation to user-defined DSQs only, as
built-in DSQs (local, global, bypass) don't need this optimization.

Thanks,
Qiliang
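P.S. Roughly what the enqueue-path change looks like, simplified (not the
exact v2 hunk; it runs under dsq->lock, after the task has been added and
dsq->nr updated):

	if (dsq->cpus_allowed) {
		if (dsq->nr == 1)
			/* first task: copy is cheaper than OR into a cleared mask */
			cpumask_copy(dsq->cpus_allowed, p->cpus_ptr);
		else if (!cpumask_full(dsq->cpus_allowed))
			/* no point updating once every CPU is already allowed */
			cpumask_or(dsq->cpus_allowed, dsq->cpus_allowed,
				   p->cpus_ptr);
	}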