[PATCH v2] sched/ext: Add cpumask to skip unsuitable dispatch queues

Posted by Qiliang Yuan 3 days ago
Add a cpus_allowed cpumask to struct scx_dispatch_q to track the union
of affinity masks for all tasks enqueued in a user-defined DSQ. This
allows a CPU to quickly skip DSQs that contain no tasks runnable on the
current CPU, avoiding wasteful O(N) scans.

- Allocate/free cpus_allowed only for user-defined DSQs.
- Use free_dsq_rcu_callback to safely free the DSQ and its nested mask.
- Update the mask in dispatch_enqueue() using cpumask_copy() for the
  first task and cpumask_or() for subsequent ones. Skip updates if the
  mask is already full.
- Update the DSQ mask in set_cpus_allowed_scx() when a task's affinity
  changes while enqueued.
- Handle allocation failures in scx_bpf_create_dsq() to prevent memory
  leaks.

This optimization improves performance with many DSQs and tight affinity
constraints. The added bitwise overhead is significantly lower than the
cost of the cache misses incurred while iterating over tasks that cannot
run on the current CPU.
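
For reference, the mask maintenance described above can be modeled with a
standalone toy program (not kernel code: CPUs are bits of an unsigned long
and the toy_*() helpers are made up). The mask only grows while the DSQ is
non-empty, so a skip can never hide a runnable task; stale bits are only
false positives and are cleared once the queue drains:

#include <assert.h>

static unsigned long dsq_mask;	/* union of enqueued tasks' affinities */
static int dsq_nr;		/* number of enqueued tasks */

static void toy_enqueue(unsigned long affinity)
{
	if (++dsq_nr == 1)
		dsq_mask = affinity;		/* cpumask_copy() */
	else
		dsq_mask |= affinity;		/* cpumask_or() */
}

static void toy_dequeue(void)
{
	if (--dsq_nr == 0)
		dsq_mask = 0;			/* cpumask_clear() */
}

static int toy_cpu_can_skip(int cpu)
{
	return !(dsq_mask & (1UL << cpu));	/* !cpumask_test_cpu() */
}

int main(void)
{
	toy_enqueue(0x3);		/* task pinned to CPUs 0-1 */
	toy_enqueue(0xc);		/* task pinned to CPUs 2-3 */
	assert(!toy_cpu_can_skip(2));	/* CPU 2 must scan the DSQ */
	assert(toy_cpu_can_skip(5));	/* CPU 5 skips the scan entirely */

	toy_dequeue();			/* mask stays a (stale) superset */
	assert(!toy_cpu_can_skip(0));	/* false positive: CPU 0 still scans */
	toy_dequeue();			/* cleared only once the DSQ is empty */
	assert(toy_cpu_can_skip(0));
	return 0;
}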

Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
v2:
 - Fix memory leak by adding RCU callback to free dsq->cpus_allowed.
 - Handle affinity changes while task is in DSQ via set_cpus_allowed_scx().
 - Ensure dsq->cpus_allowed is only allocated for user DSQs.
 - Handle allocation failures in scx_bpf_create_dsq().
 - Optimize enqueue path by using cpumask_copy() for the first task and
   skipping OR if mask is already full.

 include/linux/sched/ext.h |  1 +
 kernel/sched/ext.c        | 68 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index bcb962d5ee7d..f20e57cf53a3 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -79,6 +79,7 @@ struct scx_dispatch_q {
 	struct rhash_head	hash_node;
 	struct llist_node	free_node;
 	struct rcu_head		rcu;
+	struct cpumask		*cpus_allowed; /* union of all tasks' allowed cpus */
 };
 
 /* scx_entity.flags */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index afe28c04d5aa..0ae3728e08b8 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1120,8 +1120,16 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 
 	if (is_local)
 		local_dsq_post_enq(dsq, p, enq_flags);
-	else
+	else {
+		/* Update cpumask to track union of all tasks' allowed CPUs */
+		if (dsq->cpus_allowed) {
+			if (dsq->nr == 1)
+				cpumask_copy(dsq->cpus_allowed, p->cpus_ptr);
+			else if (!cpumask_full(dsq->cpus_allowed))
+				cpumask_or(dsq->cpus_allowed, dsq->cpus_allowed, p->cpus_ptr);
+		}
 		raw_spin_unlock(&dsq->lock);
+	}
 }
 
 static void task_unlink_from_dsq(struct task_struct *p,
@@ -1138,6 +1146,10 @@ static void task_unlink_from_dsq(struct task_struct *p,
 	list_del_init(&p->scx.dsq_list.node);
 	dsq_mod_nr(dsq, -1);
 
+	/* Clear cpumask when queue becomes empty to prevent saturation */
+	if (dsq->nr == 0 && dsq->cpus_allowed)
+		cpumask_clear(dsq->cpus_allowed);
+
 	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
 		struct task_struct *first_task;
 
@@ -1897,6 +1909,14 @@ static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
 	if (list_empty(&dsq->list))
 		return false;
 
+	/*
+	 * O(1) optimization: Check if any task in the queue can run on this CPU.
+	 * If the cpumask is allocated and this CPU is not in the allowed set,
+	 * we can skip the entire queue without scanning.
+	 */
+	if (dsq->cpus_allowed && !cpumask_test_cpu(cpu_of(rq), dsq->cpus_allowed))
+		return false;
+
 	raw_spin_lock(&dsq->lock);
 
 	nldsq_for_each_task(p, dsq) {
@@ -2616,9 +2636,25 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 				 struct affinity_context *ac)
 {
 	struct scx_sched *sch = scx_root;
+	struct scx_dispatch_q *dsq;
 
 	set_cpus_allowed_common(p, ac);
 
+	/*
+	 * If the task is currently in a DSQ, update the DSQ's allowed mask.
+	 * As the task's affinity has changed, the DSQ's union mask must
+	 * be updated to reflect the new allowed CPUs.
+	 */
+	dsq = p->scx.dsq;
+	if (dsq && dsq->cpus_allowed) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&dsq->lock, flags);
+		if (p->scx.dsq == dsq)
+			cpumask_or(dsq->cpus_allowed, dsq->cpus_allowed, p->cpus_ptr);
+		raw_spin_unlock_irqrestore(&dsq->lock, flags);
+	}
+
 	/*
 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3390,13 +3426,29 @@ DEFINE_SCHED_CLASS(ext) = {
 #endif
 };
 
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+static int init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
 {
 	memset(dsq, 0, sizeof(*dsq));
 
 	raw_spin_lock_init(&dsq->lock);
 	INIT_LIST_HEAD(&dsq->list);
 	dsq->id = dsq_id;
+
+	/* Allocate cpumask for tracking allowed CPUs only for user DSQs */
+	if (!(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
+		dsq->cpus_allowed = kzalloc(cpumask_size(), GFP_KERNEL);
+		if (!dsq->cpus_allowed)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+static void free_dsq_rcu_callback(struct rcu_head *rcu)
+{
+	struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu);
+
+	kfree(dsq->cpus_allowed);
+	kfree(dsq);
 }
 
 static void free_dsq_irq_workfn(struct irq_work *irq_work)
@@ -3405,7 +3457,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
 	struct scx_dispatch_q *dsq, *tmp_dsq;
 
 	llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
-		kfree_rcu(dsq, rcu);
+		call_rcu(&dsq->rcu, free_dsq_rcu_callback);
 }
 
 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
@@ -6298,7 +6350,11 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 	if (!dsq)
 		return -ENOMEM;
 
-	init_dsq(dsq, dsq_id);
+	ret = init_dsq(dsq, dsq_id);
+	if (ret) {
+		kfree(dsq);
+		return ret;
+	}
 
 	rcu_read_lock();
 
@@ -6310,8 +6366,10 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 		ret = -ENODEV;
 
 	rcu_read_unlock();
-	if (ret)
+	if (ret) {
+		kfree(dsq->cpus_allowed);
 		kfree(dsq);
+	}
 	return ret;
 }
 
-- 
2.51.0
Re: [PATCH v2] sched/ext: Add cpumask to skip unsuitable dispatch queues
Posted by Tejun Heo 2 days, 13 hours ago
On Wed, Feb 04, 2026 at 04:34:18AM -0500, Qiliang Yuan wrote:
> Add a cpus_allowed cpumask to struct scx_dispatch_q to track the union
> of affinity masks for all tasks enqueued in a user-defined DSQ. This
> allows a CPU to quickly skip DSQs that contain no tasks runnable on the
> current CPU, avoiding wasteful O(N) scans.
> 
> - Allocate/free cpus_allowed only for user-defined DSQs.
> - Use free_dsq_rcu_callback to safely free the DSQ and its nested mask.
> - Update the mask in dispatch_enqueue() using cpumask_copy() for the
>   first task and cpumask_or() for subsequent ones. Skip updates if the
>   mask is already full.
> - Update the DSQ mask in set_cpus_allowed_scx() when a task's affinity
>   changes while enqueued.
> - Handle allocation failures in scx_bpf_create_dsq() to prevent memory
>   leaks.
> 
> This optimization improves performance with many DSQs and tight affinity
> constraints. The added bitwise overhead is significantly lower than the
> cost of the cache misses incurred while iterating over tasks that cannot
> run on the current CPU.
> 
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>

As Emil pointed out earlier, this adds overhead in the general path which
scales with the number of CPUs, and the benefit isn't that generic. Similar
optimizations can be done from the BPF side, and throwing a lot of tasks
with varying affinity restrictions into a single queue that is frequently
scanned by multiple CPUs is not scalable to begin with.

Thanks.

-- 
tejun
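
For context, one shape the BPF-side approach mentioned above could take is
to partition tasks into per-node DSQs at enqueue time, so that each CPU
only ever consumes a queue whose tasks it can actually run. The following
is a minimal, untested sketch, assuming the scx_bpf_dsq_insert() /
scx_bpf_dsq_move_to_local() kfunc names used by recent kernels (older trees
spell them scx_bpf_dispatch() / scx_bpf_consume()) and a made-up fixed
CPUS_PER_NODE topology:

/* pernode_sketch.bpf.c - hypothetical illustration, not a tested scheduler */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define NR_NODES	2	/* assumed topology; real code would query it */
#define CPUS_PER_NODE	8

static u64 node_dsq(s32 cpu)
{
	return cpu / CPUS_PER_NODE;	/* user DSQ id == node id */
}

s32 BPF_STRUCT_OPS_SLEEPABLE(pernode_init)
{
	s32 node, ret;

	/* One user DSQ per NUMA node, backed by node-local memory. */
	for (node = 0; node < NR_NODES; node++) {
		ret = scx_bpf_create_dsq(node, node);
		if (ret)
			return ret;
	}
	return 0;
}

void BPF_STRUCT_OPS(pernode_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Queue on the node the task last ran on; per-node affinity assumed. */
	scx_bpf_dsq_insert(p, node_dsq(scx_bpf_task_cpu(p)), SCX_SLICE_DFL,
			   enq_flags);
}

void BPF_STRUCT_OPS(pernode_dispatch, s32 cpu, struct task_struct *prev)
{
	/* Each CPU scans only its own node's DSQ, so no skip check is needed. */
	scx_bpf_dsq_move_to_local(node_dsq(cpu));
}

SEC(".struct_ops.link")
struct sched_ext_ops pernode_ops = {
	.enqueue	= (void *)pernode_enqueue,
	.dispatch	= (void *)pernode_dispatch,
	.init		= (void *)pernode_init,
	.name		= "pernode_sketch",
};

A real scheduler would also handle tasks whose affinity spans nodes (for
example by falling back to a shared DSQ), implement select_cpu and exit
handling, and derive the topology at load time instead of hard-coding it.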