Some kfuncs provided by sched_ext may need to operate on a struct rq,
but they can be invoked from various contexts, specifically from
different scx callbacks.
While some of these callbacks are invoked with a particular rq already
locked, others are not. This makes it impossible for a kfunc to reliably
determine whether it is safe to access a given rq, which can trigger
bugs or unsafe behavior; see [1] for an example.
To address this, track the currently locked rq whenever a sched_ext
callback is invoked via SCX_CALL_OP*().
This allows kfuncs that need to operate on an arbitrary rq to retrieve
the currently locked one and take the appropriate action, for example
acquiring the rq lock only when the target rq is not already held.
[1] https://lore.kernel.org/lkml/20250325140021.73570-1-arighi@nvidia.com/
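As an illustration (not part of this patch), a kfunc that needs to operate
on an arbitrary rq could consume scx_locked_rq() along the following lines.
scx_bpf_example_op() and update_example_state() are hypothetical names, and
the sketch assumes it lives in kernel/sched/ext.c next to scx_locked_rq():

/*
 * Sketch of a consumer of scx_locked_rq(); not part of this patch.
 */
static void update_example_state(struct rq *rq)
{
        lockdep_assert_rq_held(rq);
        /* ...update rq->scx state here... */
}

__bpf_kfunc void scx_bpf_example_op(s32 cpu)
{
        struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
        unsigned long flags;

        /*
         * If the callback already holds a different rq lock, bail out
         * instead of nesting a second rq lock (ABBA deadlock risk).
         */
        if (locked_rq && locked_rq != rq)
                return;

        /* Lock @rq only if the current context doesn't hold it already. */
        if (!locked_rq)
                raw_spin_rq_lock_irqsave(rq, flags);

        update_example_state(rq);

        if (!locked_rq)
                raw_spin_rq_unlock_irqrestore(rq, flags);
}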
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
include/linux/sched/ext.h | 1 +
kernel/sched/ext.c | 126 +++++++++++++++++++++++---------------
kernel/sched/ext_idle.c | 2 +-
3 files changed, 78 insertions(+), 51 deletions(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index f7545430a5482..9de57f2a284fd 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -149,6 +149,7 @@ struct sched_ext_entity {
s32 selected_cpu;
u32 kf_mask; /* see scx_kf_mask above */
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
+ struct rq *locked_rq; /* currently locked rq */
atomic_long_t ops_state;
struct list_head runnable_node; /* rq->scx.runnable_list */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bb0873411d798..a14a5c3bc38ac 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1116,8 +1116,32 @@ static void scx_kf_disallow(u32 mask)
current->scx.kf_mask &= ~mask;
}
-#define SCX_CALL_OP(mask, op, args...) \
+static inline void update_locked_rq(struct rq *rq)
+{
+ /*
+ * Check whether @rq is actually locked. This can help expose bugs
+ * or incorrect assumptions about the context in which a kfunc or
+ * callback is executed.
+ */
+ if (rq)
+ lockdep_assert_rq_held(rq);
+ current->scx.locked_rq = rq;
+ barrier();
+}
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ barrier();
+ return current->scx.locked_rq;
+}
+
+#define SCX_CALL_OP(mask, rq, op, args...) \
do { \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
scx_ops.op(args); \
@@ -1127,9 +1151,11 @@ do { \
} \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, args...) \
+#define SCX_CALL_OP_RET(mask, rq, op, args...) \
({ \
__typeof__(scx_ops.op(args)) __ret; \
+ \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
__ret = scx_ops.op(args); \
@@ -1151,31 +1177,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK(mask, rq, op, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, task, ##args); \
+ SCX_CALL_OP(mask, rq, op, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK_RET(mask, rq, op, task, args...) \
({ \
__typeof__(scx_ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ __ret = SCX_CALL_OP_RET(mask, rq, op, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(mask, rq, op, task0, task1, args...) \
({ \
__typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET(mask, rq, op, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -2174,7 +2200,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, rq, enqueue, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2269,7 +2295,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
add_nr_running(rq, 1);
if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, runnable, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2283,7 +2309,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
-static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
unsigned long opss;
@@ -2304,7 +2330,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
BUG();
case SCX_OPSS_QUEUED:
if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, dequeue, p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2337,7 +2363,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
return true;
}
- ops_dequeue(p, deq_flags);
+ ops_dequeue(rq, p, deq_flags);
/*
* A currently running task which is going off @rq first gets dequeued
@@ -2353,11 +2379,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
*/
if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, stopping, p, false);
}
if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, quiescent, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2377,7 +2403,7 @@ static void yield_task_scx(struct rq *rq)
struct task_struct *p = rq->curr;
if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, rq, yield, p, NULL);
else
p->scx.slice = 0;
}
@@ -2387,7 +2413,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
struct task_struct *from = rq->curr;
if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, rq, yield, from, to);
else
return false;
}
@@ -2945,7 +2971,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* emitted in switch_class().
*/
if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+ SCX_CALL_OP(SCX_KF_REST, rq, cpu_acquire, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2990,7 +3016,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+ SCX_CALL_OP(SCX_KF_DISPATCH, rq, dispatch, cpu_of(rq),
prev_on_scx ? prev : NULL);
flush_dispatch_buf(rq);
@@ -3104,7 +3130,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
* Core-sched might decide to execute @p before it is
* dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
- ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(rq, p);
}
@@ -3112,7 +3138,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, running, p);
clr_task_runnable(p, true);
@@ -3194,7 +3220,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
};
SCX_CALL_OP(SCX_KF_CPU_RELEASE,
- cpu_release, cpu_of(rq), &args);
+ rq, cpu_release, cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3207,7 +3233,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, stopping, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3345,7 +3371,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
* verifier.
*/
if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, NULL, core_sched_before,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3381,7 +3407,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, NULL,
select_cpu, p, prev_cpu, wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
@@ -3426,7 +3452,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* designation pointless. Cast it away when calling the operation.
*/
if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
+ SCX_CALL_OP_TASK(SCX_KF_REST, NULL, set_cpumask, p,
(struct cpumask *)p->cpus_ptr);
}
@@ -3440,9 +3466,9 @@ static void handle_hotplug(struct rq *rq, bool online)
scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, rq, cpu_online, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, rq, cpu_offline, cpu);
else
scx_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
"cpu %d going %s, exiting scheduler", cpu,
@@ -3545,7 +3571,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, tick, curr);
}
if (!curr->scx.slice)
@@ -3622,7 +3648,7 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, NULL, init_task, p, &args);
if (unlikely(ret)) {
ret = ops_sanitize_err("init_task", ret);
return ret;
@@ -3679,11 +3705,11 @@ static void scx_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, task_rq(p), enable, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ SCX_CALL_OP_TASK(SCX_KF_REST, task_rq(p), set_weight, p, p->scx.weight);
}
static void scx_disable_task(struct task_struct *p)
@@ -3692,7 +3718,7 @@ static void scx_disable_task(struct task_struct *p)
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, task_rq(p), disable, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -3721,7 +3747,7 @@ static void scx_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, task_rq(p), exit_task, p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3830,7 +3856,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, set_weight, p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3846,7 +3872,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
+ SCX_CALL_OP_TASK(SCX_KF_REST, rq, set_cpumask, p,
(struct cpumask *)p->cpus_ptr);
}
@@ -3908,7 +3934,7 @@ int scx_tg_online(struct task_group *tg)
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, NULL, cgroup_init,
tg->css.cgroup, &args);
if (ret)
ret = ops_sanitize_err("cgroup_init", ret);
@@ -3930,7 +3956,7 @@ void scx_tg_offline(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, cgroup_exit, tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
@@ -3963,7 +3989,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
continue;
if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, NULL, cgroup_prep_move,
p, from, css->cgroup);
if (ret)
goto err;
@@ -3977,7 +4003,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, cgroup_cancel_move, p,
p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -3996,7 +4022,7 @@ void scx_cgroup_move_task(struct task_struct *p)
* cgrp_moving_from set.
*/
if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, NULL, cgroup_move, p,
p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4016,7 +4042,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, cgroup_cancel_move, p,
p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -4030,7 +4056,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
if (scx_cgroup_enabled && tg->scx_weight != weight) {
if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, cgroup_set_weight,
tg_cgrp(tg), weight);
tg->scx_weight = weight;
}
@@ -4219,7 +4245,7 @@ static void scx_cgroup_exit(void)
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, cgroup_exit, css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4256,7 +4282,7 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, NULL, cgroup_init,
css->cgroup, &args);
if (ret) {
css_put(css);
@@ -4749,7 +4775,7 @@ static void scx_disable_workfn(struct kthread_work *work)
}
if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, exit, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
@@ -4955,7 +4981,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
if (SCX_HAS_OP(dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ SCX_CALL_OP(SCX_KF_REST, NULL, dump_task, dctx, p);
ops_dump_exit();
}
@@ -5002,7 +5028,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (SCX_HAS_OP(dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, NULL, dump, &dctx);
ops_dump_exit();
}
@@ -5059,7 +5085,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
used = seq_buf_used(&ns);
if (SCX_HAS_OP(dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ SCX_CALL_OP(SCX_KF_REST, NULL, dump_cpu, &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5315,7 +5341,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_idle_enable(ops);
if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, NULL, init);
if (ret) {
ret = ops_sanitize_err("init", ret);
cpus_read_unlock();
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 023ae6df5e8ca..9213989e72b4e 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -745,7 +745,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
* managed by put_prev_task_idle()/set_next_task_idle().
*/
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+ SCX_CALL_OP(SCX_KF_REST, rq, update_idle, cpu_of(rq), idle);
/*
* Update the idle masks:
--
2.49.0
Hello, Andrea.
On Sat, Apr 19, 2025 at 02:24:30PM +0200, Andrea Righi wrote:
> @@ -149,6 +149,7 @@ struct sched_ext_entity {
> s32 selected_cpu;
> u32 kf_mask; /* see scx_kf_mask above */
> struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
> + struct rq *locked_rq; /* currently locked rq */
Can this be a percpu variable? While rq is locked, current can't switch out
anyway and that way we don't have to increase the size of task. Note that
kf_tasks[] are different in that some ops may, at least theoretically,
sleep.
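A minimal sketch of the percpu variant being suggested, keeping the helper
names from the patch; the percpu slot itself is an assumption, not code
posted in this thread:

static DEFINE_PER_CPU(struct rq *, locked_rq);

static inline void update_locked_rq(struct rq *rq)
{
        /* Same sanity check as the patch: @rq, if set, must be held. */
        if (rq)
                lockdep_assert_rq_held(rq);
        this_cpu_write(locked_rq, rq);
}

static inline struct rq *scx_locked_rq(void)
{
        return this_cpu_read(locked_rq);
}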
> +static inline void update_locked_rq(struct rq *rq)
> +{
> + /*
> + * Check whether @rq is actually locked. This can help expose bugs
> + * or incorrect assumptions about the context in which a kfunc or
> + * callback is executed.
> + */
> + if (rq)
> + lockdep_assert_rq_held(rq);
> + current->scx.locked_rq = rq;
> + barrier();
As these conditions are program-order checks on the local CPU, I don't think
any barrier is necessary.
Thanks.
--
tejun
On Sat, Apr 19, 2025 at 07:34:16AM -1000, Tejun Heo wrote:
> Hello, Andrea.
>
> On Sat, Apr 19, 2025 at 02:24:30PM +0200, Andrea Righi wrote:
> > @@ -149,6 +149,7 @@ struct sched_ext_entity {
> > s32 selected_cpu;
> > u32 kf_mask; /* see scx_kf_mask above */
> > struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
> > + struct rq *locked_rq; /* currently locked rq */
>
> Can this be a percpu variable? While rq is locked, current can't switch out
> anyway and that way we don't have to increase the size of task. Note that
> kf_tasks[] are different in that some ops may, at least theoretically,
> sleep.
Yeah, I was debating between using a percpu variable or storing it in
current. I went with current just to stay consistent with kf_tasks.
But you're right about not increasing the size of the task, and as you
pointed out, we can’t switch if the rq is locked, so a percpu variable
should work. I’ll update that in v2.
>
> > +static inline void update_locked_rq(struct rq *rq)
> > +{
> > + /*
> > + * Check whether @rq is actually locked. This can help expose bugs
> > + * or incorrect assumptions about the context in which a kfunc or
> > + * callback is executed.
> > + */
> > + if (rq)
> > + lockdep_assert_rq_held(rq);
> > + current->scx.locked_rq = rq;
> > + barrier();
>
> As these conditions are program-order checks on the local CPU, I don't think
> any barrier is necessary.
Right, these are local CPU accesses only, I'll drop the barrier.
Thanks,
-Andrea
On Sat, Apr 19, 2025 at 10:10:13PM +0200, Andrea Righi wrote:
> On Sat, Apr 19, 2025 at 07:34:16AM -1000, Tejun Heo wrote:
> > Hello, Andrea.
> >
> > On Sat, Apr 19, 2025 at 02:24:30PM +0200, Andrea Righi wrote:
> > > @@ -149,6 +149,7 @@ struct sched_ext_entity {
> > > s32 selected_cpu;
> > > u32 kf_mask; /* see scx_kf_mask above */
> > > struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
> > > + struct rq *locked_rq; /* currently locked rq */
> >
> > Can this be a percpu variable? While rq is locked, current can't switch out
> > anyway and that way we don't have to increase the size of task. Note that
> > kf_tasks[] are different in that some ops may, at least theoretically,
> > sleep.
>
> Yeah, I was debating between using a percpu variable or storing it in
> current. I went with current just to stay consistent with kf_tasks.
>
> But you're right about not increasing the size of the task, and as you
> pointed out, we can’t switch if the rq is locked, so a percpu variable
> should work. I’ll update that in v2.
Hm... actually thinking more about this, a problem with the percpu variable
is that, if no rq is locked, we could move to a different CPU and end up
reading the wrong rq_locked via scx_locked_rq(). I don't think we want to
preempt_disable/enable all the callbacks just to fix this... Maybe storing
in current is a safer choice?
Thanks,
-Andrea
Hello,
On Sat, Apr 19, 2025 at 10:30:33PM +0200, Andrea Righi wrote:
> Hm... actually thinking more about this, a problem with the percpu variable
> is that, if no rq is locked, we could move to a different CPU and end up
> reading the wrong rq_locked via scx_locked_rq(). I don't think we want to
> preempt_disable/enable all the callbacks just to fix this... Maybe storing
> in current is a safer choice?
Hmm... I have a hard time imagining a timeline of events which would lead to
the wrong conclusion. The percpu variable is set only while an rq is locked
and cleared before the rq lock is released and thus can only be read as
non-NULL while the rq is locked by that CPU. Maybe I'm missing something.
Can you illustrate a timeline of events which would lead to the wrong
conclusion?
Thanks.
--
tejun
On Sun, Apr 20, 2025 at 07:44:12AM -1000, Tejun Heo wrote:
> Hello,
>
> On Sat, Apr 19, 2025 at 10:30:33PM +0200, Andrea Righi wrote:
> > Hm... actually thinking more about this, a problem with the percpu variable
> > is that, if no rq is locked, we could move to a different CPU and end up
> > reading the wrong rq_locked via scx_locked_rq(). I don't think we want to
> > preempt_disable/enable all the callbacks just to fix this... Maybe storing
> > in current is a safer choice?
>
> Hmm... I have a hard time imagining a timeline of events which would lead to
> the wrong conclusion. The percpu variable is set only while an rq is locked
> and cleared before the rq lock is released and thus can only be read as
> non-NULL while the rq is locked by that CPU. Maybe I'm missing something.
> Can you illustrate a timeline of events which would lead to the wrong
> conclusion?
Oh ok, I was only thinking of setting the percpu variable when we call
SCX_CALL_OP*(), but if we clear it after the scx op returns, then it should
work. If no rq is locked and we bounce to a different CPU, we'd still read
NULL, so it should always be correct.
Alright, I'll send a v2 with this logic and the percpu variable.
Thanks,
-Andrea
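That direction would look roughly like the sketch below, assuming the macro
body from this patch plus a percpu-backed update_locked_rq(); this is the
idea under discussion, not the actual v2:

#define SCX_CALL_OP(mask, rq, op, args...)                              \
do {                                                                    \
        update_locked_rq(rq);                                           \
        if (mask) {                                                     \
                scx_kf_allow(mask);                                     \
                scx_ops.op(args);                                       \
                scx_kf_disallow(mask);                                  \
        } else {                                                        \
                scx_ops.op(args);                                       \
        }                                                               \
        update_locked_rq(NULL); /* cleared after the op returns */      \
} while (0)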
On Sat, Apr 19, 2025 at 10:30:37PM +0200, Andrea Righi wrote:
> On Sat, Apr 19, 2025 at 10:10:13PM +0200, Andrea Righi wrote:
> > On Sat, Apr 19, 2025 at 07:34:16AM -1000, Tejun Heo wrote:
> > > Hello, Andrea.
> > >
> > > On Sat, Apr 19, 2025 at 02:24:30PM +0200, Andrea Righi wrote:
> > > > @@ -149,6 +149,7 @@ struct sched_ext_entity {
> > > > s32 selected_cpu;
> > > > u32 kf_mask; /* see scx_kf_mask above */
> > > > struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
> > > > + struct rq *locked_rq; /* currently locked rq */
> > >
> > > Can this be a percpu variable? While rq is locked, current can't switch out
> > > anyway and that way we don't have to increase the size of task. Note that
> > > kf_tasks[] are different in that some ops may, at least theoretically,
> > > sleep.
> >
> > Yeah, I was debating between using a percpu variable or storing it in
> > current. I went with current just to stay consistent with kf_tasks.
> >
> > But you're right about not increasing the size of the task, and as you
> > pointed out, we can’t switch if the rq is locked, so a percpu variable
> > should work. I’ll update that in v2.
>
> Hm... actually thinking more about this, a problem with the percpu variable
> is that, if no rq is locked, we could move to a different CPU and end up
> reading the wrong rq_locked via scx_locked_rq(). I don't think we want to
> preempt_disable/enable all the callbacks just to fix this... Maybe storing
> in current is a safer choice?
And if we don't want to increase the size of sched_ext_entity, we could
store the CPU of the currently locked rq right before "disallow", like:
struct sched_ext_entity {
struct scx_dispatch_q * dsq; /* 0 8 */
struct scx_dsq_list_node dsq_list; /* 8 24 */
struct rb_node dsq_priq __attribute__((__aligned__(8))); /* 32 24 */
u32 dsq_seq; /* 56 4 */
u32 dsq_flags; /* 60 4 */
/* --- cacheline 1 boundary (64 bytes) --- */
u32 flags; /* 64 4 */
u32 weight; /* 68 4 */
s32 sticky_cpu; /* 72 4 */
s32 holding_cpu; /* 76 4 */
s32 selected_cpu; /* 80 4 */
u32 kf_mask; /* 84 4 */
struct task_struct * kf_tasks[2]; /* 88 16 */
atomic_long_t ops_state; /* 104 8 */
struct list_head runnable_node; /* 112 16 */
/* --- cacheline 2 boundary (128 bytes) --- */
long unsigned int runnable_at; /* 128 8 */
u64 core_sched_at; /* 136 8 */
u64 ddsp_dsq_id; /* 144 8 */
u64 ddsp_enq_flags; /* 152 8 */
u64 slice; /* 160 8 */
u64 dsq_vtime; /* 168 8 */
int locked_cpu; /* 176 4 */
bool disallow; /* 180 1 */
/* XXX 3 bytes hole, try to pack */
struct cgroup * cgrp_moving_from; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct list_head tasks_node; /* 192 16 */
/* size: 208, cachelines: 4, members: 24 */
/* sum members: 205, holes: 1, sum holes: 3 */
/* forced alignments: 1 */
/* last cacheline: 16 bytes */
} __attribute__((__aligned__(8)));
(before the hole was 7 bytes)
Then use cpu_rq()/cpu_of() to resolve that to/from the corresponding rq.
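A sketch of that conversion, assuming locked_cpu is initialized to -1 when
no rq is locked (helper bodies are illustrative, not posted code):

static inline void update_locked_rq(struct rq *rq)
{
        if (rq)
                lockdep_assert_rq_held(rq);
        /* -1 means "no rq locked"; assumes the field is set up at fork. */
        current->scx.locked_cpu = rq ? cpu_of(rq) : -1;
}

static inline struct rq *scx_locked_rq(void)
{
        int cpu = current->scx.locked_cpu;

        return cpu >= 0 ? cpu_rq(cpu) : NULL;
}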
-Andrea