Properly implement ops.dequeue() to ensure every ops.enqueue() is
balanced by a corresponding ops.dequeue() call, regardless of whether
the task is on a BPF data structure or already dispatched to a DSQ.

A task is considered enqueued when it is owned by the BPF scheduler.
This ownership persists until the task is either dispatched (moved to a
local DSQ for execution) or removed from the BPF scheduler, such as when
it blocks waiting for an event or when its properties (for example, CPU
affinity or priority) are updated.

When the task enters the BPF scheduler, ops.enqueue() is invoked; when it
leaves BPF scheduler ownership, ops.dequeue() is invoked.

This allows BPF schedulers to reliably track task ownership and maintain
accurate accounting.
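
For example (illustrative only, not part of this patch), a scheduler
built on the usual scx BPF boilerplate (BPF_STRUCT_OPS(),
scx_bpf_dsq_insert()) can now keep a balanced count of the tasks it
currently has enqueued:

  /*
   * Hypothetical accounting sketch: nr_enqueued stays accurate because
   * every ops.enqueue() is now paired with an ops.dequeue().
   */
  static u64 nr_enqueued;

  void BPF_STRUCT_OPS(acct_enqueue, struct task_struct *p, u64 enq_flags)
  {
  	__sync_fetch_and_add(&nr_enqueued, 1);
  	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
  }

  void BPF_STRUCT_OPS(acct_dequeue, struct task_struct *p, u64 deq_flags)
  {
  	__sync_fetch_and_sub(&nr_enqueued, 1);
  }
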
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
include/linux/sched/ext.h | 1 +
kernel/sched/ext.c | 27 ++++++++++++++++++++++++++-
3 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 404fe6126a769..3ed4be53f97da 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
* Queue the task on the BPF side.
+ Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
+ is owned by the BPF scheduler. Ownership is retained until the task is
+ either dispatched (moved to a local DSQ for execution) or dequeued
+ (removed from the scheduler due to a blocking event, or to modify a
+ property, like CPU affinity, priority, etc.). When the task leaves the
+ BPF scheduler, ``ops.dequeue()`` is invoked.
+
+ **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
+ regardless of whether the task is still on a BPF data structure, or it
+ is already dispatched to a DSQ (global, local, or user DSQ).
+
+ This guarantees that every ``ops.enqueue()`` will eventually be followed
+ by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
+ track task ownership and maintain accurate accounting, such as per-DSQ
+ queued runtime sums.
+
+ BPF schedulers can choose not to implement ``ops.dequeue()`` if they
+ don't need to track these transitions. The sched_ext core will safely
+ handle all dequeue operations regardless.
+
3. When a CPU is ready to schedule, it first looks at its local DSQ. If
empty, it then looks at the global DSQ. If there still isn't a task to
run, ``ops.dispatch()`` is invoked which can use the following two
@@ -319,6 +339,8 @@ by a sched_ext scheduler:
/* Any usable CPU becomes available */
ops.dispatch(); /* Task is moved to a local DSQ */
+
+ ops.dequeue(); /* Exiting BPF scheduler */
}
ops.running(); /* Task starts running on its assigned CPU */
while (task->scx.slice > 0 && task is runnable)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index bcb962d5ee7d8..334c3692a9c62 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -84,6 +84,7 @@ struct scx_dispatch_q {
/* scx_entity.flags */
enum scx_ent_flags {
SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
+ SCX_TASK_OPS_ENQUEUED = 1 << 1, /* ops.enqueue() was called */
SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94164f2dec6dc..985d75d374385 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
+ /* Mark that ops.enqueue() is being called for this task */
+ p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
+
ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
@@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
switch (opss & SCX_OPSS_STATE_MASK) {
case SCX_OPSS_NONE:
+ /*
+ * Task is not currently being enqueued or queued on the BPF
+ * scheduler. Check if ops.enqueue() was called for this task.
+ */
+ if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
+ SCX_HAS_OP(sch, dequeue)) {
+ /*
+ * ops.enqueue() was called and the task was dispatched.
+ * Call ops.dequeue() to notify the BPF scheduler that
+ * the task is leaving.
+ */
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+ p, deq_flags);
+ p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
+ }
break;
case SCX_OPSS_QUEUEING:
/*
@@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(sch, dequeue))
+ /*
+ * Task is owned by the BPF scheduler. Call ops.dequeue()
+ * to notify the BPF scheduler that the task is being
+ * removed.
+ */
+ if (SCX_HAS_OP(sch, dequeue)) {
SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
p, deq_flags);
+ p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
+ }
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
--
2.52.0
Sorry about the million replies. Pretty squirrel brained right now.

On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> 	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> 	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
>
> +	/* Mark that ops.enqueue() is being called for this task */
> +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;

Is this guaranteed to be cleared after dispatch? ops_dequeue() is called
from dequeue_task_scx() and set_next_task_scx(). It looks like the call from
set_next_task_scx() may end up calling ops.dequeue() when the task starts
running, this seems mostly accidental.

- The BPF sched probably expects ops.dequeue() call immediately after
  dispatch rather than on the running transition. e.g. imagine a scenario
  where a BPF sched dispatches multiple tasks to a local DSQ. Wouldn't the
  expectation be that ops.dequeue() is called as soon as a task is
  dispatched into a local DSQ?

- If this depends on the ops_dequeue() call from set_next_task_scx(), it'd
  also be using the wrong DEQ flag - SCX_DEQ_CORE_SCHED_EXEC - for regular
  ops.dequeue() following a dispatch. That call there is that way only
  because ops_dequeue() didn't do anything when OPSS_NONE.

Thanks.

--
tejun

On Sun, Dec 28, 2025 at 02:06:19PM -1000, Tejun Heo wrote:
> Sorry about the million replies. Pretty squirrel brained right now.
>
> On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> > @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> > 	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> > 	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
> >
> > +	/* Mark that ops.enqueue() is being called for this task */
> > +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
>
> Is this guaranteed to be cleared after dispatch? ops_dequeue() is called
> from dequeue_task_scx() and set_next_task_scx(). It looks like the call from
> set_next_task_scx() may end up calling ops.dequeue() when the task starts
> running, this seems mostly accidental.
>
> - The BPF sched probably expects ops.dequeue() call immediately after
>   dispatch rather than on the running transition. e.g. imagine a scenario
>   where a BPF sched dispatches multiple tasks to a local DSQ. Wouldn't the
>   expectation be that ops.dequeue() is called as soon as a task is
>   dispatched into a local DSQ?
>
> - If this depends on the ops_dequeue() call from set_next_task_scx(), it'd
>   also be using the wrong DEQ flag - SCX_DEQ_CORE_SCHED_EXEC - for regular
>   ops.dequeue() following a dispatch. That call there is that way only
>   because ops_dequeue() didn't do anything when OPSS_NONE.

You're right, the flag should be cleared and ops.dequeue() should be
called immediately when the async dispatch completes and the task is
inserted into the DSQ. I'll add an explicit ops.dequeue() call in the
dispatch completion path.

Thanks,
-Andrea

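P.S. For concreteness, roughly what I have in mind in the dispatch
completion path (sketch only; the exact call site and the deq_flags to
pass are still TBD):

	/*
	 * Once the task has landed in its target DSQ: if ops.enqueue()
	 * was called and the scheduler implements ops.dequeue(), notify
	 * it that the task left the enqueued state.
	 */
	if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
	    SCX_HAS_OP(sch, dequeue)) {
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, 0);
		p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
	}
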
Hello,

On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> + Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> + is owned by the BPF scheduler. Ownership is retained until the task is

Can we avoid using "ownership" for this? From user's POV, this is fine but
kernel side internally uses the word for different purposes - e.g. we say
the BPF side owns the task if the task's SCX_OPSS_QUEUED is set (ie. it's on
BPF data structure, not on a DSQ). Here, the ownership encompasses both
kernel-side and BPF-side queueing, so the term becomes rather confusing.
Maybe we can stick with "queued" or "enqueued"?

Thanks.

--
tejun

Hi,

On Sun, Dec 28, 2025 at 01:42:28PM -1000, Tejun Heo wrote:
> Hello,
>
> On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> > + Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> > + is owned by the BPF scheduler. Ownership is retained until the task is
>
> Can we avoid using "ownership" for this? From user's POV, this is fine but
> kernel side internally uses the word for different purposes - e.g. we say
> the BPF side owns the task if the task's SCX_OPSS_QUEUED is set (ie. it's on
> BPF data structure, not on a DSQ). Here, the ownership encompasses both
> kernel-side and BPF-side queueing, so the term becomes rather confusing.
> Maybe we can stick with "queued" or "enqueued"?

Agreed. I can't find a better term to describe this phase of the
lifecycle, where ops.enqueue() has been called and the task remains in
that state until the corresponding ops.dequeue() occurs (either due to a
"dispatch" dequeue or "real" dequeue). So maybe we should stick with
"enqueued" and clarify exactly what this state means.

Thanks,
-Andrea

Hello, Andrea.

On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
...
> + Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> + is owned by the BPF scheduler. Ownership is retained until the task is
> + either dispatched (moved to a local DSQ for execution) or dequeued
> + (removed from the scheduler due to a blocking event, or to modify a
> + property, like CPU affinity, priority, etc.). When the task leaves the
> + BPF scheduler, ``ops.dequeue()`` is invoked.
> +
> + **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
> + regardless of whether the task is still on a BPF data structure, or it
> + is already dispatched to a DSQ (global, local, or user DSQ).
> +
> + This guarantees that every ``ops.enqueue()`` will eventually be followed
> + by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
> + track task ownership and maintain accurate accounting, such as per-DSQ
> + queued runtime sums.

While this works, from the BPF sched's POV, there's no way to tell whether
an ops.dequeue() call is from the task being actually dequeued or the
follow-up to the dispatch operation it just did. This won't make much
difference if ops.dequeue() is just used for accounting purposes, but, a
scheduler which uses an arena data structure for queueing would likely need
to perform extra tests to tell whether the task needs to be dequeued from
the arena side. I *think* hot path (ops.dequeue() following the task's
dispatch) can be a simple lockless test, so this may be okay, but from API
POV, it can probably be better.

The counter interlocking point is scx_bpf_dsq_insert(). If we can
synchronize scx_bpf_dsq_insert() and dequeue so that ops.dequeue() is not
called for a successfully inserted task, I think the semantics would be
neater - an enqueued task is either dispatched or dequeued. Due to the async
dispatch operation, this likely is difficult to do without adding extra sync
operations in scx_bpf_dsq_insert(). However, I *think* we may be able to get
rid of dspc and async inserting if we call ops.dispatch() w/ rq lock
dropped. That may make the whole dispatch path simpler and the behavior
neater too. What do you think?

Thanks.

--
tejun

Hello,

On Sun, Dec 28, 2025 at 07:19:46AM -1000, Tejun Heo wrote:
> While this works, from the BPF sched's POV, there's no way to tell whether
> an ops.dequeue() call is from the task being actually dequeued or the
> follow-up to the dispatch operation it just did. This won't make much
> difference if ops.dequeue() is just used for accounting purposes, but, a
> scheduler which uses an arena data structure for queueing would likely need
> to perform extra tests to tell whether the task needs to be dequeued from
> the arena side. I *think* hot path (ops.dequeue() following the task's
> dispatch) can be a simple lockless test, so this may be okay, but from API
> POV, it can probably be better.
>
> The counter interlocking point is scx_bpf_dsq_insert(). If we can
> synchronize scx_bpf_dsq_insert() and dequeue so that ops.dequeue() is not
> called for a successfully inserted task, I think the semantics would be
> neater - an enqueued task is either dispatched or dequeued. Due to the async
> dispatch operation, this likely is difficult to do without adding extra sync
> operations in scx_bpf_dsq_insert(). However, I *think* we may be able to get
> rid of dspc and async inserting if we call ops.dispatch() w/ rq lock
> dropped. That may make the whole dispatch path simpler and the behavior
> neater too. What do you think?

I sat down and went through the code to see whether I was actually making
sense, and I wasn't: The async dispatch buffering is necessary to avoid
lock inversion between rq lock and whatever locks the BPF scheduler might
be using internally. This is necessary because enqueue path runs with rq
lock held. Thus, any lock that BPF sched uses in the enqueue path has to
nest inside rq lock. In dispatch, scx_bpf_dsq_insert() is likely to be
called with the same BPF sched side lock held. If we try to do rq lock
dancing synchronously, we can end up trying to grab rq lock while holding
BPF side lock, leading to deadlock. Kernel side has no control over BPF
side locking, so the asynchronous operation is there to side-step the
issue. I don't see a good way to make this synchronous.

So, please ignore that part. That's non-sense. I still wonder whether we can
create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
without making hot path slower. I'll think more about it.

Thanks.

--
tejun

Hello again, again.

On Sun, Dec 28, 2025 at 01:28:04PM -1000, Tejun Heo wrote:
...
> So, please ignore that part. That's non-sense. I still wonder whether we can
> create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
> without making hot path slower. I'll think more about it.

And we can't create an interlocking between scx_bpf_dsq_insert() and
ops.dequeue() without adding extra atomic operations in hot paths. The only
thing shared is task rq lock and dispatch path can't do that synchronously.
So, yeah, it looks like the best we can do is always letting the BPF sched
know and let it figure out locking and whether the task needs to be
dequeued from BPF side.

Thanks.

--
tejun

Hi Tejun,

On Sun, Dec 28, 2025 at 01:38:01PM -1000, Tejun Heo wrote:
> Hello again, again.
>
> On Sun, Dec 28, 2025 at 01:28:04PM -1000, Tejun Heo wrote:
> ...
> > So, please ignore that part. That's non-sense. I still wonder whether we can
> > create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
> > without making hot path slower. I'll think more about it.
>
> And we can't create an interlocking between scx_bpf_dsq_insert() and
> ops.dequeue() without adding extra atomic operations in hot paths. The only
> thing shared is task rq lock and dispatch path can't do that synchronously.
> So, yeah, it looks like the best we can do is always letting the BPF sched
> know and let it figure out locking and whether the task needs to be
> dequeued from BPF side.

How about setting a flag in deq_flags to distinguish between a "dispatch"
dequeue vs a real dequeue (due to property changes or other reasons)?

We should be able to pass this information in a reliable way without any
additional synchronization in the hot paths. This would let schedulers that
use arena data structures check the flag instead of doing their own
internal lookups.

And it would also allow us to provide both semantics:
1) Catch real dequeues that need special BPF-side actions (check the flag)
2) Track all ops.enqueue()/ops.dequeue() pairs for accounting purposes
   (ignore the flag)

Thanks,
-Andrea

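P.S. Sketching the BPF side (SCX_DEQ_DISPATCH is a hypothetical flag
name, and the two helpers are placeholders for scheduler-specific code):

	void BPF_STRUCT_OPS(sched_dequeue, struct task_struct *p, u64 deq_flags)
	{
		if (deq_flags & SCX_DEQ_DISPATCH) {
			/* follow-up to a dispatch: update accounting only */
			account_dispatch(p);
		} else {
			/* real dequeue: drop the task from BPF-side structures */
			arena_remove(p);
		}
	}
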
On Mon Dec 29, 2025 at 12:07 PM EST, Andrea Righi wrote:
> Hi Tejun,
>
> On Sun, Dec 28, 2025 at 01:38:01PM -1000, Tejun Heo wrote:
>> Hello again, again.
>>
>> On Sun, Dec 28, 2025 at 01:28:04PM -1000, Tejun Heo wrote:
>> ...
>> > So, please ignore that part. That's non-sense. I still wonder whether we can
>> > create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
>> > without making hot path slower. I'll think more about it.
>>
>> And we can't create an interlocking between scx_bpf_dsq_insert() and
>> ops.dequeue() without adding extra atomic operations in hot paths. The only
>> thing shared is task rq lock and dispatch path can't do that synchronously.
>> So, yeah, it looks like the best we can do is always letting the BPF sched
>> know and let it figure out locking and whether the task needs to be
>> dequeued from BPF side.
>
> How about setting a flag in deq_flags to distinguish between a "dispatch"
> dequeue vs a real dequeue (due to property changes or other reasons)?
>
> We should be able to pass this information in a reliable way without any
> additional synchronization in the hot paths. This would let schedulers that
> use arena data structures check the flag instead of doing their own
> internal lookups.
>
> And it would also allow us to provide both semantics:
> 1) Catch real dequeues that need special BPF-side actions (check the flag)
> 2) Track all ops.enqueue()/ops.dequeue() pairs for accounting purposes
>    (ignore the flag)

IMO the extra flag suffices for arena-based queueing; the arena data
structures have to track the state of the task already. Even without the
flag it should be possible to infer the state the task is in from inside
the BPF code. For example, calling .dequeue() while the task is in an
arena queue means the task got dequeued _after_ being dispatched, while
calling .dequeue() on a queued task means we are removing it because of a
true dequeue event (e.g. sched_setaffinity() was called).

The only edge case in the logic is if a true dequeue event happens between
.dispatch() and .dequeue(), but a new flag would take care of that.

> Thanks,
> -Andrea

On Fri Dec 19, 2025 at 5:43 PM EST, Andrea Righi wrote:
> Properly implement ops.dequeue() to ensure every ops.enqueue() is
> balanced by a corresponding ops.dequeue() call, regardless of whether
> the task is on a BPF data structure or already dispatched to a DSQ.
>
> A task is considered enqueued when it is owned by the BPF scheduler.
> This ownership persists until the task is either dispatched (moved to a
> local DSQ for execution) or removed from the BPF scheduler, such as when
> it blocks waiting for an event or when its properties (for example, CPU
> affinity or priority) are updated.
>
> When the task enters the BPF scheduler, ops.enqueue() is invoked; when it
> leaves BPF scheduler ownership, ops.dequeue() is invoked.
>
> This allows BPF schedulers to reliably track task ownership and maintain
> accurate accounting.
>
> Cc: Emil Tsalapatis <emil@etsalapatis.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
Hi Andrea,
This change looks reasonable to me. Some comments inline:
> Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
> include/linux/sched/ext.h | 1 +
> kernel/sched/ext.c | 27 ++++++++++++++++++++++++++-
> 3 files changed, 49 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
> index 404fe6126a769..3ed4be53f97da 100644
> --- a/Documentation/scheduler/sched-ext.rst
> +++ b/Documentation/scheduler/sched-ext.rst
> @@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
>
> * Queue the task on the BPF side.
>
> + Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> + is owned by the BPF scheduler. Ownership is retained until the task is
> + either dispatched (moved to a local DSQ for execution) or dequeued
> + (removed from the scheduler due to a blocking event, or to modify a
> + property, like CPU affinity, priority, etc.). When the task leaves the
> + BPF scheduler, ``ops.dequeue()`` is invoked.
> +
Can we say "leaves the scx class" instead? On direct dispatch we
technically never insert the task into the BPF scheduler.
> + **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
> + regardless of whether the task is still on a BPF data structure, or it
> + is already dispatched to a DSQ (global, local, or user DSQ).
> +
> + This guarantees that every ``ops.enqueue()`` will eventually be followed
> + by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
> + track task ownership and maintain accurate accounting, such as per-DSQ
> + queued runtime sums.
> +
> + BPF schedulers can choose not to implement ``ops.dequeue()`` if they
> + don't need to track these transitions. The sched_ext core will safely
> + handle all dequeue operations regardless.
> +
> 3. When a CPU is ready to schedule, it first looks at its local DSQ. If
> empty, it then looks at the global DSQ. If there still isn't a task to
> run, ``ops.dispatch()`` is invoked which can use the following two
> @@ -319,6 +339,8 @@ by a sched_ext scheduler:
> /* Any usable CPU becomes available */
>
> ops.dispatch(); /* Task is moved to a local DSQ */
> +
> + ops.dequeue(); /* Exiting BPF scheduler */
> }
> ops.running(); /* Task starts running on its assigned CPU */
> while (task->scx.slice > 0 && task is runnable)
> diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> index bcb962d5ee7d8..334c3692a9c62 100644
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -84,6 +84,7 @@ struct scx_dispatch_q {
> /* scx_entity.flags */
> enum scx_ent_flags {
> SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
> + SCX_TASK_OPS_ENQUEUED = 1 << 1, /* ops.enqueue() was called */
Can we rename this flag? For direct dispatch we never got enqueued.
Something like "DEQ_ON_DISPATCH" would show the purpose of the
flag more clearly.
> SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
> SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 94164f2dec6dc..985d75d374385 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
>
> + /* Mark that ops.enqueue() is being called for this task */
> + p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
> +
Can we avoid setting this flag when we have no .dequeue() method?
Otherwise it stays set forever AFAICT, even after the task has been
sent to the runqueues.
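Something like this, maybe (untested sketch):

	if (SCX_HAS_OP(sch, dequeue))
		p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
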
> ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
> WARN_ON_ONCE(*ddsp_taskp);
> *ddsp_taskp = p;
> @@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>
> switch (opss & SCX_OPSS_STATE_MASK) {
> case SCX_OPSS_NONE:
> + /*
> + * Task is not currently being enqueued or queued on the BPF
> + * scheduler. Check if ops.enqueue() was called for this task.
> + */
> + if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
> + SCX_HAS_OP(sch, dequeue)) {
> + /*
> + * ops.enqueue() was called and the task was dispatched.
> + * Call ops.dequeue() to notify the BPF scheduler that
> + * the task is leaving.
> + */
> + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> + p, deq_flags);
> + p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> + }
> break;
> case SCX_OPSS_QUEUEING:
> /*
> @@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> */
> BUG();
> case SCX_OPSS_QUEUED:
> - if (SCX_HAS_OP(sch, dequeue))
> + /*
> + * Task is owned by the BPF scheduler. Call ops.dequeue()
> + * to notify the BPF scheduler that the task is being
> + * removed.
> + */
> + if (SCX_HAS_OP(sch, dequeue)) {
Edge case, but if we have a .dequeue() method but not an .enqueue() we
still make this call. Can we add flags & SCX_TASK_OPS_ENQUEUED as an
extra condition to be consistent with the SCX_OPSS_NONE case above?
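i.e. mirroring the SCX_OPSS_NONE case above (untested sketch):

	if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
	    SCX_HAS_OP(sch, dequeue)) {
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
				 p, deq_flags);
		p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
	}
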
> SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> p, deq_flags);
> + p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> + }
>
> if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> SCX_OPSS_NONE))
Hi Emil,
On Sat, Dec 27, 2025 at 10:20:06PM -0500, Emil Tsalapatis wrote:
> On Fri Dec 19, 2025 at 5:43 PM EST, Andrea Righi wrote:
> > Properly implement ops.dequeue() to ensure every ops.enqueue() is
> > balanced by a corresponding ops.dequeue() call, regardless of whether
> > the task is on a BPF data structure or already dispatched to a DSQ.
> >
> > A task is considered enqueued when it is owned by the BPF scheduler.
> > This ownership persists until the task is either dispatched (moved to a
> > local DSQ for execution) or removed from the BPF scheduler, such as when
> > it blocks waiting for an event or when its properties (for example, CPU
> > affinity or priority) are updated.
> >
> > When the task enters the BPF scheduler, ops.enqueue() is invoked; when it
> > leaves BPF scheduler ownership, ops.dequeue() is invoked.
> >
> > This allows BPF schedulers to reliably track task ownership and maintain
> > accurate accounting.
> >
> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> > ---
>
>
> Hi Andrea,
>
> This change looks reasonable to me. Some comments inline:
>
> > Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
> > include/linux/sched/ext.h | 1 +
> > kernel/sched/ext.c | 27 ++++++++++++++++++++++++++-
> > 3 files changed, 49 insertions(+), 1 deletion(-)
> >
> > diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
> > index 404fe6126a769..3ed4be53f97da 100644
> > --- a/Documentation/scheduler/sched-ext.rst
> > +++ b/Documentation/scheduler/sched-ext.rst
> > @@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
> >
> > * Queue the task on the BPF side.
> >
> > + Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> > + is owned by the BPF scheduler. Ownership is retained until the task is
> > + either dispatched (moved to a local DSQ for execution) or dequeued
> > + (removed from the scheduler due to a blocking event, or to modify a
> > + property, like CPU affinity, priority, etc.). When the task leaves the
> > + BPF scheduler, ``ops.dequeue()`` is invoked.
> > +
>
> Can we say "leaves the scx class" instead? On direct dispatch we
> technically never insert the task into the BPF scheduler.
Hm.. I agree that'd be more accurate, but it might also be slightly
misleading, as it could be interpreted as the task being moved to a
different scheduling class. How about saying "leaves the enqueued state"
instead, where enqueued means ops.enqueue() being called... I can't find a
better name for this state, like "ops_enqueued", but that'd be even more
confusing. :)
>
> > + **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
> > + regardless of whether the task is still on a BPF data structure, or it
> > + is already dispatched to a DSQ (global, local, or user DSQ).
> > +
> > + This guarantees that every ``ops.enqueue()`` will eventually be followed
> > + by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
> > + track task ownership and maintain accurate accounting, such as per-DSQ
> > + queued runtime sums.
> > +
> > + BPF schedulers can choose not to implement ``ops.dequeue()`` if they
> > + don't need to track these transitions. The sched_ext core will safely
> > + handle all dequeue operations regardless.
> > +
> > 3. When a CPU is ready to schedule, it first looks at its local DSQ. If
> > empty, it then looks at the global DSQ. If there still isn't a task to
> > run, ``ops.dispatch()`` is invoked which can use the following two
> > @@ -319,6 +339,8 @@ by a sched_ext scheduler:
> > /* Any usable CPU becomes available */
> >
> > ops.dispatch(); /* Task is moved to a local DSQ */
> > +
> > + ops.dequeue(); /* Exiting BPF scheduler */
> > }
> > ops.running(); /* Task starts running on its assigned CPU */
> > while (task->scx.slice > 0 && task is runnable)
> > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> > index bcb962d5ee7d8..334c3692a9c62 100644
> > --- a/include/linux/sched/ext.h
> > +++ b/include/linux/sched/ext.h
> > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
> > /* scx_entity.flags */
> > enum scx_ent_flags {
> > SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
> > + SCX_TASK_OPS_ENQUEUED = 1 << 1, /* ops.enqueue() was called */
>
> Can we rename this flag? For direct dispatch we never got enqueued.
> Something like "DEQ_ON_DISPATCH" would show the purpose of the
> flag more clearly.
Good point. However, ops.dequeue() isn't only called on dispatch, it can
also be triggered when a task property is changed.
So the flag should represent the "enqueued state" in the sense that
ops.enqueue() has been called and a corresponding ops.dequeue() is
expected. This is a lifecycle state, not an indication that the task is in
any queue.
Would a more descriptive comment clarify this? Something like:
SCX_TASK_OPS_ENQUEUED = 1 << 1, /* Task in enqueued state: ops.enqueue()
called, ops.dequeue() will be called
when task leaves this state. */
>
> > SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
> > SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
> >
> > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> > index 94164f2dec6dc..985d75d374385 100644
> > --- a/kernel/sched/ext.c
> > +++ b/kernel/sched/ext.c
> > @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> > WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> > atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
> >
> > + /* Mark that ops.enqueue() is being called for this task */
> > + p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
> > +
>
> Can we avoid setting this flag when we have no .dequeue() method?
> Otherwise it stays set forever AFAICT, even after the task has been
> sent to the runqueues.
Good catch! Definitely we don't need to set this for schedulers that don't
implement ops.dequeue().
>
> > ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
> > WARN_ON_ONCE(*ddsp_taskp);
> > *ddsp_taskp = p;
> > @@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> >
> > switch (opss & SCX_OPSS_STATE_MASK) {
> > case SCX_OPSS_NONE:
> > + /*
> > + * Task is not currently being enqueued or queued on the BPF
> > + * scheduler. Check if ops.enqueue() was called for this task.
> > + */
> > + if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
> > + SCX_HAS_OP(sch, dequeue)) {
> > + /*
> > + * ops.enqueue() was called and the task was dispatched.
> > + * Call ops.dequeue() to notify the BPF scheduler that
> > + * the task is leaving.
> > + */
> > + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> > + p, deq_flags);
> > + p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> > + }
> > break;
> > case SCX_OPSS_QUEUEING:
> > /*
> > @@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> > */
> > BUG();
> > case SCX_OPSS_QUEUED:
> > - if (SCX_HAS_OP(sch, dequeue))
> > + /*
> > + * Task is owned by the BPF scheduler. Call ops.dequeue()
> > + * to notify the BPF scheduler that the task is being
> > + * removed.
> > + */
> > + if (SCX_HAS_OP(sch, dequeue)) {
>
> Edge case, but if we have a .dequeue() method but not an .enqueue() we
> still make this call. Can we add flags & SCX_TASK_OPS_ENQUEUED as an
> extra condition to be consistent with the SCX_OPSS_NONE case above?
Also good catch. Will add that.
>
> > SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> > p, deq_flags);
> > + p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> > + }
> >
> > if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> > SCX_OPSS_NONE))
>
Thanks,
-Andrea
On Mon Dec 29, 2025 at 11:36 AM EST, Andrea Righi wrote:
> Hi Emil,
>
> On Sat, Dec 27, 2025 at 10:20:06PM -0500, Emil Tsalapatis wrote:
>> On Fri Dec 19, 2025 at 5:43 PM EST, Andrea Righi wrote:
>> > Properly implement ops.dequeue() to ensure every ops.enqueue() is
>> > balanced by a corresponding ops.dequeue() call, regardless of whether
>> > the task is on a BPF data structure or already dispatched to a DSQ.
>> >
>> > A task is considered enqueued when it is owned by the BPF scheduler.
>> > This ownership persists until the task is either dispatched (moved to a
>> > local DSQ for execution) or removed from the BPF scheduler, such as when
>> > it blocks waiting for an event or when its properties (for example, CPU
>> > affinity or priority) are updated.
>> >
>> > When the task enters the BPF scheduler, ops.enqueue() is invoked; when it
>> > leaves BPF scheduler ownership, ops.dequeue() is invoked.
>> >
>> > This allows BPF schedulers to reliably track task ownership and maintain
>> > accurate accounting.
>> >
>> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
>> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
>> > ---
>>
>>
>> Hi Andrea,
>>
>> This change looks reasonable to me. Some comments inline:
>>
>> > Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
>> > include/linux/sched/ext.h | 1 +
>> > kernel/sched/ext.c | 27 ++++++++++++++++++++++++++-
>> > 3 files changed, 49 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
>> > index 404fe6126a769..3ed4be53f97da 100644
>> > --- a/Documentation/scheduler/sched-ext.rst
>> > +++ b/Documentation/scheduler/sched-ext.rst
>> > @@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
>> >
>> > * Queue the task on the BPF side.
>> >
>> > + Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
>> > + is owned by the BPF scheduler. Ownership is retained until the task is
>> > + either dispatched (moved to a local DSQ for execution) or dequeued
>> > + (removed from the scheduler due to a blocking event, or to modify a
>> > + property, like CPU affinity, priority, etc.). When the task leaves the
>> > + BPF scheduler, ``ops.dequeue()`` is invoked.
>> > +
>>
>> Can we say "leaves the scx class" instead? On direct dispatch we
>> technically never insert the task into the BPF scheduler.
>
> Hm.. I agree that'd be more accurate, but it might also be slightly
> misleading, as it could be interpreted as the task being moved to a
> different scheduling class. How about saying "leaves the enqueued state"
> instead, where enqueued means ops.enqueue() being called... I can't find a
> better name for this state, like "ops_enqueued", but that'd be even more
> confusing. :)
>
I like "leaves the enqueued state", it implies that the task has no
state in the scx scheduler.
>>
>> > + **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
>> > + regardless of whether the task is still on a BPF data structure, or it
>> > + is already dispatched to a DSQ (global, local, or user DSQ).
>> > +
>> > + This guarantees that every ``ops.enqueue()`` will eventually be followed
>> > + by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
>> > + track task ownership and maintain accurate accounting, such as per-DSQ
>> > + queued runtime sums.
>> > +
>> > + BPF schedulers can choose not to implement ``ops.dequeue()`` if they
>> > + don't need to track these transitions. The sched_ext core will safely
>> > + handle all dequeue operations regardless.
>> > +
>> > 3. When a CPU is ready to schedule, it first looks at its local DSQ. If
>> > empty, it then looks at the global DSQ. If there still isn't a task to
>> > run, ``ops.dispatch()`` is invoked which can use the following two
>> > @@ -319,6 +339,8 @@ by a sched_ext scheduler:
>> > /* Any usable CPU becomes available */
>> >
>> > ops.dispatch(); /* Task is moved to a local DSQ */
>> > +
>> > + ops.dequeue(); /* Exiting BPF scheduler */
>> > }
>> > ops.running(); /* Task starts running on its assigned CPU */
>> > while (task->scx.slice > 0 && task is runnable)
>> > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
>> > index bcb962d5ee7d8..334c3692a9c62 100644
>> > --- a/include/linux/sched/ext.h
>> > +++ b/include/linux/sched/ext.h
>> > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
>> > /* scx_entity.flags */
>> > enum scx_ent_flags {
>> > SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
>> > + SCX_TASK_OPS_ENQUEUED = 1 << 1, /* ops.enqueue() was called */
>>
>> Can we rename this flag? For direct dispatch we never got enqueued.
>> Something like "DEQ_ON_DISPATCH" would show the purpose of the
>> flag more clearly.
>
> Good point. However, ops.dequeue() isn't only called on dispatch, it can
> also be triggered when a task property is changed.
>
> So the flag should represent the "enqueued state" in the sense that
> ops.enqueue() has been called and a corresponding ops.dequeue() is
> expected. This is a lifecycle state, not an indication that the task is in
> any queue.
>
> Would a more descriptive comment clarify this? Something like:
>
> SCX_TASK_OPS_ENQUEUED = 1 << 1, /* Task in enqueued state: ops.enqueue()
> called, ops.dequeue() will be called
> when task leaves this state. */
>
That makes sense. My reasoning was that what we actually use the flag for
is not whether the task is enqueued, but rather whether we need to call
the dequeue callback when dequeueing from the SCX_OPSS_NONE state. Can
the comment maybe explain this more concretely?
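Maybe something along these lines (wording is just a sketch):

	SCX_TASK_OPS_ENQUEUED = 1 << 1, /* ops.enqueue() was called; the task
					   may already be on a DSQ but still
					   owes an ops.dequeue() call */
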
As an aside, I think this change makes it so we can remove the _OPSS_ state
machine with some more refactoring.
>>
>> > SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
>> > SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
>> >
>> > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
>> > index 94164f2dec6dc..985d75d374385 100644
>> > --- a/kernel/sched/ext.c
>> > +++ b/kernel/sched/ext.c
>> > @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>> > WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
>> > atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
>> >
>> > + /* Mark that ops.enqueue() is being called for this task */
>> > + p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
>> > +
>>
>> Can we avoid setting this flag when we have no .dequeue() method?
>> Otherwise it stays set forever AFAICT, even after the task has been
>> sent to the runqueues.
>
> Good catch! Definitely we don't need to set this for schedulers that don't
> implement ops.dequeue().
>
>>
>> > ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
>> > WARN_ON_ONCE(*ddsp_taskp);
>> > *ddsp_taskp = p;
>> > @@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>> >
>> > switch (opss & SCX_OPSS_STATE_MASK) {
>> > case SCX_OPSS_NONE:
>> > + /*
>> > + * Task is not currently being enqueued or queued on the BPF
>> > + * scheduler. Check if ops.enqueue() was called for this task.
>> > + */
>> > + if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
>> > + SCX_HAS_OP(sch, dequeue)) {
>> > + /*
>> > + * ops.enqueue() was called and the task was dispatched.
>> > + * Call ops.dequeue() to notify the BPF scheduler that
>> > + * the task is leaving.
>> > + */
>> > + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
>> > + p, deq_flags);
>> > + p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
>> > + }
>> > break;
>> > case SCX_OPSS_QUEUEING:
>> > /*
>> > @@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>> > */
>> > BUG();
>> > case SCX_OPSS_QUEUED:
>> > - if (SCX_HAS_OP(sch, dequeue))
>> > + /*
>> > + * Task is owned by the BPF scheduler. Call ops.dequeue()
>> > + * to notify the BPF scheduler that the task is being
>> > + * removed.
>> > + */
>> > + if (SCX_HAS_OP(sch, dequeue)) {
>>
>> Edge case, but if we have a .dequeue() method but not an .enqueue() we
>> still make this call. Can we add flags & SCX_TASK_OPS_ENQUEUED as an
>> extra condition to be consistent with the SCX_OPSS_NONE case above?
>
> Also good catch. Will add that.
>
>>
>> > SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
>> > p, deq_flags);
>> > + p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
>> > + }
>> >
>> > if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
>> > SCX_OPSS_NONE))
>>
>
> Thanks,
> -Andrea