[PATCH 1/2] sched_ext: Fix ops.dequeue() semantics

Andrea Righi posted 2 patches 1 month, 3 weeks ago
Properly implement ops.dequeue() to ensure every ops.enqueue() is
balanced by a corresponding ops.dequeue() call, regardless of whether
the task is on a BPF data structure or already dispatched to a DSQ.

A task is considered enqueued when it is owned by the BPF scheduler.
This ownership persists until the task is either dispatched (moved to a
local DSQ for execution) or removed from the BPF scheduler, such as when
it blocks waiting for an event or when its properties (for example, CPU
affinity or priority) are updated.

When the task enters the BPF scheduler, ops.enqueue() is invoked; when it
leaves BPF scheduler ownership, ops.dequeue() is invoked.

This allows BPF schedulers to reliably track task ownership and maintain
accurate accounting.

Cc: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
 include/linux/sched/ext.h             |  1 +
 kernel/sched/ext.c                    | 27 ++++++++++++++++++++++++++-
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 404fe6126a769..3ed4be53f97da 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
 
    * Queue the task on the BPF side.
 
+   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
+   is owned by the BPF scheduler. Ownership is retained until the task is
+   either dispatched (moved to a local DSQ for execution) or dequeued
+   (removed from the scheduler due to a blocking event, or to modify a
+   property, like CPU affinity, priority, etc.). When the task leaves the
+   BPF scheduler, ``ops.dequeue()`` is invoked.
+
+   **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
+   regardless of whether the task is still on a BPF data structure or
+   already dispatched to a DSQ (global, local, or user DSQ).
+
+   This guarantees that every ``ops.enqueue()`` will eventually be followed
+   by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
+   track task ownership and maintain accurate accounting, such as per-DSQ
+   queued runtime sums.
+
+   BPF schedulers can choose not to implement ``ops.dequeue()`` if they
+   don't need to track these transitions. The sched_ext core will safely
+   handle all dequeue operations regardless.
+
 3. When a CPU is ready to schedule, it first looks at its local DSQ. If
    empty, it then looks at the global DSQ. If there still isn't a task to
    run, ``ops.dispatch()`` is invoked which can use the following two
@@ -319,6 +339,8 @@ by a sched_ext scheduler:
                 /* Any usable CPU becomes available */
 
                 ops.dispatch(); /* Task is moved to a local DSQ */
+
+                ops.dequeue(); /* Exiting BPF scheduler */
             }
             ops.running();      /* Task starts running on its assigned CPU */
             while (task->scx.slice > 0 && task is runnable)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index bcb962d5ee7d8..334c3692a9c62 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -84,6 +84,7 @@ struct scx_dispatch_q {
 /* scx_entity.flags */
 enum scx_ent_flags {
 	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
+	SCX_TASK_OPS_ENQUEUED	= 1 << 1, /* ops.enqueue() was called */
 	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94164f2dec6dc..985d75d374385 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
 	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
 
+	/* Mark that ops.enqueue() is being called for this task */
+	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
+
 	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
 	WARN_ON_ONCE(*ddsp_taskp);
 	*ddsp_taskp = p;
@@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 
 	switch (opss & SCX_OPSS_STATE_MASK) {
 	case SCX_OPSS_NONE:
+		/*
+		 * Task is not currently being enqueued or queued on the BPF
+		 * scheduler. Check if ops.enqueue() was called for this task.
+		 */
+		if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
+		    SCX_HAS_OP(sch, dequeue)) {
+			/*
+			 * ops.enqueue() was called and the task was dispatched.
+			 * Call ops.dequeue() to notify the BPF scheduler that
+			 * the task is leaving.
+			 */
+			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+					 p, deq_flags);
+			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
+		}
 		break;
 	case SCX_OPSS_QUEUEING:
 		/*
@@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 		 */
 		BUG();
 	case SCX_OPSS_QUEUED:
-		if (SCX_HAS_OP(sch, dequeue))
+		/*
+		 * Task is owned by the BPF scheduler. Call ops.dequeue()
+		 * to notify the BPF scheduler that the task is being
+		 * removed.
+		 */
+		if (SCX_HAS_OP(sch, dequeue)) {
 			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
 					 p, deq_flags);
+			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
+		}
 
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
 					    SCX_OPSS_NONE))
-- 
2.52.0
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Tejun Heo 1 month, 1 week ago
Sorry about the million replies. Pretty squirrel brained right now.

On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>  	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
>  	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
>  
> +	/* Mark that ops.enqueue() is being called for this task */
> +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;

Is this guaranteed to be cleared after dispatch? ops_dequeue() is called
from dequeue_task_scx() and set_next_task_scx(). It looks like the call from
set_next_task_scx() may end up calling ops.dequeue() when the task starts
running; this seems mostly accidental.

- The BPF sched probably expects ops.dequeue() call immediately after
  dispatch rather than on the running transition. e.g. imagine a scenario
  where a BPF sched dispatches multiple tasks to a local DSQ. Wouldn't the
  expectation be that ops.dequeue() is called as soon as a task is
  dispatched into a local DSQ?

- If this depends on the ops_dequeue() call from set_next_task_scx(), it'd
  also be using the wrong DEQ flag - SCX_DEQ_CORE_SCHED_EXEC - for regular
  ops.dequeue() following a dispatch. That call there is that way only
  because ops_dequeue() didn't do anything when OPSS_NONE.

Thanks.

-- 
tejun
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Andrea Righi 1 month, 1 week ago
On Sun, Dec 28, 2025 at 02:06:19PM -1000, Tejun Heo wrote:
> Sorry about the million replies. Pretty squirrel brained right now.
> 
> On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> > @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> >  	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> >  	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
> >  
> > +	/* Mark that ops.enqueue() is being called for this task */
> > +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
> 
> Is this guaranteed to be cleared after dispatch? ops_dequeue() is called
> from dequeue_task_scx() and set_next_task_scx(). It looks like the call from
> set_next_task_scx() may end up calling ops.dequeue() when the task starts
> running, this seems mostly accidental.
> 
> - The BPF sched probably expects ops.dequeue() call immediately after
>   dispatch rather than on the running transition. e.g. imagine a scenario
>   where a BPF sched dispatches multiple tasks to a local DSQ. Wouldn't the
>   expectation be that ops.dequeue() is called as soon as a task is
>   dispatched into a local DSQ?
> 
> - If this depends on the ops_dequeue() call from set_next_task_scx(), it'd
>   also be using the wrong DEQ flag - SCX_DEQ_CORE_SCHED_EXEC - for regular
>   ops.dequeue() following a dispatch. That call there is that way only
>   because ops_dequeue() didn't do anything when OPSS_NONE.

You're right, the flag should be cleared and ops.dequeue() should be called
immediately when the async dispatch completes and the task is inserted into
the DSQ. I'll add an explicit ops.dequeue() call in the dispatch completion
path.

Thanks,
-Andrea
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Tejun Heo 1 month, 1 week ago
Hello,

On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> +   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> +   is owned by the BPF scheduler. Ownership is retained until the task is

Can we avoid using "ownership" for this? From user's POV, this is fine but
kernel side internally uses the word for different purposes - e.g. we say
the BPF side owns the task if the task's SCX_OPSS_QUEUED is set (ie. it's on
BPF data structure, not on a DSQ). Here, the ownership encompasses both
kernel-side and BPF-side queueing, so the term becomes rather confusing.
Maybe we can stick with "queued" or "enqueued"?

Thanks.

-- 
tejun
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Andrea Righi 1 month, 1 week ago
Hi,

On Sun, Dec 28, 2025 at 01:42:28PM -1000, Tejun Heo wrote:
> Hello,
> 
> On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
> > +   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> > +   is owned by the BPF scheduler. Ownership is retained until the task is
> 
> Can we avoid using "ownership" for this? From user's POV, this is fine but
> kernel side internally uses the word for different purposes - e.g. we say
> the BPF side owns the task if the task's SCX_OPSS_QUEUED is set (ie. it's on
> BPF data structure, not on a DSQ). Here, the ownership encompasses both
> kernel-side and BPF-side queueing, so the term becomes rather confusing.
> Maybe we can stick with "queued" or "enqueued"?

Agreed. I can't find a better term to describe this phase of the lifecycle,
where ops.enqueue() has been called and the task remains in that state
until the corresponding ops.dequeue() occurs (either due to a "dispatch"
dequeue or "real" dequeue).

So maybe we should stick with "enqueued" and clarify exactly what this
state means.

Thanks,
-Andrea
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Tejun Heo 1 month, 1 week ago
Hello, Andrea.

On Fri, Dec 19, 2025 at 11:43:14PM +0100, Andrea Righi wrote:
...
> +   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> +   is owned by the BPF scheduler. Ownership is retained until the task is
> +   either dispatched (moved to a local DSQ for execution) or dequeued
> +   (removed from the scheduler due to a blocking event, or to modify a
> +   property, like CPU affinity, priority, etc.). When the task leaves the
> +   BPF scheduler ``ops.dequeue()`` is invoked.
> +
> +   **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
> +   regardless of whether the task is still on a BPF data structure, or it
> +   is already dispatched to a DSQ (global, local, or user DSQ)
> +
> +   This guarantees that every ``ops.enqueue()`` will eventually be followed
> +   by a ``ops.dequeue()``. This makes it reliable for BPF schedulers to
> +   track task ownership and maintain accurate accounting, such as per-DSQ
> +   queued runtime sums.

While this works, from the BPF sched's POV, there's no way to tell whether
an ops.dequeue() call is from the task being actually dequeued or the
follow-up to the dispatch operation it just did. This won't make much
difference if ops.dequeue() is just used for accounting purposes, but, a
scheduler which uses an arena data structure for queueing would likely need
to perform extra tests to tell whether the task needs to be dequeued from
the arena side. I *think* hot path (ops.dequeue() following the task's
dispatch) can be a simple lockless test, so this may be okay, but from API
POV, it can probably be better.

The counter interlocking point is scx_bpf_dsq_insert(). If we can
synchronize scx_bpf_dsq_insert() and dequeue so that ops.dequeue() is not
called for a successfully inserted task, I think the semantics would be
neater - an enqueued task is either dispatched or dequeued. Due to the async
dispatch operation, this likely is difficult to do without adding extra sync
operations in scx_bpf_dsq_insert(). However, I *think* we may be able to get
rid of dspc and async inserting if we call ops.dispatch() w/ rq lock
dropped. That may make the whole dispatch path simpler and the behavior
neater too. What do you think?

Thanks.

-- 
tejun
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Tejun Heo 1 month, 1 week ago
Hello,

On Sun, Dec 28, 2025 at 07:19:46AM -1000, Tejun Heo wrote:
> While this works, from the BPF sched's POV, there's no way to tell whether
> an ops.dequeue() call is from the task being actually dequeued or the
> follow-up to the dispatch operation it just did. This won't make much
> difference if ops.dequeue() is just used for accounting purposes, but, a
> scheduler which uses an arena data structure for queueing would likely need
> to perform extra tests to tell whether the task needs to be dequeued from
> the arena side. I *think* hot path (ops.dequeue() following the task's
> dispatch) can be a simple lockless test, so this may be okay, but from API
> POV, it can probably be better.
> 
> The counter interlocking point is scx_bpf_dsq_insert(). If we can
> synchronize scx_bpf_dsq_insert() and dequeue so that ops.dequeue() is not
> called for a successfully inserted task, I think the semantics would be
> neater - an enqueued task is either dispatched or dequeued. Due to the async
> dispatch operation, this likely is difficult to do without adding extra sync
> operations in scx_bpf_dsq_insert(). However, I *think* we may be able to get
> rid of dspc and async inserting if we call ops.dispatch() w/ rq lock
> dropped. That may make the whole dispatch path simpler and the behavior
> neater too. What do you think?

I sat down and went through the code to see whether I was actually making
sense, and I wasn't:

The async dispatch buffering is necessary to avoid lock inversion between rq
lock and whatever locks the BPF scheduler might be using internally. This is
necessary because the enqueue path runs with rq lock held. Thus, any lock that
the BPF sched uses in the enqueue path has to nest inside rq lock.

In dispatch, scx_bpf_dsq_insert() is likely to be called with the same BPF
sched side lock held. If we try to do rq lock dancing synchronously, we can
end up trying to grab rq lock while holding BPF side lock leading to
deadlock.

Kernel side has no control over BPF side locking, so the asynchronous
operation is there to side-step the issue. I don't see a good way to make
this synchronous.

So, please ignore that part. That's non-sense. I still wonder whether we can
create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
without making hot path slower. I'll think more about it.

Thanks.

-- 
tejun
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Tejun Heo 1 month, 1 week ago
Hello again, again.

On Sun, Dec 28, 2025 at 01:28:04PM -1000, Tejun Heo wrote:
...
> So, please ignore that part. That's non-sense. I still wonder whether we can
> create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
> without making hot path slower. I'll think more about it.

And we can't create an interlocking between scx_bpf_dsq_insert() and
ops.dequeue() without adding extra atomic operations in hot paths. The only
thing shared is task rq lock and dispatch path can't do that synchronously.
So, yeah, it looks like the best we can do is always letting the BPF sched
know and let it figure out locking and whether the task needs to be
dequeued from BPF side.

Thanks.

-- 
tejun
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Andrea Righi 1 month, 1 week ago
Hi Tejun,

On Sun, Dec 28, 2025 at 01:38:01PM -1000, Tejun Heo wrote:
> Hello again, again.
> 
> On Sun, Dec 28, 2025 at 01:28:04PM -1000, Tejun Heo wrote:
> ...
> > So, please ignore that part. That's non-sense. I still wonder whether we can
> > create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
> > without making hot path slower. I'll think more about it.
> 
> And we can't create an interlocking between scx_bpf_dsq_insert() and
> ops.dequeue() without adding extra atomic operations in hot paths. The only
> thing shared is task rq lock and dispatch path can't do that synchronously.
> So, yeah, it looks like the best we can do is always letting the BPF sched
> know and let it figure out locking and whether the task needs to be
> dequeued from BPF side.

How about setting a flag in deq_flags to distinguish between a "dispatch"
dequeue vs a real dequeue (due to property changes or other reasons)?

We should be able to pass this information in a reliable way without any
additional synchronization in the hot paths. This would let schedulers that
use arena data structures check the flag instead of doing their own
internal lookups.

And it would also allow us to provide both semantics:
1) Catch real dequeues that need special BPF-side actions (check the flag)
2) Track all ops.enqueue()/ops.dequeue() pairs for accounting purposes
   (ignore the flag)

Thanks,
-Andrea
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Emil Tsalapatis 1 month, 1 week ago
On Mon Dec 29, 2025 at 12:07 PM EST, Andrea Righi wrote:
> Hi Tejun,
>
> On Sun, Dec 28, 2025 at 01:38:01PM -1000, Tejun Heo wrote:
>> Hello again, again.
>> 
>> On Sun, Dec 28, 2025 at 01:28:04PM -1000, Tejun Heo wrote:
>> ...
>> > So, please ignore that part. That's non-sense. I still wonder whether we can
>> > create some interlocking between scx_bpf_dsq_insert() and ops.dequeue()
>> > without making hot path slower. I'll think more about it.
>> 
>> And we can't create an interlocking between scx_bpf_dsq_insert() and
>> ops.dequeue() without adding extra atomic operations in hot paths. The only
>> thing shared is task rq lock and dispatch path can't do that synchronously.
>> So, yeah, it looks like the best we can do is always letting the BPF sched
>> know and let it figure out locking and whether the task needs to be
>> dequeued from BPF side.
>
> How about setting a flag in deq_flags to distinguish between a "dispatch"
> dequeue vs a real dequeue (due to property changes or other reasons)?
>
> We should be able to pass this information in a reliable way without any
> additional synchronization in the hot paths. This would let schedulers that
> use arena data structures check the flag instead of doing their own
> internal lookups.
>
> And it would also allow us to provide both semantics:
> 1) Catch real dequeues that need special BPF-side actions (check the flag)
> 2) Track all ops.enqueue()/ops.dequeue() pairs for accounting purposes
>    (ignore the flag)
>

IMO the extra flag suffices for arena-based queueing; the arena data
structures already have to track the state of the task:

Even without the flag it should be possible to infer this from inside
the BPF code. For example, calling .dequeue() while the task is no
longer in an arena queue means the task got dequeued _after_ being
dispatched, while calling .dequeue() on a task still queued in the
arena means we are removing it because of a true dequeue event (e.g.
sched_setaffinity() was called). The only edge case in the logic is if
a true dequeue event happens between .dispatch() and .dequeue(), but a
new flag would take care of that.


> Thanks,
> -Andrea
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Emil Tsalapatis 1 month, 1 week ago
On Fri Dec 19, 2025 at 5:43 PM EST, Andrea Righi wrote:
> Properly implement ops.dequeue() to ensure every ops.enqueue() is
> balanced by a corresponding ops.dequeue() call, regardless of whether
> the task is on a BPF data structure or already dispatched to a DSQ.
>
> A task is considered enqueued when it is owned by the BPF scheduler.
> This ownership persists until the task is either dispatched (moved to a
> local DSQ for execution) or removed from the BPF scheduler, such as when
> it blocks waiting for an event or when its properties (for example, CPU
> affinity or priority) are updated.
>
> When the task enters the BPF scheduler ops.enqueue() is invoked, when it
> leaves BPF scheduler ownership, ops.dequeue() is invoked.
>
> This allows BPF schedulers to reliably track task ownership and maintain
> accurate accounting.
>
> Cc: Emil Tsalapatis <emil@etsalapatis.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---


Hi Andrea,

	This change looks reasonable to me. Some comments inline:

>  Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
>  include/linux/sched/ext.h             |  1 +
>  kernel/sched/ext.c                    | 27 ++++++++++++++++++++++++++-
>  3 files changed, 49 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
> index 404fe6126a769..3ed4be53f97da 100644
> --- a/Documentation/scheduler/sched-ext.rst
> +++ b/Documentation/scheduler/sched-ext.rst
> @@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
>  
>     * Queue the task on the BPF side.
>  
> +   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> +   is owned by the BPF scheduler. Ownership is retained until the task is
> +   either dispatched (moved to a local DSQ for execution) or dequeued
> +   (removed from the scheduler due to a blocking event, or to modify a
> +   property, like CPU affinity, priority, etc.). When the task leaves the
> +   BPF scheduler ``ops.dequeue()`` is invoked.
> +

Can we say "leaves the scx class" instead? On direct dispatch we
technically never insert the task into the BPF scheduler.

> +   **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
> +   regardless of whether the task is still on a BPF data structure, or it
> +   is already dispatched to a DSQ (global, local, or user DSQ)
> +
> +   This guarantees that every ``ops.enqueue()`` will eventually be followed
> +   by a ``ops.dequeue()``. This makes it reliable for BPF schedulers to
> +   track task ownership and maintain accurate accounting, such as per-DSQ
> +   queued runtime sums.
> +
> +   BPF schedulers can choose not to implement ``ops.dequeue()`` if they
> +   don't need to track these transitions. The sched_ext core will safely
> +   handle all dequeue operations regardless.
> +
>  3. When a CPU is ready to schedule, it first looks at its local DSQ. If
>     empty, it then looks at the global DSQ. If there still isn't a task to
>     run, ``ops.dispatch()`` is invoked which can use the following two
> @@ -319,6 +339,8 @@ by a sched_ext scheduler:
>                  /* Any usable CPU becomes available */
>  
>                  ops.dispatch(); /* Task is moved to a local DSQ */
> +
> +                ops.dequeue(); /* Exiting BPF scheduler */
>              }
>              ops.running();      /* Task starts running on its assigned CPU */
>              while (task->scx.slice > 0 && task is runnable)
> diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> index bcb962d5ee7d8..334c3692a9c62 100644
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -84,6 +84,7 @@ struct scx_dispatch_q {
>  /* scx_entity.flags */
>  enum scx_ent_flags {
>  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
> +	SCX_TASK_OPS_ENQUEUED	= 1 << 1, /* ops.enqueue() was called */

Can we rename this flag? For direct dispatch we never got enqueued.
Something like "DEQ_ON_DISPATCH" would show the purpose of the
flag more clearly.

>  	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
>  	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
>  
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 94164f2dec6dc..985d75d374385 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>  	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
>  	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
>  
> +	/* Mark that ops.enqueue() is being called for this task */
> +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
> +

Can we avoid setting this flag when we have no .dequeue() method?
Otherwise it stays set forever AFAICT, even after the task has been
sent to the runqueues.

>  	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
>  	WARN_ON_ONCE(*ddsp_taskp);
>  	*ddsp_taskp = p;
> @@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>  
>  	switch (opss & SCX_OPSS_STATE_MASK) {
>  	case SCX_OPSS_NONE:
> +		/*
> +		 * Task is not currently being enqueued or queued on the BPF
> +		 * scheduler. Check if ops.enqueue() was called for this task.
> +		 */
> +		if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
> +		    SCX_HAS_OP(sch, dequeue)) {
> +			/*
> +			 * ops.enqueue() was called and the task was dispatched.
> +			 * Call ops.dequeue() to notify the BPF scheduler that
> +			 * the task is leaving.
> +			 */
> +			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> +					 p, deq_flags);
> +			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> +		}
>  		break;
>  	case SCX_OPSS_QUEUEING:
>  		/*
> @@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>  		 */
>  		BUG();
>  	case SCX_OPSS_QUEUED:
> -		if (SCX_HAS_OP(sch, dequeue))
> +		/*
> +		 * Task is owned by the BPF scheduler. Call ops.dequeue()
> +		 * to notify the BPF scheduler that the task is being
> +		 * removed.
> +		 */
> +		if (SCX_HAS_OP(sch, dequeue)) {

Edge case, but if we have a .dequeue() method but not an .enqueue() we
still make this call. Can we add flags & SCX_TASK_OPS_ENQUEUED as an 
extra condition to be consistent with the SCX_OPSS_NONE case above?

>  			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
>  					 p, deq_flags);
> +			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> +		}
>  
>  		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
>  					    SCX_OPSS_NONE))
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Andrea Righi 1 month, 1 week ago
Hi Emil,

On Sat, Dec 27, 2025 at 10:20:06PM -0500, Emil Tsalapatis wrote:
> On Fri Dec 19, 2025 at 5:43 PM EST, Andrea Righi wrote:
> > Properly implement ops.dequeue() to ensure every ops.enqueue() is
> > balanced by a corresponding ops.dequeue() call, regardless of whether
> > the task is on a BPF data structure or already dispatched to a DSQ.
> >
> > A task is considered enqueued when it is owned by the BPF scheduler.
> > This ownership persists until the task is either dispatched (moved to a
> > local DSQ for execution) or removed from the BPF scheduler, such as when
> > it blocks waiting for an event or when its properties (for example, CPU
> > affinity or priority) are updated.
> >
> > When the task enters the BPF scheduler ops.enqueue() is invoked, when it
> > leaves BPF scheduler ownership, ops.dequeue() is invoked.
> >
> > This allows BPF schedulers to reliably track task ownership and maintain
> > accurate accounting.
> >
> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> > ---
> 
> 
> Hi Andrea,
> 
> 	This change looks reasonable to me. Some comments inline:
> 
> >  Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
> >  include/linux/sched/ext.h             |  1 +
> >  kernel/sched/ext.c                    | 27 ++++++++++++++++++++++++++-
> >  3 files changed, 49 insertions(+), 1 deletion(-)
> >
> > diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
> > index 404fe6126a769..3ed4be53f97da 100644
> > --- a/Documentation/scheduler/sched-ext.rst
> > +++ b/Documentation/scheduler/sched-ext.rst
> > @@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
> >  
> >     * Queue the task on the BPF side.
> >  
> > +   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
> > +   is owned by the BPF scheduler. Ownership is retained until the task is
> > +   either dispatched (moved to a local DSQ for execution) or dequeued
> > +   (removed from the scheduler due to a blocking event, or to modify a
> > +   property, like CPU affinity, priority, etc.). When the task leaves the
> > +   BPF scheduler ``ops.dequeue()`` is invoked.
> > +
> 
> Can we say "leaves the scx class" instead? On direct dispatch we
> technically never insert the task into the BPF scheduler.

Hm.. I agree that'd be more accurate, but it might also be slightly
misleading, as it could be interpreted as the task being moved to a
different scheduling class. How about saying "leaves the enqueued state"
instead, where enqueued means ops.enqueue() being called... I can't find a
better name for this state, like "ops_enqueued", but that'd be even more
confusing. :)

> 
> > +   **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
> > +   regardless of whether the task is still on a BPF data structure, or it
> > +   is already dispatched to a DSQ (global, local, or user DSQ)
> > +
> > +   This guarantees that every ``ops.enqueue()`` will eventually be followed
> > +   by a ``ops.dequeue()``. This makes it reliable for BPF schedulers to
> > +   track task ownership and maintain accurate accounting, such as per-DSQ
> > +   queued runtime sums.
> > +
> > +   BPF schedulers can choose not to implement ``ops.dequeue()`` if they
> > +   don't need to track these transitions. The sched_ext core will safely
> > +   handle all dequeue operations regardless.
> > +
> >  3. When a CPU is ready to schedule, it first looks at its local DSQ. If
> >     empty, it then looks at the global DSQ. If there still isn't a task to
> >     run, ``ops.dispatch()`` is invoked which can use the following two
> > @@ -319,6 +339,8 @@ by a sched_ext scheduler:
> >                  /* Any usable CPU becomes available */
> >  
> >                  ops.dispatch(); /* Task is moved to a local DSQ */
> > +
> > +                ops.dequeue(); /* Exiting BPF scheduler */
> >              }
> >              ops.running();      /* Task starts running on its assigned CPU */
> >              while (task->scx.slice > 0 && task is runnable)
> > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> > index bcb962d5ee7d8..334c3692a9c62 100644
> > --- a/include/linux/sched/ext.h
> > +++ b/include/linux/sched/ext.h
> > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
> >  /* scx_entity.flags */
> >  enum scx_ent_flags {
> >  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
> > +	SCX_TASK_OPS_ENQUEUED	= 1 << 1, /* ops.enqueue() was called */
> 
> Can we rename this flag? For direct dispatch we never got enqueued.
> Something like "DEQ_ON_DISPATCH" would show the purpose of the
> flag more clearly.

Good point. However, ops.dequeue() isn't only called on dispatch; it can
also be triggered when a task property is changed.

So the flag should represent the "enqueued state" in the sense that
ops.enqueue() has been called and a corresponding ops.dequeue() is
expected. This is a lifecycle state, not an indication that the task is in
any queue.

Would a more descriptive comment clarify this? Something like:

  SCX_TASK_OPS_ENQUEUED = 1 << 1, /* Task in enqueued state: ops.enqueue()
                                     called, ops.dequeue() will be called
                                     when task leaves this state. */

> 
> >  	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
> >  	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
> >  
> > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> > index 94164f2dec6dc..985d75d374385 100644
> > --- a/kernel/sched/ext.c
> > +++ b/kernel/sched/ext.c
> > @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> >  	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> >  	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
> >  
> > +	/* Mark that ops.enqueue() is being called for this task */
> > +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
> > +
> 
> Can we avoid setting this flag when we have no .dequeue() method?
> Otherwise it stays set forever AFAICT, even after the task has been
> sent to the runqueues.

Good catch! We definitely don't need to set this for schedulers that don't
implement ops.dequeue().
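Something like this in do_enqueue_task(), I suppose (untested sketch):

	/* Mark the enqueued state only if ops.dequeue() is implemented */
	if (SCX_HAS_OP(sch, dequeue))
		p->scx.flags |= SCX_TASK_OPS_ENQUEUED;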

> 
> >  	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
> >  	WARN_ON_ONCE(*ddsp_taskp);
> >  	*ddsp_taskp = p;
> > @@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> >  
> >  	switch (opss & SCX_OPSS_STATE_MASK) {
> >  	case SCX_OPSS_NONE:
> > +		/*
> > +		 * Task is not currently being enqueued or queued on the BPF
> > +		 * scheduler. Check if ops.enqueue() was called for this task.
> > +		 */
> > +		if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
> > +		    SCX_HAS_OP(sch, dequeue)) {
> > +			/*
> > +			 * ops.enqueue() was called and the task was dispatched.
> > +			 * Call ops.dequeue() to notify the BPF scheduler that
> > +			 * the task is leaving.
> > +			 */
> > +			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> > +					 p, deq_flags);
> > +			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> > +		}
> >  		break;
> >  	case SCX_OPSS_QUEUEING:
> >  		/*
> > @@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> >  		 */
> >  		BUG();
> >  	case SCX_OPSS_QUEUED:
> > -		if (SCX_HAS_OP(sch, dequeue))
> > +		/*
> > +		 * Task is owned by the BPF scheduler. Call ops.dequeue()
> > +		 * to notify the BPF scheduler that the task is being
> > +		 * removed.
> > +		 */
> > +		if (SCX_HAS_OP(sch, dequeue)) {
> 
> Edge case, but if we have a .dequeue() method but not an .enqueue() we
> still make this call. Can we add flags & SCX_TASK_OPS_ENQUEUED as an 
> extra condition to be consistent with the SCX_OPSS_NONE case above?

Also good catch. Will add that.
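So the combined condition would mirror the SCX_OPSS_NONE case, along
these lines (untested sketch, not the final patch):

	case SCX_OPSS_QUEUED:
		if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
		    SCX_HAS_OP(sch, dequeue)) {
			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
					 p, deq_flags);
			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
		}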

> 
> >  			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
> >  					 p, deq_flags);
> > +			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
> > +		}
> >  
> >  		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> >  					    SCX_OPSS_NONE))
> 

Thanks,
-Andrea
Re: [PATCH 1/2] sched_ext: Fix ops.dequeue() semantics
Posted by Emil Tsalapatis 1 month, 1 week ago
On Mon Dec 29, 2025 at 11:36 AM EST, Andrea Righi wrote:
> Hi Emil,
>
> On Sat, Dec 27, 2025 at 10:20:06PM -0500, Emil Tsalapatis wrote:
>> On Fri Dec 19, 2025 at 5:43 PM EST, Andrea Righi wrote:
>> > Properly implement ops.dequeue() to ensure every ops.enqueue() is
>> > balanced by a corresponding ops.dequeue() call, regardless of whether
>> > the task is on a BPF data structure or already dispatched to a DSQ.
>> >
>> > A task is considered enqueued when it is owned by the BPF scheduler.
>> > This ownership persists until the task is either dispatched (moved to a
>> > local DSQ for execution) or removed from the BPF scheduler, such as when
>> > it blocks waiting for an event or when its properties (for example, CPU
>> > affinity or priority) are updated.
>> >
>> > When the task enters the BPF scheduler, ops.enqueue() is invoked; when it
>> > leaves BPF scheduler ownership, ops.dequeue() is invoked.
>> >
>> > This allows BPF schedulers to reliably track task ownership and maintain
>> > accurate accounting.
>> >
>> > Cc: Emil Tsalapatis <emil@etsalapatis.com>
>> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
>> > ---
>> 
>> 
>> Hi Andrea,
>> 
>> 	This change looks reasonable to me. Some comments inline:
>> 
>> >  Documentation/scheduler/sched-ext.rst | 22 ++++++++++++++++++++++
>> >  include/linux/sched/ext.h             |  1 +
>> >  kernel/sched/ext.c                    | 27 ++++++++++++++++++++++++++-
>> >  3 files changed, 49 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
>> > index 404fe6126a769..3ed4be53f97da 100644
>> > --- a/Documentation/scheduler/sched-ext.rst
>> > +++ b/Documentation/scheduler/sched-ext.rst
>> > @@ -252,6 +252,26 @@ The following briefly shows how a waking task is scheduled and executed.
>> >  
>> >     * Queue the task on the BPF side.
>> >  
>> > +   Once ``ops.enqueue()`` is called, the task is considered "enqueued" and
>> > +   is owned by the BPF scheduler. Ownership is retained until the task is
>> > +   either dispatched (moved to a local DSQ for execution) or dequeued
>> > +   (removed from the scheduler due to a blocking event, or to modify a
>> > +   property, like CPU affinity, priority, etc.). When the task leaves the
>> > +   BPF scheduler, ``ops.dequeue()`` is invoked.
>> > +
>> 
>> Can we say "leaves the scx class" instead? On direct dispatch we
>> technically never insert the task into the BPF scheduler.
>
> Hm.. I agree that'd be more accurate, but it might also be slightly
> misleading, as it could be interpreted as the task being moved to a
> different scheduling class. How about saying "leaves the enqueued state"
> instead, where enqueued means ops.enqueue() being called... I can't find a
> better name for this state, like "ops_enqueued", but that'd be even more
> confusing. :)
>

I like "leaves the enqueued state", it implies that the task has no
state in the scx scheduler.

>> 
>> > +   **Important**: ``ops.dequeue()`` is called for *any* enqueued task,
>> > +   regardless of whether the task is still on a BPF data structure, or it
>> > +   is already dispatched to a DSQ (global, local, or user DSQ).
>> > +
>> > +   This guarantees that every ``ops.enqueue()`` will eventually be followed
>> > +   by an ``ops.dequeue()``. This makes it reliable for BPF schedulers to
>> > +   track task ownership and maintain accurate accounting, such as per-DSQ
>> > +   queued runtime sums.
>> > +
>> > +   BPF schedulers can choose not to implement ``ops.dequeue()`` if they
>> > +   don't need to track these transitions. The sched_ext core will safely
>> > +   handle all dequeue operations regardless.
>> > +
>> >  3. When a CPU is ready to schedule, it first looks at its local DSQ. If
>> >     empty, it then looks at the global DSQ. If there still isn't a task to
>> >     run, ``ops.dispatch()`` is invoked which can use the following two
>> > @@ -319,6 +339,8 @@ by a sched_ext scheduler:
>> >                  /* Any usable CPU becomes available */
>> >  
>> >                  ops.dispatch(); /* Task is moved to a local DSQ */
>> > +
>> > +                ops.dequeue(); /* Exiting BPF scheduler */
>> >              }
>> >              ops.running();      /* Task starts running on its assigned CPU */
>> >              while (task->scx.slice > 0 && task is runnable)
>> > diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
>> > index bcb962d5ee7d8..334c3692a9c62 100644
>> > --- a/include/linux/sched/ext.h
>> > +++ b/include/linux/sched/ext.h
>> > @@ -84,6 +84,7 @@ struct scx_dispatch_q {
>> >  /* scx_entity.flags */
>> >  enum scx_ent_flags {
>> >  	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
>> > +	SCX_TASK_OPS_ENQUEUED	= 1 << 1, /* ops.enqueue() was called */
>> 
>> Can we rename this flag? For direct dispatch we never got enqueued.
>> Something like "DEQ_ON_DISPATCH" would show the purpose of the
>> flag more clearly.
>
> Good point. However, ops.dequeue() isn't only called on dispatch; it can
> also be triggered when a task property is changed.
>
> So the flag should represent the "enqueued state" in the sense that
> ops.enqueue() has been called and a corresponding ops.dequeue() is
> expected. This is a lifecycle state, not an indication that the task is in
> any queue.
>
> Would a more descriptive comment clarify this? Something like:
>
>   SCX_TASK_OPS_ENQUEUED = 1 << 1, /* Task in enqueued state: ops.enqueue()
>                                      called, ops.dequeue() will be called
>                                      when task leaves this state. */
>

That makes sense. My reasoning was that what we actually use the flag
for is not whether the task is enqueued, but rather whether we need to
call the dequeue callback when dequeueing from the SCX_OPSS_NONE state.
Can the comment maybe explain this more concretely?

As an aside, I think this change makes it so we can remove the _OPSS_ state 
machine with some more refactoring. 

>> 
>> >  	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
>> >  	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
>> >  
>> > diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
>> > index 94164f2dec6dc..985d75d374385 100644
>> > --- a/kernel/sched/ext.c
>> > +++ b/kernel/sched/ext.c
>> > @@ -1390,6 +1390,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>> >  	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
>> >  	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
>> >  
>> > +	/* Mark that ops.enqueue() is being called for this task */
>> > +	p->scx.flags |= SCX_TASK_OPS_ENQUEUED;
>> > +
>> 
>> Can we avoid setting this flag when we have no .dequeue() method?
>> Otherwise it stays set forever AFAICT, even after the task has been
>> sent to the runqueues.
>
> Good catch! We definitely don't need to set this for schedulers that don't
> implement ops.dequeue().
>
>> 
>> >  	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
>> >  	WARN_ON_ONCE(*ddsp_taskp);
>> >  	*ddsp_taskp = p;
>> > @@ -1522,6 +1525,21 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>> >  
>> >  	switch (opss & SCX_OPSS_STATE_MASK) {
>> >  	case SCX_OPSS_NONE:
>> > +		/*
>> > +		 * Task is not currently being enqueued or queued on the BPF
>> > +		 * scheduler. Check if ops.enqueue() was called for this task.
>> > +		 */
>> > +		if ((p->scx.flags & SCX_TASK_OPS_ENQUEUED) &&
>> > +		    SCX_HAS_OP(sch, dequeue)) {
>> > +			/*
>> > +			 * ops.enqueue() was called and the task was dispatched.
>> > +			 * Call ops.dequeue() to notify the BPF scheduler that
>> > +			 * the task is leaving.
>> > +			 */
>> > +			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
>> > +					 p, deq_flags);
>> > +			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
>> > +		}
>> >  		break;
>> >  	case SCX_OPSS_QUEUEING:
>> >  		/*
>> > @@ -1530,9 +1548,16 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
>> >  		 */
>> >  		BUG();
>> >  	case SCX_OPSS_QUEUED:
>> > -		if (SCX_HAS_OP(sch, dequeue))
>> > +		/*
>> > +		 * Task is owned by the BPF scheduler. Call ops.dequeue()
>> > +		 * to notify the BPF scheduler that the task is being
>> > +		 * removed.
>> > +		 */
>> > +		if (SCX_HAS_OP(sch, dequeue)) {
>> 
>> Edge case, but if we have a .dequeue() method but not an .enqueue() we
>> still make this call. Can we add flags & SCX_TASK_OPS_ENQUEUED as an 
>> extra condition to be consistent with the SCX_OPSS_NONE case above?
>
> Also good catch. Will add that.
>
>> 
>> >  			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
>> >  					 p, deq_flags);
>> > +			p->scx.flags &= ~SCX_TASK_OPS_ENQUEUED;
>> > +		}
>> >  
>> >  		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
>> >  					    SCX_OPSS_NONE))
>> 
>
> Thanks,
> -Andrea