Add a core event, SCX_EV_ENQ_SLICE_DFL, which represents how many
tasks have been enqueued (or pick_task-ed) with a default time slice
(SCX_SLICE_DFL).

Unintentionally scheduling a task with SCX_SLICE_DFL can be a source
of latency spikes because SCX_SLICE_DFL is relatively long (20 msec).
Thus, a sharply rising SCX_EV_ENQ_SLICE_DFL count can be a sign of a
BPF scheduler bug that causes latency spikes.
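
For example, a scheduler's monitoring path can poll the counter with
scx_bpf_events() and flag a suspicious jump. A minimal sketch (the
callback name, prev_cnt, and the threshold are illustrative only, not
part of this patch; assumes the usual scx BPF boilerplate such as
vmlinux.h and scx/common.bpf.h):

	static u64 prev_cnt;

	/* called periodically, e.g. from a monitoring timer */
	static void check_slice_dfl(void)
	{
		struct scx_event_stats events;

		/* aggregate per-CPU event counts into @events */
		scx_bpf_events(&events, sizeof(events));
		if (events.SCX_EV_ENQ_SLICE_DFL - prev_cnt > 1000)
			bpf_printk("SCX_EV_ENQ_SLICE_DFL rose by %llu",
				   events.SCX_EV_ENQ_SLICE_DFL - prev_cnt);
		prev_cnt = events.SCX_EV_ENQ_SLICE_DFL;
	}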

__scx_add_event() is used since the caller holds an rq lock, so
preemption is already disabled.
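Roughly, and simplified from ext.c (the exact definition may differ),
the double-underscore variant uses the non-preemption-safe per-CPU
operation, which is why it requires preemption to be off already:

	/* caller must have preemption disabled, e.g. via an rq lock */
	#define __scx_add_event(name, cnt) do {				\
		__this_cpu_add(event_stats_cpu.name, (cnt));		\
	} while (0)
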
Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
kernel/sched/ext.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8a9a30895381..1077df9280bb 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1468,6 +1468,12 @@ struct scx_event_stats {
 	 */
 	u64 SCX_EV_ENQ_SKIP_EXITING;
 
+	/*
+	 * The total number of tasks enqueued (or pick_task-ed) with a
+	 * default time slice (SCX_SLICE_DFL).
+	 */
+	u64 SCX_EV_ENQ_SLICE_DFL;
+
 	/*
 	 * The total duration of bypass modes in nanoseconds.
 	 */
@@ -2134,6 +2140,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	 */
 	touch_core_sched(rq, p);
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 local_norefill:
 	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
 	return;
@@ -2141,6 +2148,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 global:
 	touch_core_sched(rq, p);	/* see the comment in local: */
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 	dispatch_enqueue(find_global_dsq(p), p, enq_flags);
 }
 
@@ -3202,8 +3210,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 	 */
 	if (keep_prev) {
 		p = prev;
-		if (!p->scx.slice)
+		if (!p->scx.slice) {
 			p->scx.slice = SCX_SLICE_DFL;
+			__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+		}
 	} else {
 		p = first_local_task(rq);
 		if (!p) {
@@ -3219,6 +3229,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 			scx_warned_zero_slice = true;
 		}
 		p->scx.slice = SCX_SLICE_DFL;
+		__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 	}
 }
 
@@ -5023,6 +5034,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
 	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+	scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -7163,6 +7175,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
--
2.48.1
Hi Changwoo,
On Fri, Feb 07, 2025 at 12:13:37PM +0900, Changwoo Min wrote:
> Add a core event, SCX_EV_ENQ_SLICE_DFL, which represents how many
> tasks have been enqueued (or pick_task-ed) with a default time slice
> (SCX_SLICE_DFL).
>
> Unintentionally scheduling a task with SCX_SLICE_DFL can be a source
> of latency spikes because SCX_SLICE_DFL is relatively long (20 msec).
> Thus, a sharply rising SCX_EV_ENQ_SLICE_DFL count can be a sign of a
> BPF scheduler bug that causes latency spikes.
>
> __scx_add_event() is used since the caller holds an rq lock, so
> preemption is already disabled.
We may want to consider select_task_rq_scx() as well, when ops.select_cpu()
is not implemented (or during rq_bypass).
In this case, if scx_select_cpu_dfl() finds an idle CPU, we implicitly
dispatch the task to the local DSQ with SCX_SLICE_DFL.
Thanks,
-Andrea
Hi Andrea,

On 25. 2. 7. 15:17, Andrea Righi wrote:
> We may want to consider select_task_rq_scx() as well, when ops.select_cpu()
> is not implemented (or during rq_bypass).
>
> In this case, if scx_select_cpu_dfl() finds an idle CPU, we implicitly
> dispatch the task to the local DSQ with SCX_SLICE_DFL.

You are right. I will add it too.

Thanks!

--
Changwoo