Add a core event, SCX_EV_ENQ_SLICE_DFL, which represents how many
tasks have been enqueued (or pick_task-ed) with a default time slice
(SCX_SLICE_DFL).

Unintentionally scheduling a task with SCX_SLICE_DFL can be a source
of latency spikes because SCX_SLICE_DFL is relatively long (20 msec).
Thus, a sharply rising SCX_EV_ENQ_SLICE_DFL count can be a sign of a
BPF scheduler bug that causes latency spikes.
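
For example, a scheduler's monitoring path can poll the counter with
scx_bpf_events() and flag a suspicious jump. A minimal sketch (the
callback name, prev_cnt, and the threshold are illustrative only, not
part of this patch; assumes the usual scx BPF boilerplate such as
vmlinux.h and scx/common.bpf.h):

	static u64 prev_cnt;

	/* called periodically, e.g. from a monitoring timer */
	static void check_slice_dfl(void)
	{
		struct scx_event_stats events;

		/* aggregate per-CPU event counts into @events */
		scx_bpf_events(&events, sizeof(events));
		if (events.SCX_EV_ENQ_SLICE_DFL - prev_cnt > 1000)
			bpf_printk("SCX_EV_ENQ_SLICE_DFL rose by %llu",
				   events.SCX_EV_ENQ_SLICE_DFL - prev_cnt);
		prev_cnt = events.SCX_EV_ENQ_SLICE_DFL;
	}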

__scx_add_event() is used since the caller holds an rq lock, so
preemption is already disabled.
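Roughly, and simplified from ext.c (the exact definition may differ),
the double-underscore variant uses the non-preemption-safe per-CPU
operation, which is why it requires preemption to be off already:

	/* caller must have preemption disabled, e.g. via an rq lock */
	#define __scx_add_event(name, cnt) do {				\
		__this_cpu_add(event_stats_cpu.name, (cnt));		\
	} while (0)
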
Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
kernel/sched/ext.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8a9a30895381..1077df9280bb 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1468,6 +1468,12 @@ struct scx_event_stats {
 	 */
 	u64 SCX_EV_ENQ_SKIP_EXITING;
 
+	/*
+	 * The total number of tasks enqueued (or pick_task-ed) with a
+	 * default time slice (SCX_SLICE_DFL).
+	 */
+	u64 SCX_EV_ENQ_SLICE_DFL;
+
 	/*
 	 * The total duration of bypass modes in nanoseconds.
 	 */
@@ -2134,6 +2140,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	 */
 	touch_core_sched(rq, p);
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 local_norefill:
 	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
 	return;
@@ -2141,6 +2148,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 global:
 	touch_core_sched(rq, p);	/* see the comment in local: */
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 	dispatch_enqueue(find_global_dsq(p), p, enq_flags);
 }
 
@@ -3202,8 +3210,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 	 */
 	if (keep_prev) {
 		p = prev;
-		if (!p->scx.slice)
+		if (!p->scx.slice) {
 			p->scx.slice = SCX_SLICE_DFL;
+			__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+		}
 	} else {
 		p = first_local_task(rq);
 		if (!p) {
@@ -3219,6 +3229,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 			scx_warned_zero_slice = true;
 		}
 		p->scx.slice = SCX_SLICE_DFL;
+		__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 	}
 }
 
@@ -5023,6 +5034,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
 	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+	scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -7163,6 +7175,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
 		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
--
2.48.1
Hi Changwoo,
On Fri, Feb 07, 2025 at 12:13:37PM +0900, Changwoo Min wrote:
> Add a core event, SCX_EV_ENQ_SLICE_DFL, which represents how many
> tasks have been enqueued (or pick_task-ed) with a default time slice
> (SCX_SLICE_DFL).
>
> Unintentionally scheduling a task with SCX_SLICE_DFL can be a source
> of latency spikes because SCX_SLICE_DFL is relatively long (20 msec).
> Thus, a sharply rising SCX_EV_ENQ_SLICE_DFL count can be a sign of a
> BPF scheduler bug that causes latency spikes.
>
> __scx_add_event() is used since the caller holds an rq lock, so
> preemption is already disabled.
We may want to consider select_task_rq_scx() as well, when ops.select_cpu()
is not implemented (or during rq_bypass).
In this case, if scx_select_cpu_dfl() finds an idle CPU, we implicitly
dispatch the task to the local DSQ with SCX_SLICE_DFL.
Thanks,
-Andrea
Hi Andrea,

On 25. 2. 7. 15:17, Andrea Righi wrote:
> We may want to consider select_task_rq_scx() as well, when ops.select_cpu()
> is not implemented (or during rq_bypass).
>
> In this case, if scx_select_cpu_dfl() finds an idle CPU, we implicitly
> dispatch the task to the local DSQ with SCX_SLICE_DFL.

You are right. I will add it too.

Thanks!

--
Changwoo